//===-- llvm/CodeGen/GlobalISel/LegalizerHelper.cpp -----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This file implements the LegalizerHelper class to legalize
/// individual instructions and the LegalizeMachineIR wrapper pass for the
/// primary legalization.
//
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/LostDebugLocObserver.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "legalizer"

using namespace llvm;
using namespace LegalizeActions;
using namespace MIPatternMatch;

/// Try to break down \p OrigTy into \p NarrowTy sized pieces.
///
/// Returns the number of \p NarrowTy elements needed to reconstruct \p OrigTy,
/// with any leftover piece as type \p LeftoverTy.
///
/// Returns -1 in the first element of the pair if the breakdown is not
/// satisfiable.
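///
/// For example (an illustrative sketch): OrigTy = s64 with NarrowTy = s24
/// breaks down as two s24 pieces plus an s16 leftover, so this returns {2, 1}
/// with \p LeftoverTy set to s16.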
static std::pair<int, int>
getNarrowTypeBreakDown(LLT OrigTy, LLT NarrowTy, LLT &LeftoverTy) {
  assert(!LeftoverTy.isValid() && "this is an out argument");

  unsigned Size = OrigTy.getSizeInBits();
  unsigned NarrowSize = NarrowTy.getSizeInBits();
  unsigned NumParts = Size / NarrowSize;
  unsigned LeftoverSize = Size - NumParts * NarrowSize;
  assert(Size > NarrowSize);

  if (LeftoverSize == 0)
    return {NumParts, 0};

  if (NarrowTy.isVector()) {
    unsigned EltSize = OrigTy.getScalarSizeInBits();
    if (LeftoverSize % EltSize != 0)
      return {-1, -1};
    LeftoverTy = LLT::scalarOrVector(
        ElementCount::getFixed(LeftoverSize / EltSize), EltSize);
  } else {
    LeftoverTy = LLT::scalar(LeftoverSize);
  }

  int NumLeftover = LeftoverSize / LeftoverTy.getSizeInBits();
  return std::make_pair(NumParts, NumLeftover);
}

static Type *getFloatTypeForLLT(LLVMContext &Ctx, LLT Ty) {

  if (!Ty.isScalar())
    return nullptr;

  switch (Ty.getSizeInBits()) {
  case 16:
    return Type::getHalfTy(Ctx);
  case 32:
    return Type::getFloatTy(Ctx);
  case 64:
    return Type::getDoubleTy(Ctx);
  case 80:
    return Type::getX86_FP80Ty(Ctx);
  case 128:
    return Type::getFP128Ty(Ctx);
  default:
    return nullptr;
  }
}

LegalizerHelper::LegalizerHelper(MachineFunction &MF,
                                 GISelChangeObserver &Observer,
                                 MachineIRBuilder &Builder)
    : MIRBuilder(Builder), Observer(Observer), MRI(MF.getRegInfo()),
      LI(*MF.getSubtarget().getLegalizerInfo()),
      TLI(*MF.getSubtarget().getTargetLowering()) { }

LegalizerHelper::LegalizerHelper(MachineFunction &MF, const LegalizerInfo &LI,
                                 GISelChangeObserver &Observer,
                                 MachineIRBuilder &B)
    : MIRBuilder(B), Observer(Observer), MRI(MF.getRegInfo()), LI(LI),
      TLI(*MF.getSubtarget().getTargetLowering()) { }

LegalizerHelper::LegalizeResult
LegalizerHelper::legalizeInstrStep(MachineInstr &MI,
                                   LostDebugLocObserver &LocObserver) {
  LLVM_DEBUG(dbgs() << "Legalizing: " << MI);

  MIRBuilder.setInstrAndDebugLoc(MI);

  if (MI.getOpcode() == TargetOpcode::G_INTRINSIC ||
      MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS)
    return LI.legalizeIntrinsic(*this, MI) ? Legalized : UnableToLegalize;
  auto Step = LI.getAction(MI, MRI);
  switch (Step.Action) {
  case Legal:
    LLVM_DEBUG(dbgs() << ".. Already legal\n");
    return AlreadyLegal;
  case Libcall:
    LLVM_DEBUG(dbgs() << ".. Convert to libcall\n");
    return libcall(MI, LocObserver);
  case NarrowScalar:
    LLVM_DEBUG(dbgs() << ".. Narrow scalar\n");
    return narrowScalar(MI, Step.TypeIdx, Step.NewType);
  case WidenScalar:
    LLVM_DEBUG(dbgs() << ".. Widen scalar\n");
    return widenScalar(MI, Step.TypeIdx, Step.NewType);
  case Bitcast:
    LLVM_DEBUG(dbgs() << ".. Bitcast type\n");
    return bitcast(MI, Step.TypeIdx, Step.NewType);
  case Lower:
    LLVM_DEBUG(dbgs() << ".. Lower\n");
    return lower(MI, Step.TypeIdx, Step.NewType);
  case FewerElements:
    LLVM_DEBUG(dbgs() << ".. Reduce number of elements\n");
    return fewerElementsVector(MI, Step.TypeIdx, Step.NewType);
  case MoreElements:
    LLVM_DEBUG(dbgs() << ".. Increase number of elements\n");
    return moreElementsVector(MI, Step.TypeIdx, Step.NewType);
  case Custom:
    LLVM_DEBUG(dbgs() << ".. Custom legalization\n");
    return LI.legalizeCustom(*this, MI) ? Legalized : UnableToLegalize;
  default:
    LLVM_DEBUG(dbgs() << ".. Unable to legalize\n");
    return UnableToLegalize;
  }
}

void LegalizerHelper::extractParts(Register Reg, LLT Ty, int NumParts,
                                   SmallVectorImpl<Register> &VRegs) {
  for (int i = 0; i < NumParts; ++i)
    VRegs.push_back(MRI.createGenericVirtualRegister(Ty));
  MIRBuilder.buildUnmerge(VRegs, Reg);
}

bool LegalizerHelper::extractParts(Register Reg, LLT RegTy,
                                   LLT MainTy, LLT &LeftoverTy,
                                   SmallVectorImpl<Register> &VRegs,
                                   SmallVectorImpl<Register> &LeftoverRegs) {
  assert(!LeftoverTy.isValid() && "this is an out argument");

  unsigned RegSize = RegTy.getSizeInBits();
  unsigned MainSize = MainTy.getSizeInBits();
  unsigned NumParts = RegSize / MainSize;
  unsigned LeftoverSize = RegSize - NumParts * MainSize;

  // Use an unmerge when possible.
  if (LeftoverSize == 0) {
    for (unsigned I = 0; I < NumParts; ++I)
      VRegs.push_back(MRI.createGenericVirtualRegister(MainTy));
    MIRBuilder.buildUnmerge(VRegs, Reg);
    return true;
  }

  if (MainTy.isVector()) {
    unsigned EltSize = MainTy.getScalarSizeInBits();
    if (LeftoverSize % EltSize != 0)
      return false;
    LeftoverTy = LLT::scalarOrVector(
        ElementCount::getFixed(LeftoverSize / EltSize), EltSize);
  } else {
    LeftoverTy = LLT::scalar(LeftoverSize);
  }

  // For irregular sizes, extract the individual parts.
  for (unsigned I = 0; I != NumParts; ++I) {
    Register NewReg = MRI.createGenericVirtualRegister(MainTy);
    VRegs.push_back(NewReg);
    MIRBuilder.buildExtract(NewReg, Reg, MainSize * I);
  }

  for (unsigned Offset = MainSize * NumParts; Offset < RegSize;
       Offset += LeftoverSize) {
    Register NewReg = MRI.createGenericVirtualRegister(LeftoverTy);
    LeftoverRegs.push_back(NewReg);
    MIRBuilder.buildExtract(NewReg, Reg, Offset);
  }

  return true;
}

void LegalizerHelper::insertParts(Register DstReg,
                                  LLT ResultTy, LLT PartTy,
                                  ArrayRef<Register> PartRegs,
                                  LLT LeftoverTy,
                                  ArrayRef<Register> LeftoverRegs) {
  if (!LeftoverTy.isValid()) {
    assert(LeftoverRegs.empty());

    if (!ResultTy.isVector()) {
      MIRBuilder.buildMerge(DstReg, PartRegs);
      return;
    }

    if (PartTy.isVector())
      MIRBuilder.buildConcatVectors(DstReg, PartRegs);
    else
      MIRBuilder.buildBuildVector(DstReg, PartRegs);
    return;
  }

  SmallVector<Register> GCDRegs;
  LLT GCDTy = getGCDType(getGCDType(ResultTy, LeftoverTy), PartTy);
  for (auto PartReg : concat<const Register>(PartRegs, LeftoverRegs))
    extractGCDType(GCDRegs, GCDTy, PartReg);
  LLT ResultLCMTy = buildLCMMergePieces(ResultTy, LeftoverTy, GCDTy, GCDRegs);
  buildWidenedRemergeToDst(DstReg, ResultLCMTy, GCDRegs);
}

/// Append the result registers of G_UNMERGE_VALUES \p MI to \p Regs.
static void getUnmergeResults(SmallVectorImpl<Register> &Regs,
                              const MachineInstr &MI) {
  assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES);

  const int StartIdx = Regs.size();
  const int NumResults = MI.getNumOperands() - 1;
  Regs.resize(Regs.size() + NumResults);
  for (int I = 0; I != NumResults; ++I)
    Regs[StartIdx + I] = MI.getOperand(I).getReg();
}

void LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts,
                                     LLT GCDTy, Register SrcReg) {
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy == GCDTy) {
    // If the source already evenly divides the result type, we don't need to do
    // anything.
    Parts.push_back(SrcReg);
  } else {
    // Need to split into common type sized pieces.
    auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
    getUnmergeResults(Parts, *Unmerge);
  }
}

LLT LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts, LLT DstTy,
                                    LLT NarrowTy, Register SrcReg) {
  LLT SrcTy = MRI.getType(SrcReg);
  LLT GCDTy = getGCDType(getGCDType(SrcTy, NarrowTy), DstTy);
  extractGCDType(Parts, GCDTy, SrcReg);
  return GCDTy;
}

LLT LegalizerHelper::buildLCMMergePieces(LLT DstTy, LLT NarrowTy, LLT GCDTy,
                                         SmallVectorImpl<Register> &VRegs,
                                         unsigned PadStrategy) {
  LLT LCMTy = getLCMType(DstTy, NarrowTy);

  int NumParts = LCMTy.getSizeInBits() / NarrowTy.getSizeInBits();
  int NumSubParts = NarrowTy.getSizeInBits() / GCDTy.getSizeInBits();
  int NumOrigSrc = VRegs.size();

  Register PadReg;

  // Get a value we can use to pad the source value if the sources won't evenly
  // cover the result type.
  if (NumOrigSrc < NumParts * NumSubParts) {
    if (PadStrategy == TargetOpcode::G_ZEXT)
      PadReg = MIRBuilder.buildConstant(GCDTy, 0).getReg(0);
    else if (PadStrategy == TargetOpcode::G_ANYEXT)
      PadReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
    else {
      assert(PadStrategy == TargetOpcode::G_SEXT);

      // Shift the sign bit of the low register through the high register.
      auto ShiftAmt =
          MIRBuilder.buildConstant(LLT::scalar(64), GCDTy.getSizeInBits() - 1);
      PadReg = MIRBuilder.buildAShr(GCDTy, VRegs.back(), ShiftAmt).getReg(0);
    }
  }

  // Registers for the final merge to be produced.
  SmallVector<Register, 4> Remerge(NumParts);

  // Registers needed for intermediate merges, which will be merged into a
  // source for Remerge.
  SmallVector<Register, 4> SubMerge(NumSubParts);

  // Once we've fully read off the end of the original source bits, we can reuse
  // the same high bits for remaining padding elements.
  Register AllPadReg;

  // Build merges to the LCM type to cover the original result type.
  for (int I = 0; I != NumParts; ++I) {
    bool AllMergePartsArePadding = true;

    // Build the requested merges to the requested type.
    for (int J = 0; J != NumSubParts; ++J) {
      int Idx = I * NumSubParts + J;
      if (Idx >= NumOrigSrc) {
        SubMerge[J] = PadReg;
        continue;
      }

      SubMerge[J] = VRegs[Idx];

      // There are meaningful bits here we can't reuse later.
      AllMergePartsArePadding = false;
    }

    // If we've filled up a complete piece with padding bits, we can directly
    // emit the natural sized constant if applicable, rather than a merge of
    // smaller constants.
    if (AllMergePartsArePadding && !AllPadReg) {
      if (PadStrategy == TargetOpcode::G_ANYEXT)
        AllPadReg = MIRBuilder.buildUndef(NarrowTy).getReg(0);
      else if (PadStrategy == TargetOpcode::G_ZEXT)
        AllPadReg = MIRBuilder.buildConstant(NarrowTy, 0).getReg(0);

      // If this is a sign extension, we can't materialize a trivial constant
      // with the right type and have to produce a merge.
    }

    if (AllPadReg) {
      // Avoid creating additional instructions if we're just adding additional
      // copies of padding bits.
      Remerge[I] = AllPadReg;
      continue;
    }

    if (NumSubParts == 1)
      Remerge[I] = SubMerge[0];
    else
      Remerge[I] = MIRBuilder.buildMerge(NarrowTy, SubMerge).getReg(0);

    // In the sign extend padding case, re-use the first all-signbit merge.
    if (AllMergePartsArePadding && !AllPadReg)
      AllPadReg = Remerge[I];
  }

  VRegs = std::move(Remerge);
  return LCMTy;
}

void LegalizerHelper::buildWidenedRemergeToDst(Register DstReg, LLT LCMTy,
                                               ArrayRef<Register> RemergeRegs) {
  LLT DstTy = MRI.getType(DstReg);

  // Create the merge to the widened source, and extract the relevant bits into
  // the result.

  if (DstTy == LCMTy) {
    MIRBuilder.buildMerge(DstReg, RemergeRegs);
    return;
  }

  auto Remerge = MIRBuilder.buildMerge(LCMTy, RemergeRegs);
  if (DstTy.isScalar() && LCMTy.isScalar()) {
    MIRBuilder.buildTrunc(DstReg, Remerge);
    return;
  }

  if (LCMTy.isVector()) {
    unsigned NumDefs = LCMTy.getSizeInBits() / DstTy.getSizeInBits();
    SmallVector<Register, 8> UnmergeDefs(NumDefs);
    UnmergeDefs[0] = DstReg;
    for (unsigned I = 1; I != NumDefs; ++I)
      UnmergeDefs[I] = MRI.createGenericVirtualRegister(DstTy);

    MIRBuilder.buildUnmerge(UnmergeDefs,
                            MIRBuilder.buildMerge(LCMTy, RemergeRegs));
    return;
  }

  llvm_unreachable("unhandled case");
}

static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) {
#define RTLIBCASE_INT(LibcallPrefix)                                           \
  do {                                                                         \
    switch (Size) {                                                            \
    case 32:                                                                   \
      return RTLIB::LibcallPrefix##32;                                         \
    case 64:                                                                   \
      return RTLIB::LibcallPrefix##64;                                         \
    case 128:                                                                  \
      return RTLIB::LibcallPrefix##128;                                        \
    default:                                                                   \
      llvm_unreachable("unexpected size");                                     \
    }                                                                          \
  } while (0)

#define RTLIBCASE(LibcallPrefix)                                               \
  do {                                                                         \
    switch (Size) {                                                            \
    case 32:                                                                   \
      return RTLIB::LibcallPrefix##32;                                         \
    case 64:                                                                   \
      return RTLIB::LibcallPrefix##64;                                         \
    case 80:                                                                   \
      return RTLIB::LibcallPrefix##80;                                         \
    case 128:                                                                  \
      return RTLIB::LibcallPrefix##128;                                        \
    default:                                                                   \
      llvm_unreachable("unexpected size");                                     \
    }                                                                          \
  } while (0)

  switch (Opcode) {
  case TargetOpcode::G_SDIV:
    RTLIBCASE_INT(SDIV_I);
  case TargetOpcode::G_UDIV:
    RTLIBCASE_INT(UDIV_I);
  case TargetOpcode::G_SREM:
    RTLIBCASE_INT(SREM_I);
  case TargetOpcode::G_UREM:
    RTLIBCASE_INT(UREM_I);
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
    RTLIBCASE_INT(CTLZ_I);
  case TargetOpcode::G_FADD:
    RTLIBCASE(ADD_F);
  case TargetOpcode::G_FSUB:
    RTLIBCASE(SUB_F);
  case TargetOpcode::G_FMUL:
    RTLIBCASE(MUL_F);
  case TargetOpcode::G_FDIV:
    RTLIBCASE(DIV_F);
  case TargetOpcode::G_FEXP:
    RTLIBCASE(EXP_F);
  case TargetOpcode::G_FEXP2:
    RTLIBCASE(EXP2_F);
  case TargetOpcode::G_FREM:
    RTLIBCASE(REM_F);
  case TargetOpcode::G_FPOW:
    RTLIBCASE(POW_F);
  case TargetOpcode::G_FMA:
    RTLIBCASE(FMA_F);
  case TargetOpcode::G_FSIN:
    RTLIBCASE(SIN_F);
  case TargetOpcode::G_FCOS:
    RTLIBCASE(COS_F);
  case TargetOpcode::G_FLOG10:
    RTLIBCASE(LOG10_F);
  case TargetOpcode::G_FLOG:
    RTLIBCASE(LOG_F);
  case TargetOpcode::G_FLOG2:
    RTLIBCASE(LOG2_F);
  case TargetOpcode::G_FCEIL:
    RTLIBCASE(CEIL_F);
  case TargetOpcode::G_FFLOOR:
    RTLIBCASE(FLOOR_F);
  case TargetOpcode::G_FMINNUM:
    RTLIBCASE(FMIN_F);
  case TargetOpcode::G_FMAXNUM:
    RTLIBCASE(FMAX_F);
  case TargetOpcode::G_FSQRT:
    RTLIBCASE(SQRT_F);
  case TargetOpcode::G_FRINT:
    RTLIBCASE(RINT_F);
  case TargetOpcode::G_FNEARBYINT:
    RTLIBCASE(NEARBYINT_F);
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
    RTLIBCASE(ROUNDEVEN_F);
  }
  llvm_unreachable("Unknown libcall function");
}

/// True if an instruction is in tail position in its caller. Intended for
/// legalizing libcalls as tail calls when possible.
static bool isLibCallInTailPosition(MachineInstr &MI,
                                    const TargetInstrInfo &TII,
                                    MachineRegisterInfo &MRI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const Function &F = MBB.getParent()->getFunction();

  // Conservatively require the attributes of the call to match those of
  // the return. Ignore NoAlias and NonNull because they don't affect the
  // call sequence.
  AttributeList CallerAttrs = F.getAttributes();
  if (AttrBuilder(CallerAttrs, AttributeList::ReturnIndex)
          .removeAttribute(Attribute::NoAlias)
          .removeAttribute(Attribute::NonNull)
          .hasAttributes())
    return false;

  // It's not safe to eliminate the sign / zero extension of the return value.
  if (CallerAttrs.hasRetAttr(Attribute::ZExt) ||
      CallerAttrs.hasRetAttr(Attribute::SExt))
    return false;

  // Only tail call if the following instruction is a standard return or if we
  // have a `thisreturn` callee, and a sequence like:
  //
  //   G_MEMCPY %0, %1, %2
  //   $x0 = COPY %0
  //   RET_ReallyLR implicit $x0
  auto Next = next_nodbg(MI.getIterator(), MBB.instr_end());
  if (Next != MBB.instr_end() && Next->isCopy()) {
    switch (MI.getOpcode()) {
    default:
      llvm_unreachable("unsupported opcode");
    case TargetOpcode::G_BZERO:
      return false;
    case TargetOpcode::G_MEMCPY:
    case TargetOpcode::G_MEMMOVE:
    case TargetOpcode::G_MEMSET:
      break;
    }

    Register VReg = MI.getOperand(0).getReg();
    if (!VReg.isVirtual() || VReg != Next->getOperand(1).getReg())
      return false;

    Register PReg = Next->getOperand(0).getReg();
    if (!PReg.isPhysical())
      return false;

    auto Ret = next_nodbg(Next, MBB.instr_end());
    if (Ret == MBB.instr_end() || !Ret->isReturn())
      return false;

    if (Ret->getNumImplicitOperands() != 1)
      return false;

    if (PReg != Ret->getOperand(0).getReg())
      return false;

    // Skip over the COPY that we just validated.
    Next = Ret;
  }

  if (Next == MBB.instr_end() || TII.isTailCall(*Next) || !Next->isReturn())
    return false;

  return true;
}

LegalizerHelper::LegalizeResult
llvm::createLibcall(MachineIRBuilder &MIRBuilder, const char *Name,
                    const CallLowering::ArgInfo &Result,
                    ArrayRef<CallLowering::ArgInfo> Args,
                    const CallingConv::ID CC) {
  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = CC;
  Info.Callee = MachineOperand::CreateES(Name);
  Info.OrigRet = Result;
  std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;

  return LegalizerHelper::Legalized;
}

LegalizerHelper::LegalizeResult
llvm::createLibcall(MachineIRBuilder &MIRBuilder, RTLIB::Libcall Libcall,
                    const CallLowering::ArgInfo &Result,
                    ArrayRef<CallLowering::ArgInfo> Args) {
  auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
  const char *Name = TLI.getLibcallName(Libcall);
  const CallingConv::ID CC = TLI.getLibcallCallingConv(Libcall);
  return createLibcall(MIRBuilder, Name, Result, Args, CC);
}

// Useful for libcalls where all operands have the same type.
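// For example (an illustrative sketch): a G_FREM on s64 operands maps to
// RTLIB::REM_F64, i.e. a call to fmod, with the result and both arguments
// given the IR type double.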
static LegalizerHelper::LegalizeResult
simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size,
              Type *OpType) {
  auto Libcall = getRTLibDesc(MI.getOpcode(), Size);

  // FIXME: What does the original arg index mean here?
  SmallVector<CallLowering::ArgInfo, 3> Args;
  for (unsigned i = 1; i < MI.getNumOperands(); i++)
    Args.push_back({MI.getOperand(i).getReg(), OpType, 0});
  return createLibcall(MIRBuilder, Libcall,
                       {MI.getOperand(0).getReg(), OpType, 0}, Args);
}

LegalizerHelper::LegalizeResult
llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
                       MachineInstr &MI, LostDebugLocObserver &LocObserver) {
  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();

  SmallVector<CallLowering::ArgInfo, 3> Args;
  // Add all the args, except for the last which is an imm denoting 'tail'.
  for (unsigned i = 0; i < MI.getNumOperands() - 1; ++i) {
    Register Reg = MI.getOperand(i).getReg();

    // Need to derive an IR type for call lowering.
    LLT OpLLT = MRI.getType(Reg);
    Type *OpTy = nullptr;
    if (OpLLT.isPointer())
      OpTy = Type::getInt8PtrTy(Ctx, OpLLT.getAddressSpace());
    else
      OpTy = IntegerType::get(Ctx, OpLLT.getSizeInBits());
    Args.push_back({Reg, OpTy, 0});
  }

  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
  auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
  RTLIB::Libcall RTLibcall;
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_BZERO:
    RTLibcall = RTLIB::BZERO;
    break;
  case TargetOpcode::G_MEMCPY:
    RTLibcall = RTLIB::MEMCPY;
    Args[0].Flags[0].setReturned();
    break;
  case TargetOpcode::G_MEMMOVE:
    RTLibcall = RTLIB::MEMMOVE;
    Args[0].Flags[0].setReturned();
    break;
  case TargetOpcode::G_MEMSET:
    RTLibcall = RTLIB::MEMSET;
    Args[0].Flags[0].setReturned();
    break;
  default:
    llvm_unreachable("unsupported opcode");
  }
  const char *Name = TLI.getLibcallName(RTLibcall);

  // Unsupported libcall on the target.
  if (!Name) {
    LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for "
                      << MIRBuilder.getTII().getName(Opc) << "\n");
    return LegalizerHelper::UnableToLegalize;
  }

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = TLI.getLibcallCallingConv(RTLibcall);
  Info.Callee = MachineOperand::CreateES(Name);
  Info.OrigRet = CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0);
  Info.IsTailCall = MI.getOperand(MI.getNumOperands() - 1).getImm() &&
                    isLibCallInTailPosition(MI, MIRBuilder.getTII(), MRI);

  std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;

  if (Info.LoweredTailCall) {
    assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?");

    // Check debug locations before removing the return.
    LocObserver.checkpoint(true);

    // We must have a return following the call (or debug insts) to get past
    // isLibCallInTailPosition.
    do {
      MachineInstr *Next = MI.getNextNode();
      assert(Next &&
             (Next->isCopy() || Next->isReturn() || Next->isDebugInstr()) &&
             "Expected instr following MI to be return or debug inst?");
      // We lowered a tail call, so the call is now the return from the block.
      // Delete the old return.
      Next->eraseFromParent();
    } while (MI.getNextNode());

    // We expect to lose the debug location from the return.
    LocObserver.checkpoint(false);
  }

  return LegalizerHelper::Legalized;
}

static RTLIB::Libcall getConvRTLibDesc(unsigned Opcode, Type *ToType,
                                       Type *FromType) {
  auto ToMVT = MVT::getVT(ToType);
  auto FromMVT = MVT::getVT(FromType);

  switch (Opcode) {
  case TargetOpcode::G_FPEXT:
    return RTLIB::getFPEXT(FromMVT, ToMVT);
  case TargetOpcode::G_FPTRUNC:
    return RTLIB::getFPROUND(FromMVT, ToMVT);
  case TargetOpcode::G_FPTOSI:
    return RTLIB::getFPTOSINT(FromMVT, ToMVT);
  case TargetOpcode::G_FPTOUI:
    return RTLIB::getFPTOUINT(FromMVT, ToMVT);
  case TargetOpcode::G_SITOFP:
    return RTLIB::getSINTTOFP(FromMVT, ToMVT);
  case TargetOpcode::G_UITOFP:
    return RTLIB::getUINTTOFP(FromMVT, ToMVT);
  }
  llvm_unreachable("Unsupported libcall function");
}

static LegalizerHelper::LegalizeResult
conversionLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, Type *ToType,
                  Type *FromType) {
  RTLIB::Libcall Libcall = getConvRTLibDesc(MI.getOpcode(), ToType, FromType);
  return createLibcall(MIRBuilder, Libcall,
                       {MI.getOperand(0).getReg(), ToType, 0},
                       {{MI.getOperand(1).getReg(), FromType, 0}});
}

LegalizerHelper::LegalizeResult
LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
  LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
  unsigned Size = LLTy.getSizeInBits();
  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();

  switch (MI.getOpcode()) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
    Type *HLTy = IntegerType::get(Ctx, Size);
    auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FADD:
  case TargetOpcode::G_FSUB:
  case TargetOpcode::G_FMUL:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_FMA:
  case TargetOpcode::G_FPOW:
  case TargetOpcode::G_FREM:
  case TargetOpcode::G_FCOS:
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FLOG10:
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG2:
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP2:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_FRINT:
  case TargetOpcode::G_FNEARBYINT:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
    Type *HLTy = getFloatTypeForLLT(Ctx, LLTy);
    if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
      LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
      return UnableToLegalize;
    }
    auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FPEXT:
  case TargetOpcode::G_FPTRUNC: {
    Type *FromTy =
        getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(1).getReg()));
    Type *ToTy =
        getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(0).getReg()));
    if (!FromTy || !ToTy)
      return UnableToLegalize;
    LegalizeResult Status = conversionLibcall(MI, MIRBuilder, ToTy, FromTy);
    if (Status != Legalized)
      return Status;
    break;
  }
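  // Illustrative sketch: with the default libcall names, a G_FPTOSI from s64
  // to s32 below maps through RTLIB::getFPTOSINT(f64, i32) to a call to
  // __fixdfsi.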
  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI: {
    // FIXME: Support other types
    unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    if ((ToSize != 32 && ToSize != 64) || (FromSize != 32 && FromSize != 64))
      return UnableToLegalize;
    LegalizeResult Status = conversionLibcall(
        MI, MIRBuilder,
        ToSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx),
        FromSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx));
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_SITOFP:
  case TargetOpcode::G_UITOFP: {
    // FIXME: Support other types
    unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    if ((FromSize != 32 && FromSize != 64) || (ToSize != 32 && ToSize != 64))
      return UnableToLegalize;
    LegalizeResult Status = conversionLibcall(
        MI, MIRBuilder,
        ToSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx),
        FromSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx));
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_BZERO:
  case TargetOpcode::G_MEMCPY:
  case TargetOpcode::G_MEMMOVE:
  case TargetOpcode::G_MEMSET: {
    LegalizeResult Result =
        createMemLibcall(MIRBuilder, *MIRBuilder.getMRI(), MI, LocObserver);
    if (Result != Legalized)
      return Result;
    MI.eraseFromParent();
    return Result;
  }
  }

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
                                                              unsigned TypeIdx,
                                                              LLT NarrowTy) {
  uint64_t SizeOp0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
  uint64_t NarrowSize = NarrowTy.getSizeInBits();

  switch (MI.getOpcode()) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_IMPLICIT_DEF: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);

    // If SizeOp0 is not an exact multiple of NarrowSize, emit
    // G_ANYEXT(G_IMPLICIT_DEF). Cast result to vector if needed.
    // FIXME: Although this would also be legal for the general case, it causes
    // a lot of regressions in the emitted code (superfluous COPYs, artifact
    // combines not being hit). This seems to be a problem related to the
    // artifact combiner.
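    // E.g. (sketch): narrowing %0:_(s33) = G_IMPLICIT_DEF with NarrowTy s32
    // emits an s32 G_IMPLICIT_DEF that is G_ANYEXTed back to s33.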
    if (SizeOp0 % NarrowSize != 0) {
      LLT ImplicitTy = NarrowTy;
      if (DstTy.isVector())
        ImplicitTy = LLT::vector(DstTy.getElementCount(), ImplicitTy);

      Register ImplicitReg = MIRBuilder.buildUndef(ImplicitTy).getReg(0);
      MIRBuilder.buildAnyExt(DstReg, ImplicitReg);

      MI.eraseFromParent();
      return Legalized;
    }

    int NumParts = SizeOp0 / NarrowSize;

    SmallVector<Register, 2> DstRegs;
    for (int i = 0; i < NumParts; ++i)
      DstRegs.push_back(MIRBuilder.buildUndef(NarrowTy).getReg(0));

    if (DstTy.isVector())
      MIRBuilder.buildBuildVector(DstReg, DstRegs);
    else
      MIRBuilder.buildMerge(DstReg, DstRegs);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_CONSTANT: {
    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
    const APInt &Val = MI.getOperand(1).getCImm()->getValue();
    unsigned TotalSize = Ty.getSizeInBits();
    unsigned NarrowSize = NarrowTy.getSizeInBits();
    int NumParts = TotalSize / NarrowSize;

    SmallVector<Register, 4> PartRegs;
    for (int I = 0; I != NumParts; ++I) {
      unsigned Offset = I * NarrowSize;
      auto K = MIRBuilder.buildConstant(NarrowTy,
                                        Val.lshr(Offset).trunc(NarrowSize));
      PartRegs.push_back(K.getReg(0));
    }

    LLT LeftoverTy;
    unsigned LeftoverBits = TotalSize - NumParts * NarrowSize;
    SmallVector<Register, 1> LeftoverRegs;
    if (LeftoverBits != 0) {
      LeftoverTy = LLT::scalar(LeftoverBits);
      auto K = MIRBuilder.buildConstant(
          LeftoverTy,
          Val.lshr(NumParts * NarrowSize).trunc(LeftoverBits));
      LeftoverRegs.push_back(K.getReg(0));
    }

    insertParts(MI.getOperand(0).getReg(),
                Ty, NarrowTy, PartRegs, LeftoverTy, LeftoverRegs);

    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_SEXT:
  case TargetOpcode::G_ZEXT:
  case TargetOpcode::G_ANYEXT:
    return narrowScalarExt(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_TRUNC: {
    if (TypeIdx != 1)
      return UnableToLegalize;

    uint64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    if (NarrowTy.getSizeInBits() * 2 != SizeOp1) {
      LLVM_DEBUG(dbgs() << "Can't narrow trunc to type " << NarrowTy << "\n");
      return UnableToLegalize;
    }

    auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));
    MIRBuilder.buildCopy(MI.getOperand(0), Unmerge.getReg(0));
    MI.eraseFromParent();
    return Legalized;
  }

  case TargetOpcode::G_FREEZE:
    return reduceOperationWidth(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_ADD:
  case TargetOpcode::G_SUB:
  case TargetOpcode::G_SADDO:
  case TargetOpcode::G_SSUBO:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_SSUBE:
  case TargetOpcode::G_UADDO:
  case TargetOpcode::G_USUBO:
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
    return narrowScalarAddSub(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_UMULH:
    return narrowScalarMul(MI, NarrowTy);
  case TargetOpcode::G_EXTRACT:
    return narrowScalarExtract(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_INSERT:
    return narrowScalarInsert(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_LOAD: {
    auto &LoadMI = cast<GLoad>(MI);
    Register DstReg = LoadMI.getDstReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy.isVector())
      return UnableToLegalize;

    if (8 * LoadMI.getMemSize() != DstTy.getSizeInBits()) {
      Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
      MIRBuilder.buildLoad(TmpReg, LoadMI.getPointerReg(), LoadMI.getMMO());
      MIRBuilder.buildAnyExt(DstReg, TmpReg);
      LoadMI.eraseFromParent();
      return Legalized;
    }

    return reduceLoadStoreWidth(LoadMI, TypeIdx, NarrowTy);
  }
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_SEXTLOAD: {
    auto &LoadMI = cast<GExtLoad>(MI);
    Register DstReg = LoadMI.getDstReg();
    Register PtrReg = LoadMI.getPointerReg();

    Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
    auto &MMO = LoadMI.getMMO();
    unsigned MemSize = MMO.getSizeInBits();

    if (MemSize == NarrowSize) {
      MIRBuilder.buildLoad(TmpReg, PtrReg, MMO);
    } else if (MemSize < NarrowSize) {
      MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), TmpReg, PtrReg, MMO);
    } else if (MemSize > NarrowSize) {
      // FIXME: Need to split the load.
      return UnableToLegalize;
    }

    if (isa<GZExtLoad>(LoadMI))
      MIRBuilder.buildZExt(DstReg, TmpReg);
    else
      MIRBuilder.buildSExt(DstReg, TmpReg);

    LoadMI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_STORE: {
    auto &StoreMI = cast<GStore>(MI);

    Register SrcReg = StoreMI.getValueReg();
    LLT SrcTy = MRI.getType(SrcReg);
    if (SrcTy.isVector())
      return UnableToLegalize;

    int NumParts = SizeOp0 / NarrowSize;
    unsigned HandledSize = NumParts * NarrowTy.getSizeInBits();
    unsigned LeftoverBits = SrcTy.getSizeInBits() - HandledSize;
    if (SrcTy.isVector() && LeftoverBits != 0)
      return UnableToLegalize;

    if (8 * StoreMI.getMemSize() != SrcTy.getSizeInBits()) {
      Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
      MIRBuilder.buildTrunc(TmpReg, SrcReg);
      MIRBuilder.buildStore(TmpReg, StoreMI.getPointerReg(), StoreMI.getMMO());
      StoreMI.eraseFromParent();
      return Legalized;
    }

    return reduceLoadStoreWidth(StoreMI, 0, NarrowTy);
  }
  case TargetOpcode::G_SELECT:
    return narrowScalarSelect(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    // Legalize bitwise operation:
    // A = BinOp<Ty> B, C
    // into:
    // B1, ..., BN = G_UNMERGE_VALUES B
    // C1, ..., CN = G_UNMERGE_VALUES C
    // A1 = BinOp<Ty/N> B1, C1
    // ...
    // AN = BinOp<Ty/N> BN, CN
    // A = G_MERGE_VALUES A1, ..., AN
    return narrowScalarBasic(MI, TypeIdx, NarrowTy);
  }
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    return narrowScalarShift(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
  case TargetOpcode::G_CTTZ:
  case TargetOpcode::G_CTTZ_ZERO_UNDEF:
  case TargetOpcode::G_CTPOP:
    if (TypeIdx == 1)
      switch (MI.getOpcode()) {
      case TargetOpcode::G_CTLZ:
      case TargetOpcode::G_CTLZ_ZERO_UNDEF:
        return narrowScalarCTLZ(MI, TypeIdx, NarrowTy);
      case TargetOpcode::G_CTTZ:
      case TargetOpcode::G_CTTZ_ZERO_UNDEF:
        return narrowScalarCTTZ(MI, TypeIdx, NarrowTy);
      case TargetOpcode::G_CTPOP:
        return narrowScalarCTPOP(MI, TypeIdx, NarrowTy);
      default:
        return UnableToLegalize;
      }

    Observer.changingInstr(MI);
    narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_INTTOPTR:
    if (TypeIdx != 1)
      return UnableToLegalize;

    Observer.changingInstr(MI);
    narrowScalarSrc(MI, NarrowTy, 1);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_PTRTOINT:
    if (TypeIdx != 0)
      return UnableToLegalize;

    Observer.changingInstr(MI);
    narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_PHI: {
    // FIXME: add support for when SizeOp0 isn't an exact multiple of
    // NarrowSize.
    if (SizeOp0 % NarrowSize != 0)
      return UnableToLegalize;

    unsigned NumParts = SizeOp0 / NarrowSize;
    SmallVector<Register, 2> DstRegs(NumParts);
    SmallVector<SmallVector<Register, 2>, 2> SrcRegs(MI.getNumOperands() / 2);
    Observer.changingInstr(MI);
    for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
      MachineBasicBlock &OpMBB = *MI.getOperand(i + 1).getMBB();
      MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
      extractParts(MI.getOperand(i).getReg(), NarrowTy, NumParts,
                   SrcRegs[i / 2]);
    }
    MachineBasicBlock &MBB = *MI.getParent();
    MIRBuilder.setInsertPt(MBB, MI);
    for (unsigned i = 0; i < NumParts; ++i) {
      DstRegs[i] = MRI.createGenericVirtualRegister(NarrowTy);
      MachineInstrBuilder MIB =
          MIRBuilder.buildInstr(TargetOpcode::G_PHI).addDef(DstRegs[i]);
      for (unsigned j = 1; j < MI.getNumOperands(); j += 2)
        MIB.addUse(SrcRegs[j / 2][i]).add(MI.getOperand(j + 1));
    }
    MIRBuilder.setInsertPt(MBB, MBB.getFirstNonPHI());
    MIRBuilder.buildMerge(MI.getOperand(0), DstRegs);
    Observer.changedInstr(MI);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
  case TargetOpcode::G_INSERT_VECTOR_ELT: {
    if (TypeIdx != 2)
      return UnableToLegalize;

    int OpIdx = MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT ? 2 : 3;
    Observer.changingInstr(MI);
    narrowScalarSrc(MI, NarrowTy, OpIdx);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_ICMP: {
    Register LHS = MI.getOperand(2).getReg();
    LLT SrcTy = MRI.getType(LHS);
    uint64_t SrcSize = SrcTy.getSizeInBits();
    CmpInst::Predicate Pred =
        static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());

    // TODO: Handle the non-equality case for weird sizes.
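    // Sketch of the equality expansion below: an s64 G_ICMP eq split with
    // NarrowTy s32 XORs the low and high halves separately, ORs the XOR
    // results together, and compares the combined value against zero.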
    if (NarrowSize * 2 != SrcSize && !ICmpInst::isEquality(Pred))
      return UnableToLegalize;

    LLT LeftoverTy; // Example: s88 -> s64 (NarrowTy) + s24 (leftover)
    SmallVector<Register, 4> LHSPartRegs, LHSLeftoverRegs;
    if (!extractParts(LHS, SrcTy, NarrowTy, LeftoverTy, LHSPartRegs,
                      LHSLeftoverRegs))
      return UnableToLegalize;

    LLT Unused; // Matches LeftoverTy; G_ICMP LHS and RHS are the same type.
    SmallVector<Register, 4> RHSPartRegs, RHSLeftoverRegs;
    if (!extractParts(MI.getOperand(3).getReg(), SrcTy, NarrowTy, Unused,
                      RHSPartRegs, RHSLeftoverRegs))
      return UnableToLegalize;

    // We now have the LHS and RHS of the compare split into narrow-type
    // registers, plus potentially some leftover type.
    Register Dst = MI.getOperand(0).getReg();
    LLT ResTy = MRI.getType(Dst);
    if (ICmpInst::isEquality(Pred)) {
      // For each part on the LHS and RHS, keep track of the result of XOR-ing
      // them together. For each equal part, the result should be all 0s. For
      // each non-equal part, we'll get at least one 1.
      auto Zero = MIRBuilder.buildConstant(NarrowTy, 0);
      SmallVector<Register, 4> Xors;
      for (auto LHSAndRHS : zip(LHSPartRegs, RHSPartRegs)) {
        auto LHS = std::get<0>(LHSAndRHS);
        auto RHS = std::get<1>(LHSAndRHS);
        auto Xor = MIRBuilder.buildXor(NarrowTy, LHS, RHS).getReg(0);
        Xors.push_back(Xor);
      }

      // Build a G_XOR for each leftover register. Each G_XOR must be widened
      // to the desired narrow type so that we can OR them together later.
      SmallVector<Register, 4> WidenedXors;
      for (auto LHSAndRHS : zip(LHSLeftoverRegs, RHSLeftoverRegs)) {
        auto LHS = std::get<0>(LHSAndRHS);
        auto RHS = std::get<1>(LHSAndRHS);
        auto Xor = MIRBuilder.buildXor(LeftoverTy, LHS, RHS).getReg(0);
        LLT GCDTy = extractGCDType(WidenedXors, NarrowTy, LeftoverTy, Xor);
        buildLCMMergePieces(LeftoverTy, NarrowTy, GCDTy, WidenedXors,
                            /* PadStrategy = */ TargetOpcode::G_ZEXT);
        Xors.insert(Xors.end(), WidenedXors.begin(), WidenedXors.end());
      }

      // Now, for each part we broke up, we know if they are equal/not equal
      // based off the G_XOR. We can OR these all together and compare against
      // 0 to get the result.
      assert(Xors.size() >= 2 && "Should have gotten at least two Xors?");
      auto Or = MIRBuilder.buildOr(NarrowTy, Xors[0], Xors[1]);
      for (unsigned I = 2, E = Xors.size(); I < E; ++I)
        Or = MIRBuilder.buildOr(NarrowTy, Or, Xors[I]);
      MIRBuilder.buildICmp(Pred, Dst, Or, Zero);
    } else {
      // TODO: Handle non-power-of-two types.
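      // Sketch (illustrative): for an s64 ult compare via s32 halves, the
      // select built below computes
      //   Dst = (LHSH == RHSH) ? (LHSL u< RHSL) : (LHSH u< RHSH).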
      assert(LHSPartRegs.size() == 2 && "Expected exactly 2 LHS part regs?");
      assert(RHSPartRegs.size() == 2 && "Expected exactly 2 RHS part regs?");
      Register LHSL = LHSPartRegs[0];
      Register LHSH = LHSPartRegs[1];
      Register RHSL = RHSPartRegs[0];
      Register RHSH = RHSPartRegs[1];
      MachineInstrBuilder CmpH = MIRBuilder.buildICmp(Pred, ResTy, LHSH, RHSH);
      MachineInstrBuilder CmpHEQ =
          MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, ResTy, LHSH, RHSH);
      MachineInstrBuilder CmpLU = MIRBuilder.buildICmp(
          ICmpInst::getUnsignedPredicate(Pred), ResTy, LHSL, RHSL);
      MIRBuilder.buildSelect(Dst, CmpHEQ, CmpLU, CmpH);
    }
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_SEXT_INREG: {
    if (TypeIdx != 0)
      return UnableToLegalize;

    int64_t SizeInBits = MI.getOperand(2).getImm();

    // So long as the new type has more bits than the bits we're extending we
    // don't need to break it apart.
    if (NarrowTy.getScalarSizeInBits() >= SizeInBits) {
      Observer.changingInstr(MI);
      // We don't lose any non-extension bits by truncating the src and
      // sign-extending the dst.
      MachineOperand &MO1 = MI.getOperand(1);
      auto TruncMIB = MIRBuilder.buildTrunc(NarrowTy, MO1);
      MO1.setReg(TruncMIB.getReg(0));

      MachineOperand &MO2 = MI.getOperand(0);
      Register DstExt = MRI.createGenericVirtualRegister(NarrowTy);
      MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
      MIRBuilder.buildSExt(MO2, DstExt);
      MO2.setReg(DstExt);
      Observer.changedInstr(MI);
      return Legalized;
    }

    // Break it apart. Components below the extension point are unmodified. The
    // component containing the extension point becomes a narrower SEXT_INREG.
    // Components above it are ashr'd from the component containing the
    // extension point.
    if (SizeOp0 % NarrowSize != 0)
      return UnableToLegalize;
    int NumParts = SizeOp0 / NarrowSize;

    // List the registers where the destination will be scattered.
    SmallVector<Register, 2> DstRegs;
    // List the registers where the source will be split.
    SmallVector<Register, 2> SrcRegs;

    // Create all the temporary registers.
    for (int i = 0; i < NumParts; ++i) {
      Register SrcReg = MRI.createGenericVirtualRegister(NarrowTy);

      SrcRegs.push_back(SrcReg);
    }

    // Explode the big arguments into smaller chunks.
    MIRBuilder.buildUnmerge(SrcRegs, MI.getOperand(1));

    Register AshrCstReg =
        MIRBuilder.buildConstant(NarrowTy, NarrowTy.getScalarSizeInBits() - 1)
            .getReg(0);
    Register FullExtensionReg = 0;
    Register PartialExtensionReg = 0;

    // Do the operation on each small part.
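    // E.g. (sketch): s96 = G_SEXT_INREG %src, 40 with NarrowTy s32: part 0
    // passes through unmodified, part 1 becomes a G_SEXT_INREG with 8 bits
    // (40 % 32), and part 2 is an ashr of part 1 by 31.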
    for (int i = 0; i < NumParts; ++i) {
      if ((i + 1) * NarrowTy.getScalarSizeInBits() < SizeInBits)
        DstRegs.push_back(SrcRegs[i]);
      else if (i * NarrowTy.getScalarSizeInBits() > SizeInBits) {
        assert(PartialExtensionReg &&
               "Expected to visit partial extension before full");
        if (FullExtensionReg) {
          DstRegs.push_back(FullExtensionReg);
          continue;
        }
        DstRegs.push_back(
            MIRBuilder.buildAShr(NarrowTy, PartialExtensionReg, AshrCstReg)
                .getReg(0));
        FullExtensionReg = DstRegs.back();
      } else {
        DstRegs.push_back(
            MIRBuilder
                .buildInstr(
                    TargetOpcode::G_SEXT_INREG, {NarrowTy},
                    {SrcRegs[i], SizeInBits % NarrowTy.getScalarSizeInBits()})
                .getReg(0));
        PartialExtensionReg = DstRegs.back();
      }
    }

    // Gather the destination registers into the final destination.
    Register DstReg = MI.getOperand(0).getReg();
    MIRBuilder.buildMerge(DstReg, DstRegs);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_BSWAP:
  case TargetOpcode::G_BITREVERSE: {
    if (SizeOp0 % NarrowSize != 0)
      return UnableToLegalize;

    Observer.changingInstr(MI);
    SmallVector<Register, 2> SrcRegs, DstRegs;
    unsigned NumParts = SizeOp0 / NarrowSize;
    extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs);

    for (unsigned i = 0; i < NumParts; ++i) {
      auto DstPart = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
                                           {SrcRegs[NumParts - 1 - i]});
      DstRegs.push_back(DstPart.getReg(0));
    }

    MIRBuilder.buildMerge(MI.getOperand(0), DstRegs);

    Observer.changedInstr(MI);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_PTR_ADD:
  case TargetOpcode::G_PTRMASK: {
    if (TypeIdx != 1)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    narrowScalarSrc(MI, NarrowTy, 2);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_FPTOUI:
  case TargetOpcode::G_FPTOSI:
    return narrowScalarFPTOI(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_FPEXT:
    if (TypeIdx != 0)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_FPEXT);
    Observer.changedInstr(MI);
    return Legalized;
  }
}

Register LegalizerHelper::coerceToScalar(Register Val) {
  LLT Ty = MRI.getType(Val);
  if (Ty.isScalar())
    return Val;

  const DataLayout &DL = MIRBuilder.getDataLayout();
  LLT NewTy = LLT::scalar(Ty.getSizeInBits());
  if (Ty.isPointer()) {
    if (DL.isNonIntegralAddressSpace(Ty.getAddressSpace()))
      return Register();
    return MIRBuilder.buildPtrToInt(NewTy, Val).getReg(0);
  }

  Register NewVal = Val;

  assert(Ty.isVector());
  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    NewVal = MIRBuilder.buildPtrToInt(NewTy, NewVal).getReg(0);
  return MIRBuilder.buildBitcast(NewTy, NewVal).getReg(0);
}

void LegalizerHelper::widenScalarSrc(MachineInstr &MI, LLT WideTy,
                                     unsigned OpIdx, unsigned ExtOpcode) {
  MachineOperand &MO = MI.getOperand(OpIdx);
  auto ExtB = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MO});
  MO.setReg(ExtB.getReg(0));
}

void LegalizerHelper::narrowScalarSrc(MachineInstr &MI, LLT NarrowTy,
                                      unsigned OpIdx) {
  MachineOperand &MO = MI.getOperand(OpIdx);
  auto ExtB = MIRBuilder.buildTrunc(NarrowTy, MO);
  MO.setReg(ExtB.getReg(0));
}

void LegalizerHelper::widenScalarDst(MachineInstr &MI, LLT WideTy,
                                     unsigned OpIdx, unsigned TruncOpcode) {
  MachineOperand &MO = MI.getOperand(OpIdx);
  Register DstExt = MRI.createGenericVirtualRegister(WideTy);
  MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
  MIRBuilder.buildInstr(TruncOpcode, {MO}, {DstExt});
  MO.setReg(DstExt);
}

void LegalizerHelper::narrowScalarDst(MachineInstr &MI, LLT NarrowTy,
                                      unsigned OpIdx, unsigned ExtOpcode) {
  MachineOperand &MO = MI.getOperand(OpIdx);
  Register DstTrunc = MRI.createGenericVirtualRegister(NarrowTy);
  MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
  MIRBuilder.buildInstr(ExtOpcode, {MO}, {DstTrunc});
  MO.setReg(DstTrunc);
}

void LegalizerHelper::moreElementsVectorDst(MachineInstr &MI, LLT WideTy,
                                            unsigned OpIdx) {
  MachineOperand &MO = MI.getOperand(OpIdx);
  MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
  MO.setReg(widenWithUnmerge(WideTy, MO.getReg()));
}

void LegalizerHelper::moreElementsVectorSrc(MachineInstr &MI, LLT MoreTy,
                                            unsigned OpIdx) {
  MachineOperand &MO = MI.getOperand(OpIdx);

  LLT OldTy = MRI.getType(MO.getReg());
  unsigned OldElts = OldTy.getNumElements();
  unsigned NewElts = MoreTy.getNumElements();

  unsigned NumParts = NewElts / OldElts;

  // Use concat_vectors if the result is a multiple of the number of elements.
  if (NumParts * OldElts == NewElts) {
    SmallVector<Register, 8> Parts;
    Parts.push_back(MO.getReg());

    Register ImpDef = MIRBuilder.buildUndef(OldTy).getReg(0);
    for (unsigned I = 1; I != NumParts; ++I)
      Parts.push_back(ImpDef);

    auto Concat = MIRBuilder.buildConcatVectors(MoreTy, Parts);
    MO.setReg(Concat.getReg(0));
    return;
  }

  Register MoreReg = MRI.createGenericVirtualRegister(MoreTy);
  Register ImpDef = MIRBuilder.buildUndef(MoreTy).getReg(0);
  MIRBuilder.buildInsert(MoreReg, ImpDef, MO.getReg(), 0);
  MO.setReg(MoreReg);
}

void LegalizerHelper::bitcastSrc(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
  MachineOperand &Op = MI.getOperand(OpIdx);
  Op.setReg(MIRBuilder.buildBitcast(CastTy, Op).getReg(0));
}

void LegalizerHelper::bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
  MachineOperand &MO = MI.getOperand(OpIdx);
  Register CastDst = MRI.createGenericVirtualRegister(CastTy);
  MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
  MIRBuilder.buildBitcast(MO, CastDst);
  MO.setReg(CastDst);
}

LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx,
                                        LLT WideTy) {
  if (TypeIdx != 1)
    return UnableToLegalize;

  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  if (DstTy.isVector())
    return UnableToLegalize;

  Register Src1 = MI.getOperand(1).getReg();
  LLT SrcTy = MRI.getType(Src1);
  const int DstSize = DstTy.getSizeInBits();
  const int SrcSize = SrcTy.getSizeInBits();
  const int WideSize = WideTy.getSizeInBits();
  const int NumMerge = (DstSize + WideSize - 1) / WideSize;

  unsigned NumOps = MI.getNumOperands();
  unsigned NumSrc = MI.getNumOperands() - 1;
  unsigned PartSize = DstTy.getSizeInBits() / NumSrc;

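  // Packing sketch (illustrative): widening %0:_(s16) = G_MERGE_VALUES
  // %a:_(s8), %b:_(s8) to s32 zero-extends each part, shifts it into
  // position, ORs the pieces together, and truncates back to s16.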
  if (WideSize >= DstSize) {
    // Directly pack the bits in the target type.
    Register ResultReg = MIRBuilder.buildZExt(WideTy, Src1).getReg(0);

    for (unsigned I = 2; I != NumOps; ++I) {
      const unsigned Offset = (I - 1) * PartSize;

      Register SrcReg = MI.getOperand(I).getReg();
      assert(MRI.getType(SrcReg) == LLT::scalar(PartSize));

      auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);

      Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
        MRI.createGenericVirtualRegister(WideTy);

      auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
      auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
      MIRBuilder.buildOr(NextResult, ResultReg, Shl);
      ResultReg = NextResult;
    }

    if (WideSize > DstSize)
      MIRBuilder.buildTrunc(DstReg, ResultReg);
    else if (DstTy.isPointer())
      MIRBuilder.buildIntToPtr(DstReg, ResultReg);

    MI.eraseFromParent();
    return Legalized;
  }

  // Unmerge the original values to the GCD type, and recombine to the next
  // multiple greater than the original type.
  //
  // %3:_(s12) = G_MERGE_VALUES %0:_(s4), %1:_(s4), %2:_(s4) -> s6
  // %4:_(s2), %5:_(s2) = G_UNMERGE_VALUES %0
  // %6:_(s2), %7:_(s2) = G_UNMERGE_VALUES %1
  // %8:_(s2), %9:_(s2) = G_UNMERGE_VALUES %2
  // %10:_(s6) = G_MERGE_VALUES %4, %5, %6
  // %11:_(s6) = G_MERGE_VALUES %7, %8, %9
  // %12:_(s12) = G_MERGE_VALUES %10, %11
  //
  // Padding with undef if necessary:
  //
  // %2:_(s8) = G_MERGE_VALUES %0:_(s4), %1:_(s4) -> s6
  // %3:_(s2), %4:_(s2) = G_UNMERGE_VALUES %0
  // %5:_(s2), %6:_(s2) = G_UNMERGE_VALUES %1
  // %7:_(s2) = G_IMPLICIT_DEF
  // %8:_(s6) = G_MERGE_VALUES %3, %4, %5
  // %9:_(s6) = G_MERGE_VALUES %6, %7, %7
  // %10:_(s12) = G_MERGE_VALUES %8, %9

  const int GCD = greatestCommonDivisor(SrcSize, WideSize);
  LLT GCDTy = LLT::scalar(GCD);

  SmallVector<Register, 8> Parts;
  SmallVector<Register, 8> NewMergeRegs;
  SmallVector<Register, 8> Unmerges;
  LLT WideDstTy = LLT::scalar(NumMerge * WideSize);

  // Decompose the original operands if they don't evenly divide.
  for (int I = 1, E = MI.getNumOperands(); I != E; ++I) {
    Register SrcReg = MI.getOperand(I).getReg();
    if (GCD == SrcSize) {
      Unmerges.push_back(SrcReg);
    } else {
      auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
      for (int J = 0, JE = Unmerge->getNumOperands() - 1; J != JE; ++J)
        Unmerges.push_back(Unmerge.getReg(J));
    }
  }

  // Pad with undef to the next size that is a multiple of the requested size.
  if (static_cast<int>(Unmerges.size()) != NumMerge * WideSize) {
    Register UndefReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
    for (int I = Unmerges.size(); I != NumMerge * WideSize; ++I)
      Unmerges.push_back(UndefReg);
  }

  const int PartsPerGCD = WideSize / GCD;

  // Build merges of each piece.
  ArrayRef<Register> Slicer(Unmerges);
  for (int I = 0; I != NumMerge; ++I, Slicer = Slicer.drop_front(PartsPerGCD)) {
    auto Merge = MIRBuilder.buildMerge(WideTy, Slicer.take_front(PartsPerGCD));
    NewMergeRegs.push_back(Merge.getReg(0));
  }

  // A truncate may be necessary if the requested type doesn't evenly divide the
  // original result type.
  if (DstTy.getSizeInBits() == WideDstTy.getSizeInBits()) {
    MIRBuilder.buildMerge(DstReg, NewMergeRegs);
  } else {
    auto FinalMerge = MIRBuilder.buildMerge(WideDstTy, NewMergeRegs);
    MIRBuilder.buildTrunc(DstReg, FinalMerge.getReg(0));
  }

  MI.eraseFromParent();
  return Legalized;
}

Register LegalizerHelper::widenWithUnmerge(LLT WideTy, Register OrigReg) {
  Register WideReg = MRI.createGenericVirtualRegister(WideTy);
  LLT OrigTy = MRI.getType(OrigReg);
  LLT LCMTy = getLCMType(WideTy, OrigTy);

  const int NumMergeParts = LCMTy.getSizeInBits() / WideTy.getSizeInBits();
  const int NumUnmergeParts = LCMTy.getSizeInBits() / OrigTy.getSizeInBits();

  Register UnmergeSrc = WideReg;

  // Create a merge to the LCM type, padding with undef
  // %0:_(<3 x s32>) = G_FOO => <4 x s32>
  // =>
  // %1:_(<4 x s32>) = G_FOO
  // %2:_(<4 x s32>) = G_IMPLICIT_DEF
  // %3:_(<12 x s32>) = G_CONCAT_VECTORS %1, %2, %2
  // %0:_(<3 x s32>), %4:_, %5:_, %6:_ = G_UNMERGE_VALUES %3
  if (NumMergeParts > 1) {
    Register Undef = MIRBuilder.buildUndef(WideTy).getReg(0);
    SmallVector<Register, 8> MergeParts(NumMergeParts, Undef);
    MergeParts[0] = WideReg;
    UnmergeSrc = MIRBuilder.buildMerge(LCMTy, MergeParts).getReg(0);
  }

  // Unmerge to the original register and pad with dead defs.
  SmallVector<Register, 8> UnmergeResults(NumUnmergeParts);
  UnmergeResults[0] = OrigReg;
  for (int I = 1; I != NumUnmergeParts; ++I)
    UnmergeResults[I] = MRI.createGenericVirtualRegister(OrigTy);

  MIRBuilder.buildUnmerge(UnmergeResults, UnmergeSrc);
  return WideReg;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx,
                                          LLT WideTy) {
  if (TypeIdx != 0)
    return UnableToLegalize;

  int NumDst = MI.getNumOperands() - 1;
  Register SrcReg = MI.getOperand(NumDst).getReg();
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy.isVector())
    return UnableToLegalize;

  Register Dst0Reg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst0Reg);
  if (!DstTy.isScalar())
    return UnableToLegalize;

  if (WideTy.getSizeInBits() >= SrcTy.getSizeInBits()) {
    if (SrcTy.isPointer()) {
      const DataLayout &DL = MIRBuilder.getDataLayout();
      if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace())) {
        LLVM_DEBUG(
            dbgs() << "Not casting non-integral address space integer\n");
        return UnableToLegalize;
      }

      SrcTy = LLT::scalar(SrcTy.getSizeInBits());
      SrcReg = MIRBuilder.buildPtrToInt(SrcTy, SrcReg).getReg(0);
    }

    // Widen SrcTy to WideTy. This does not affect the result, but since the
    // user requested this size, it is probably better handled than SrcTy and
    // should reduce the total number of legalization artifacts.
    if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
      SrcTy = WideTy;
      SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
    }

Register LegalizerHelper::widenWithUnmerge(LLT WideTy, Register OrigReg) {
  Register WideReg = MRI.createGenericVirtualRegister(WideTy);
  LLT OrigTy = MRI.getType(OrigReg);
  LLT LCMTy = getLCMType(WideTy, OrigTy);

  const int NumMergeParts = LCMTy.getSizeInBits() / WideTy.getSizeInBits();
  const int NumUnmergeParts = LCMTy.getSizeInBits() / OrigTy.getSizeInBits();

  Register UnmergeSrc = WideReg;

  // Create a merge to the LCM type, padding with undef.
  // %0:_(<3 x s32>) = G_FOO => <4 x s32>
  // =>
  // %1:_(<4 x s32>) = G_FOO
  // %2:_(<4 x s32>) = G_IMPLICIT_DEF
  // %3:_(<12 x s32>) = G_CONCAT_VECTORS %1, %2, %2
  // %0:_(<3 x s32>), %4:_, %5:_, %6:_ = G_UNMERGE_VALUES %3
  if (NumMergeParts > 1) {
    Register Undef = MIRBuilder.buildUndef(WideTy).getReg(0);
    SmallVector<Register, 8> MergeParts(NumMergeParts, Undef);
    MergeParts[0] = WideReg;
    UnmergeSrc = MIRBuilder.buildMerge(LCMTy, MergeParts).getReg(0);
  }

  // Unmerge to the original register and pad with dead defs.
  SmallVector<Register, 8> UnmergeResults(NumUnmergeParts);
  UnmergeResults[0] = OrigReg;
  for (int I = 1; I != NumUnmergeParts; ++I)
    UnmergeResults[I] = MRI.createGenericVirtualRegister(OrigTy);

  MIRBuilder.buildUnmerge(UnmergeResults, UnmergeSrc);
  return WideReg;
}
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx,
                                          LLT WideTy) {
  if (TypeIdx != 0)
    return UnableToLegalize;

  int NumDst = MI.getNumOperands() - 1;
  Register SrcReg = MI.getOperand(NumDst).getReg();
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy.isVector())
    return UnableToLegalize;

  Register Dst0Reg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst0Reg);
  if (!DstTy.isScalar())
    return UnableToLegalize;

  if (WideTy.getSizeInBits() >= SrcTy.getSizeInBits()) {
    if (SrcTy.isPointer()) {
      const DataLayout &DL = MIRBuilder.getDataLayout();
      if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace())) {
        LLVM_DEBUG(
            dbgs() << "Not casting non-integral address space integer\n");
        return UnableToLegalize;
      }

      SrcTy = LLT::scalar(SrcTy.getSizeInBits());
      SrcReg = MIRBuilder.buildPtrToInt(SrcTy, SrcReg).getReg(0);
    }

    // Widen SrcTy to WideTy. This does not affect the result, but since the
    // user requested this size, it is probably better handled than SrcTy and
    // should reduce the total number of legalization artifacts.
    if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
      SrcTy = WideTy;
      SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
    }

    // There's no unmerge type to target. Directly extract the bits from the
    // source type.
    unsigned DstSize = DstTy.getSizeInBits();

    MIRBuilder.buildTrunc(Dst0Reg, SrcReg);
    for (int I = 1; I != NumDst; ++I) {
      auto ShiftAmt = MIRBuilder.buildConstant(SrcTy, DstSize * I);
      auto Shr = MIRBuilder.buildLShr(SrcTy, SrcReg, ShiftAmt);
      MIRBuilder.buildTrunc(MI.getOperand(I), Shr);
    }

    MI.eraseFromParent();
    return Legalized;
  }

  // Extend the source to a wider type.
  LLT LCMTy = getLCMType(SrcTy, WideTy);

  Register WideSrc = SrcReg;
  if (LCMTy.getSizeInBits() != SrcTy.getSizeInBits()) {
    // TODO: If this is an integral address space, cast to integer and anyext.
    if (SrcTy.isPointer()) {
      LLVM_DEBUG(dbgs() << "Widening pointer source types not implemented\n");
      return UnableToLegalize;
    }

    WideSrc = MIRBuilder.buildAnyExt(LCMTy, WideSrc).getReg(0);
  }

  auto Unmerge = MIRBuilder.buildUnmerge(WideTy, WideSrc);

  // Create a sequence of unmerges and merges to the original results. Since we
  // may have widened the source, we will need to pad the results with dead
  // defs to cover the source register.
  // e.g. widen s48 to s64:
  // %1:_(s48), %2:_(s48) = G_UNMERGE_VALUES %0:_(s96)
  //
  // =>
  //  %4:_(s192) = G_ANYEXT %0:_(s96)
  //  %5:_(s64), %6, %7 = G_UNMERGE_VALUES %4 ; Requested unmerge
  //  ; unpack to GCD type, with extra dead defs
  //  %8:_(s16), %9, %10, %11 = G_UNMERGE_VALUES %5:_(s64)
  //  %12:_(s16), %13, dead %14, dead %15 = G_UNMERGE_VALUES %6:_(s64)
  //  dead %16:_(s16), dead %17, dead %18, dead %19 = G_UNMERGE_VALUES %7:_(s64)
  //  %1:_(s48) = G_MERGE_VALUES %8:_(s16), %9, %10   ; Remerge to destination
  //  %2:_(s48) = G_MERGE_VALUES %11:_(s16), %12, %13 ; Remerge to destination
  const LLT GCDTy = getGCDType(WideTy, DstTy);
  const int NumUnmerge = Unmerge->getNumOperands() - 1;
  const int PartsPerRemerge = DstTy.getSizeInBits() / GCDTy.getSizeInBits();

  // Directly unmerge to the destination without going through a GCD type
  // if possible.
  if (PartsPerRemerge == 1) {
    const int PartsPerUnmerge = WideTy.getSizeInBits() / DstTy.getSizeInBits();

    for (int I = 0; I != NumUnmerge; ++I) {
      auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);

      for (int J = 0; J != PartsPerUnmerge; ++J) {
        int Idx = I * PartsPerUnmerge + J;
        if (Idx < NumDst)
          MIB.addDef(MI.getOperand(Idx).getReg());
        else {
          // Create dead def for excess components.
          MIB.addDef(MRI.createGenericVirtualRegister(DstTy));
        }
      }

      MIB.addUse(Unmerge.getReg(I));
    }
  } else {
    SmallVector<Register, 16> Parts;
    for (int J = 0; J != NumUnmerge; ++J)
      extractGCDType(Parts, GCDTy, Unmerge.getReg(J));

    SmallVector<Register, 8> RemergeParts;
    for (int I = 0; I != NumDst; ++I) {
      for (int J = 0; J < PartsPerRemerge; ++J) {
        const int Idx = I * PartsPerRemerge + J;
        RemergeParts.emplace_back(Parts[Idx]);
      }

      MIRBuilder.buildMerge(MI.getOperand(I).getReg(), RemergeParts);
      RemergeParts.clear();
    }
  }

  MI.eraseFromParent();
  return Legalized;
}
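// Widen a G_EXTRACT. For TypeIdx 0 the extracted bits are produced with a
// shift and truncate in the (possibly widened) source type; for TypeIdx 1 the
// source is any-extended, rescaling the offset immediate when extracting an
// element from a vector source.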
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarExtract(MachineInstr &MI, unsigned TypeIdx,
                                    LLT WideTy) {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT SrcTy = MRI.getType(SrcReg);

  LLT DstTy = MRI.getType(DstReg);
  unsigned Offset = MI.getOperand(2).getImm();

  if (TypeIdx == 0) {
    if (SrcTy.isVector() || DstTy.isVector())
      return UnableToLegalize;

    SrcOp Src(SrcReg);
    if (SrcTy.isPointer()) {
      // Extracts from pointers can be handled only if they are really just
      // simple integers.
      const DataLayout &DL = MIRBuilder.getDataLayout();
      if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace()))
        return UnableToLegalize;

      LLT SrcAsIntTy = LLT::scalar(SrcTy.getSizeInBits());
      Src = MIRBuilder.buildPtrToInt(SrcAsIntTy, Src);
      SrcTy = SrcAsIntTy;
    }

    if (DstTy.isPointer())
      return UnableToLegalize;

    if (Offset == 0) {
      // Avoid a shift in the degenerate case.
      MIRBuilder.buildTrunc(DstReg,
                            MIRBuilder.buildAnyExtOrTrunc(WideTy, Src));
      MI.eraseFromParent();
      return Legalized;
    }

    // Do a shift in the source type.
    LLT ShiftTy = SrcTy;
    if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
      Src = MIRBuilder.buildAnyExt(WideTy, Src);
      ShiftTy = WideTy;
    }

    auto LShr = MIRBuilder.buildLShr(
        ShiftTy, Src, MIRBuilder.buildConstant(ShiftTy, Offset));
    MIRBuilder.buildTrunc(DstReg, LShr);
    MI.eraseFromParent();
    return Legalized;
  }

  if (SrcTy.isScalar()) {
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
    Observer.changedInstr(MI);
    return Legalized;
  }

  if (!SrcTy.isVector())
    return UnableToLegalize;

  if (DstTy != SrcTy.getElementType())
    return UnableToLegalize;

  if (Offset % SrcTy.getScalarSizeInBits() != 0)
    return UnableToLegalize;

  Observer.changingInstr(MI);
  widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);

  MI.getOperand(2).setImm((WideTy.getSizeInBits() / SrcTy.getSizeInBits()) *
                          Offset);
  widenScalarDst(MI, WideTy.getScalarType(), 0);
  Observer.changedInstr(MI);
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarInsert(MachineInstr &MI, unsigned TypeIdx,
                                   LLT WideTy) {
  if (TypeIdx != 0 || WideTy.isVector())
    return UnableToLegalize;
  Observer.changingInstr(MI);
  widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
  widenScalarDst(MI, WideTy);
  Observer.changedInstr(MI);
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarAddSubOverflow(MachineInstr &MI, unsigned TypeIdx,
                                           LLT WideTy) {
  if (TypeIdx == 1)
    return UnableToLegalize; // TODO

  unsigned Opcode;
  unsigned ExtOpcode;
  Optional<Register> CarryIn = None;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode!");
  case TargetOpcode::G_SADDO:
    Opcode = TargetOpcode::G_ADD;
    ExtOpcode = TargetOpcode::G_SEXT;
    break;
  case TargetOpcode::G_SSUBO:
    Opcode = TargetOpcode::G_SUB;
    ExtOpcode = TargetOpcode::G_SEXT;
    break;
  case TargetOpcode::G_UADDO:
    Opcode = TargetOpcode::G_ADD;
    ExtOpcode = TargetOpcode::G_ZEXT;
    break;
  case TargetOpcode::G_USUBO:
    Opcode = TargetOpcode::G_SUB;
    ExtOpcode = TargetOpcode::G_ZEXT;
    break;
  case TargetOpcode::G_SADDE:
    Opcode = TargetOpcode::G_UADDE;
    ExtOpcode = TargetOpcode::G_SEXT;
    CarryIn = MI.getOperand(4).getReg();
    break;
  case TargetOpcode::G_SSUBE:
    Opcode = TargetOpcode::G_USUBE;
    ExtOpcode = TargetOpcode::G_SEXT;
    CarryIn = MI.getOperand(4).getReg();
    break;
  case TargetOpcode::G_UADDE:
    Opcode = TargetOpcode::G_UADDE;
    ExtOpcode = TargetOpcode::G_ZEXT;
    CarryIn = MI.getOperand(4).getReg();
    break;
  case TargetOpcode::G_USUBE:
    Opcode = TargetOpcode::G_USUBE;
    ExtOpcode = TargetOpcode::G_ZEXT;
    CarryIn = MI.getOperand(4).getReg();
    break;
  }
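  // Widen both operands with the extension matching the overflow sense:
  // sign-extend for the signed variants, zero-extend for the unsigned ones.
  // Overflow is then detected by checking whether the wide result survives a
  // round-trip through a truncate and re-extension.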
  auto LHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(2)});
  auto RHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(3)});
  // Do the arithmetic in the larger type.
  Register NewOp;
  if (CarryIn) {
    LLT CarryOutTy = MRI.getType(MI.getOperand(1).getReg());
    NewOp = MIRBuilder
                .buildInstr(Opcode, {WideTy, CarryOutTy},
                            {LHSExt, RHSExt, *CarryIn})
                .getReg(0);
  } else {
    NewOp = MIRBuilder.buildInstr(Opcode, {WideTy}, {LHSExt, RHSExt}).getReg(0);
  }
  LLT OrigTy = MRI.getType(MI.getOperand(0).getReg());
  auto TruncOp = MIRBuilder.buildTrunc(OrigTy, NewOp);
  auto ExtOp = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {TruncOp});
  // There is no overflow if the ExtOp is the same as NewOp.
  MIRBuilder.buildICmp(CmpInst::ICMP_NE, MI.getOperand(1), NewOp, ExtOp);
  // Now trunc the NewOp to the original result.
  MIRBuilder.buildTrunc(MI.getOperand(0), NewOp);
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarAddSubShlSat(MachineInstr &MI, unsigned TypeIdx,
                                         LLT WideTy) {
  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SADDSAT ||
                  MI.getOpcode() == TargetOpcode::G_SSUBSAT ||
                  MI.getOpcode() == TargetOpcode::G_SSHLSAT;
  bool IsShift = MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
                 MI.getOpcode() == TargetOpcode::G_USHLSAT;
  // We can convert this to:
  // 1. Any extend iN to iM
  // 2. SHL by M-N
  // 3. [US][ADD|SUB|SHL]SAT
  // 4. L/ASHR by M-N
  //
  // It may be more efficient to lower this to a min and a max operation in
  // the higher precision arithmetic if the promoted operation isn't legal,
  // but this decision is up to the target's lowering request.
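  // For example, when widening s8 G_SADDSAT to s16: both operands are shifted
  // into the high byte, the s16 saturation points then line up with the s8
  // ones, and the result is shifted back down before truncating.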
  Register DstReg = MI.getOperand(0).getReg();

  unsigned NewBits = WideTy.getScalarSizeInBits();
  unsigned SHLAmount = NewBits - MRI.getType(DstReg).getScalarSizeInBits();

  // Shifts must zero-extend the RHS to preserve the unsigned quantity, and
  // must not left shift the RHS to preserve the shift amount.
  auto LHS = MIRBuilder.buildAnyExt(WideTy, MI.getOperand(1));
  auto RHS = IsShift ? MIRBuilder.buildZExt(WideTy, MI.getOperand(2))
                     : MIRBuilder.buildAnyExt(WideTy, MI.getOperand(2));
  auto ShiftK = MIRBuilder.buildConstant(WideTy, SHLAmount);
  auto ShiftL = MIRBuilder.buildShl(WideTy, LHS, ShiftK);
  auto ShiftR = IsShift ? RHS : MIRBuilder.buildShl(WideTy, RHS, ShiftK);

  auto WideInst = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy},
                                        {ShiftL, ShiftR}, MI.getFlags());

  // Use a shift that will preserve the number of sign bits when the trunc is
  // folded away.
  auto Result = IsSigned ? MIRBuilder.buildAShr(WideTy, WideInst, ShiftK)
                         : MIRBuilder.buildLShr(WideTy, WideInst, ShiftK);

  MIRBuilder.buildTrunc(DstReg, Result);
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarMulo(MachineInstr &MI, unsigned TypeIdx,
                                 LLT WideTy) {
  if (TypeIdx == 1)
    return UnableToLegalize;

  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULO;
  Register Result = MI.getOperand(0).getReg();
  Register OriginalOverflow = MI.getOperand(1).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  LLT SrcTy = MRI.getType(LHS);
  LLT OverflowTy = MRI.getType(OriginalOverflow);
  unsigned SrcBitWidth = SrcTy.getScalarSizeInBits();

  // To determine if the result overflowed in the larger type, we extend the
  // input to the larger type, do the multiply (checking if it overflows),
  // then also check the high bits of the result to see if overflow happened
  // there.
  unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
  auto LeftOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {LHS});
  auto RightOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {RHS});

  auto Mulo = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy, OverflowTy},
                                    {LeftOperand, RightOperand});
  auto Mul = Mulo->getOperand(0);
  MIRBuilder.buildTrunc(Result, Mul);

  MachineInstrBuilder ExtResult;
  // Overflow occurred if it occurred in the larger type, or if the high part
  // of the result does not zero/sign-extend the low part. Check this second
  // possibility first.
  if (IsSigned) {
    // For signed, overflow occurred when the high part does not sign-extend
    // the low part.
    ExtResult = MIRBuilder.buildSExtInReg(WideTy, Mul, SrcBitWidth);
  } else {
    // Unsigned overflow occurred when the high part does not zero-extend the
    // low part.
    ExtResult = MIRBuilder.buildZExtInReg(WideTy, Mul, SrcBitWidth);
  }

  // Multiplication cannot overflow if the WideTy is >= 2 * original width,
  // so we don't need to check the overflow result of larger type Mulo.
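  // For example, an s8 G_UMULO widened to s32 cannot overflow in the wide
  // multiply, since the full product needs at most 16 bits; only the
  // high-bits check below is required in that case.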
  if (WideTy.getScalarSizeInBits() < 2 * SrcBitWidth) {
    auto Overflow =
        MIRBuilder.buildICmp(CmpInst::ICMP_NE, OverflowTy, Mul, ExtResult);
    // Finally check if the multiplication in the larger type itself
    // overflowed.
    MIRBuilder.buildOr(OriginalOverflow, Mulo->getOperand(1), Overflow);
  } else {
    MIRBuilder.buildICmp(CmpInst::ICMP_NE, OriginalOverflow, Mul, ExtResult);
  }
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
  switch (MI.getOpcode()) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_ATOMICRMW_XCHG:
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB:
  case TargetOpcode::G_ATOMICRMW_AND:
  case TargetOpcode::G_ATOMICRMW_OR:
  case TargetOpcode::G_ATOMICRMW_XOR:
  case TargetOpcode::G_ATOMICRMW_MIN:
  case TargetOpcode::G_ATOMICRMW_MAX:
  case TargetOpcode::G_ATOMICRMW_UMIN:
  case TargetOpcode::G_ATOMICRMW_UMAX:
    assert(TypeIdx == 0 && "atomicrmw with second scalar type");
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
    widenScalarDst(MI, WideTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    assert(TypeIdx == 0 && "G_ATOMIC_CMPXCHG with second scalar type");
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
    widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
    widenScalarDst(MI, WideTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS:
    if (TypeIdx == 0) {
      Observer.changingInstr(MI);
      widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
      widenScalarSrc(MI, WideTy, 4, TargetOpcode::G_ANYEXT);
      widenScalarDst(MI, WideTy, 0);
      Observer.changedInstr(MI);
      return Legalized;
    }
    assert(TypeIdx == 1 &&
           "G_ATOMIC_CMPXCHG_WITH_SUCCESS with third scalar type");
    Observer.changingInstr(MI);
    widenScalarDst(MI, WideTy, 1);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_EXTRACT:
    return widenScalarExtract(MI, TypeIdx, WideTy);
  case TargetOpcode::G_INSERT:
    return widenScalarInsert(MI, TypeIdx, WideTy);
  case TargetOpcode::G_MERGE_VALUES:
    return widenScalarMergeValues(MI, TypeIdx, WideTy);
  case TargetOpcode::G_UNMERGE_VALUES:
    return widenScalarUnmergeValues(MI, TypeIdx, WideTy);
  case TargetOpcode::G_SADDO:
  case TargetOpcode::G_SSUBO:
  case TargetOpcode::G_UADDO:
  case TargetOpcode::G_USUBO:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_SSUBE:
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
    return widenScalarAddSubOverflow(MI, TypeIdx, WideTy);
  case TargetOpcode::G_UMULO:
  case TargetOpcode::G_SMULO:
    return widenScalarMulo(MI, TypeIdx, WideTy);
  case TargetOpcode::G_SADDSAT:
  case TargetOpcode::G_SSUBSAT:
  case TargetOpcode::G_SSHLSAT:
  case TargetOpcode::G_UADDSAT:
  case TargetOpcode::G_USUBSAT:
  case TargetOpcode::G_USHLSAT:
    return widenScalarAddSubShlSat(MI, TypeIdx, WideTy);
  case TargetOpcode::G_CTTZ:
  case TargetOpcode::G_CTTZ_ZERO_UNDEF:
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
  case TargetOpcode::G_CTPOP: {
    if (TypeIdx == 0) {
      Observer.changingInstr(MI);
      widenScalarDst(MI, WideTy, 0);
      Observer.changedInstr(MI);
      return Legalized;
    }

    Register SrcReg = MI.getOperand(1).getReg();

    // First extend the input.
    unsigned ExtOpc = MI.getOpcode() == TargetOpcode::G_CTTZ ||
                              MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF
                          ? TargetOpcode::G_ANYEXT
                          : TargetOpcode::G_ZEXT;
    auto MIBSrc = MIRBuilder.buildInstr(ExtOpc, {WideTy}, {SrcReg});
    LLT CurTy = MRI.getType(SrcReg);
    unsigned NewOpc = MI.getOpcode();
    if (NewOpc == TargetOpcode::G_CTTZ) {
      // The count is the same in the larger type except if the original
      // value was zero. This can be handled by setting the bit just off
      // the top of the original type.
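      // For example, when widening s8 G_CTTZ to s32: OR-ing in bit 8 leaves
      // the low 8 bits unchanged and caps the count at 8, which is also the
      // defined result for a zero s8 input.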
      auto TopBit =
          APInt::getOneBitSet(WideTy.getSizeInBits(), CurTy.getSizeInBits());
      MIBSrc = MIRBuilder.buildOr(
          WideTy, MIBSrc, MIRBuilder.buildConstant(WideTy, TopBit));
      // Now we know the operand is non-zero, use the more relaxed opcode.
      NewOpc = TargetOpcode::G_CTTZ_ZERO_UNDEF;
    }

    // Perform the operation at the larger size.
    auto MIBNewOp = MIRBuilder.buildInstr(NewOpc, {WideTy}, {MIBSrc});
    // This is already the correct result for CTPOP and CTTZs.
    if (MI.getOpcode() == TargetOpcode::G_CTLZ ||
        MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF) {
      // The correct result is NewOp - (Difference in widety and current ty).
      unsigned SizeDiff = WideTy.getSizeInBits() - CurTy.getSizeInBits();
      MIBNewOp = MIRBuilder.buildSub(
          WideTy, MIBNewOp, MIRBuilder.buildConstant(WideTy, SizeDiff));
    }

    MIRBuilder.buildZExtOrTrunc(MI.getOperand(0), MIBNewOp);
    MI.eraseFromParent();
    return Legalized;
  }
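  // For G_BSWAP, the swap is done in the wide type, which leaves the
  // interesting bytes in the high part of the wide result, so shift them back
  // down by the width difference before truncating.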
  case TargetOpcode::G_BSWAP: {
    Observer.changingInstr(MI);
    Register DstReg = MI.getOperand(0).getReg();

    Register ShrReg = MRI.createGenericVirtualRegister(WideTy);
    Register DstExt = MRI.createGenericVirtualRegister(WideTy);
    Register ShiftAmtReg = MRI.createGenericVirtualRegister(WideTy);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);

    MI.getOperand(0).setReg(DstExt);

    MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());

    LLT Ty = MRI.getType(DstReg);
    unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
    MIRBuilder.buildConstant(ShiftAmtReg, DiffBits);
    MIRBuilder.buildLShr(ShrReg, DstExt, ShiftAmtReg);

    MIRBuilder.buildTrunc(DstReg, ShrReg);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_BITREVERSE: {
    Observer.changingInstr(MI);

    Register DstReg = MI.getOperand(0).getReg();
    LLT Ty = MRI.getType(DstReg);
    unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();

    Register DstExt = MRI.createGenericVirtualRegister(WideTy);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
    MI.getOperand(0).setReg(DstExt);
    MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());

    auto ShiftAmt = MIRBuilder.buildConstant(WideTy, DiffBits);
    auto Shift = MIRBuilder.buildLShr(WideTy, DstExt, ShiftAmt);
    MIRBuilder.buildTrunc(DstReg, Shift);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_FREEZE:
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
    widenScalarDst(MI, WideTy);
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_ABS:
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
    widenScalarDst(MI, WideTy);
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_ADD:
  case TargetOpcode::G_AND:
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR:
  case TargetOpcode::G_SUB:
    // Perform operation at larger width (any extension is fine here, high
    // bits don't affect the result) and then truncate the result back to the
    // original type.
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
    widenScalarDst(MI, WideTy);
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_SBFX:
  case TargetOpcode::G_UBFX:
    Observer.changingInstr(MI);

    if (TypeIdx == 0) {
      widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
      widenScalarDst(MI, WideTy);
    } else {
      widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
      widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT);
    }

    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_SHL:
    Observer.changingInstr(MI);

    if (TypeIdx == 0) {
      widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
      widenScalarDst(MI, WideTy);
    } else {
      assert(TypeIdx == 1);
      // The "number of bits to shift" operand must preserve its value as an
      // unsigned integer:
      widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
    }

    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_SMIN:
  case TargetOpcode::G_SMAX:
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
    widenScalarDst(MI, WideTy);
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_SDIVREM:
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
    widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT);
    widenScalarDst(MI, WideTy);
    widenScalarDst(MI, WideTy, 1);
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_ASHR:
  case TargetOpcode::G_LSHR:
    Observer.changingInstr(MI);

    if (TypeIdx == 0) {
      unsigned CvtOp = MI.getOpcode() == TargetOpcode::G_ASHR
                           ? TargetOpcode::G_SEXT
                           : TargetOpcode::G_ZEXT;

      widenScalarSrc(MI, WideTy, 1, CvtOp);
      widenScalarDst(MI, WideTy);
    } else {
      assert(TypeIdx == 1);
      // The "number of bits to shift" operand must preserve its value as an
      // unsigned integer:
      widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
    }

    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_UMIN:
  case TargetOpcode::G_UMAX:
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
    widenScalarDst(MI, WideTy);
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_UDIVREM:
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
    widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT);
    widenScalarDst(MI, WideTy);
    widenScalarDst(MI, WideTy, 1);
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_SELECT:
    Observer.changingInstr(MI);
    if (TypeIdx == 0) {
      // Perform operation at larger width (any extension is fine here, high
      // bits don't affect the result) and then truncate the result back to the
      // original type.
      widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
      widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
      widenScalarDst(MI, WideTy);
    } else {
      bool IsVec = MRI.getType(MI.getOperand(1).getReg()).isVector();
      // Explicit extension is required here since high bits affect the result.
      widenScalarSrc(MI, WideTy, 1, MIRBuilder.getBoolExtOp(IsVec, false));
    }
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI:
    Observer.changingInstr(MI);

    if (TypeIdx == 0)
      widenScalarDst(MI, WideTy);
    else
      widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);

    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_SITOFP:
    Observer.changingInstr(MI);

    if (TypeIdx == 0)
      widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
    else
      widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);

    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_UITOFP:
    Observer.changingInstr(MI);

    if (TypeIdx == 0)
      widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
    else
      widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);

    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
    Observer.changingInstr(MI);
    widenScalarDst(MI, WideTy);
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_STORE: {
    if (TypeIdx != 0)
      return UnableToLegalize;

    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
    if (!Ty.isScalar())
      return UnableToLegalize;

    Observer.changingInstr(MI);

    unsigned ExtType = Ty.getScalarSizeInBits() == 1 ?
        TargetOpcode::G_ZEXT : TargetOpcode::G_ANYEXT;
    widenScalarSrc(MI, WideTy, 0, ExtType);

    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_CONSTANT: {
    MachineOperand &SrcMO = MI.getOperand(1);
    LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
    unsigned ExtOpc = LI.getExtOpcodeForWideningConstant(
        MRI.getType(MI.getOperand(0).getReg()));
    assert((ExtOpc == TargetOpcode::G_ZEXT || ExtOpc == TargetOpcode::G_SEXT ||
            ExtOpc == TargetOpcode::G_ANYEXT) &&
           "Illegal Extend");
    const APInt &SrcVal = SrcMO.getCImm()->getValue();
    const APInt &Val = (ExtOpc == TargetOpcode::G_SEXT)
                           ? SrcVal.sext(WideTy.getSizeInBits())
                           : SrcVal.zext(WideTy.getSizeInBits());
    Observer.changingInstr(MI);
    SrcMO.setCImm(ConstantInt::get(Ctx, Val));

    widenScalarDst(MI, WideTy);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_FCONSTANT: {
    MachineOperand &SrcMO = MI.getOperand(1);
    LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
    APFloat Val = SrcMO.getFPImm()->getValueAPF();
    bool LosesInfo;
    switch (WideTy.getSizeInBits()) {
    case 32:
      Val.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven,
                  &LosesInfo);
      break;
    case 64:
      Val.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
                  &LosesInfo);
      break;
    default:
      return UnableToLegalize;
    }

    assert(!LosesInfo && "extend should always be lossless");

    Observer.changingInstr(MI);
    SrcMO.setFPImm(ConstantFP::get(Ctx, Val));

    widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_IMPLICIT_DEF: {
    Observer.changingInstr(MI);
    widenScalarDst(MI, WideTy);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_BRCOND:
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 0, MIRBuilder.getBoolExtOp(false, false));
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_FCMP:
    Observer.changingInstr(MI);
    if (TypeIdx == 0)
      widenScalarDst(MI, WideTy);
    else {
      widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_FPEXT);
      widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_FPEXT);
    }
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_ICMP:
    Observer.changingInstr(MI);
    if (TypeIdx == 0)
      widenScalarDst(MI, WideTy);
    else {
      unsigned ExtOpcode = CmpInst::isSigned(static_cast<CmpInst::Predicate>(
                               MI.getOperand(1).getPredicate()))
                               ? TargetOpcode::G_SEXT
                               : TargetOpcode::G_ZEXT;
      widenScalarSrc(MI, WideTy, 2, ExtOpcode);
      widenScalarSrc(MI, WideTy, 3, ExtOpcode);
    }
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_PTR_ADD:
    assert(TypeIdx == 1 && "unable to legalize pointer of G_PTR_ADD");
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
    Observer.changedInstr(MI);
    return Legalized;
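  // For G_PHI, each incoming value is extended in its predecessor block,
  // before that block's terminator, so the wide value dominates the phi; the
  // phi itself is then rewritten in the wide type and truncated back in this
  // block.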
  case TargetOpcode::G_PHI: {
    assert(TypeIdx == 0 && "Expecting only Idx 0");

    Observer.changingInstr(MI);
    for (unsigned I = 1; I < MI.getNumOperands(); I += 2) {
      MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
      MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
      widenScalarSrc(MI, WideTy, I, TargetOpcode::G_ANYEXT);
    }

    MachineBasicBlock &MBB = *MI.getParent();
    MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
    widenScalarDst(MI, WideTy);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_EXTRACT_VECTOR_ELT: {
    if (TypeIdx == 0) {
      Register VecReg = MI.getOperand(1).getReg();
      LLT VecTy = MRI.getType(VecReg);
      Observer.changingInstr(MI);

      widenScalarSrc(
          MI, LLT::vector(VecTy.getElementCount(), WideTy.getSizeInBits()), 1,
          TargetOpcode::G_ANYEXT);

      widenScalarDst(MI, WideTy, 0);
      Observer.changedInstr(MI);
      return Legalized;
    }

    if (TypeIdx != 2)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    // TODO: Probably should be zext
    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_INSERT_VECTOR_ELT: {
    if (TypeIdx == 1) {
      Observer.changingInstr(MI);

      Register VecReg = MI.getOperand(1).getReg();
      LLT VecTy = MRI.getType(VecReg);
      LLT WideVecTy = LLT::vector(VecTy.getElementCount(), WideTy);

      widenScalarSrc(MI, WideVecTy, 1, TargetOpcode::G_ANYEXT);
      widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
      widenScalarDst(MI, WideVecTy, 0);
      Observer.changedInstr(MI);
      return Legalized;
    }

    if (TypeIdx == 2) {
      Observer.changingInstr(MI);
      // TODO: Probably should be zext
      widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT);
      Observer.changedInstr(MI);
      return Legalized;
    }

    return UnableToLegalize;
  }
  case TargetOpcode::G_FADD:
  case TargetOpcode::G_FMUL:
  case TargetOpcode::G_FSUB:
  case TargetOpcode::G_FMA:
  case TargetOpcode::G_FMAD:
  case TargetOpcode::G_FNEG:
  case TargetOpcode::G_FABS:
  case TargetOpcode::G_FCANONICALIZE:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
  case TargetOpcode::G_FMINIMUM:
  case TargetOpcode::G_FMAXIMUM:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_FREM:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_FCOS:
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FLOG10:
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG2:
  case TargetOpcode::G_FRINT:
  case TargetOpcode::G_FNEARBYINT:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP2:
  case TargetOpcode::G_FPOW:
  case TargetOpcode::G_INTRINSIC_TRUNC:
  case TargetOpcode::G_INTRINSIC_ROUND:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
    assert(TypeIdx == 0);
    Observer.changingInstr(MI);

    for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
      widenScalarSrc(MI, WideTy, I, TargetOpcode::G_FPEXT);

    widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_FPOWI: {
    if (TypeIdx != 0)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);
    widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_INTTOPTR:
    if (TypeIdx != 1)
      return UnableToLegalize;

    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_PTRTOINT:
    if (TypeIdx != 0)
      return UnableToLegalize;

    Observer.changingInstr(MI);
    widenScalarDst(MI, WideTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_BUILD_VECTOR: {
    Observer.changingInstr(MI);

    const LLT WideEltTy = TypeIdx == 1 ? WideTy : WideTy.getElementType();
    for (int I = 1, E = MI.getNumOperands(); I != E; ++I)
      widenScalarSrc(MI, WideEltTy, I, TargetOpcode::G_ANYEXT);

    // Avoid changing the result vector type if the source element type was
    // requested.
    if (TypeIdx == 1) {
      MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BUILD_VECTOR_TRUNC));
    } else {
      widenScalarDst(MI, WideTy, 0);
    }

    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_SEXT_INREG:
    if (TypeIdx != 0)
      return UnableToLegalize;

    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
    widenScalarDst(MI, WideTy, 0, TargetOpcode::G_TRUNC);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_PTRMASK: {
    if (TypeIdx != 1)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
    Observer.changedInstr(MI);
    return Legalized;
  }
  }
}
static void getUnmergePieces(SmallVectorImpl<Register> &Pieces,
                             MachineIRBuilder &B, Register Src, LLT Ty) {
  auto Unmerge = B.buildUnmerge(Ty, Src);
  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
    Pieces.push_back(Unmerge.getReg(I));
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerBitcast(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  if (SrcTy.isVector()) {
    LLT SrcEltTy = SrcTy.getElementType();
    SmallVector<Register, 8> SrcRegs;

    if (DstTy.isVector()) {
      int NumDstElt = DstTy.getNumElements();
      int NumSrcElt = SrcTy.getNumElements();

      LLT DstEltTy = DstTy.getElementType();
      LLT DstCastTy = DstEltTy; // Intermediate bitcast result type
      LLT SrcPartTy = SrcEltTy; // Original unmerge result type.

      // If there's an element size mismatch, insert intermediate casts to
      // match the result element type.
      if (NumSrcElt < NumDstElt) { // Source element type is larger.
        // %1:_(<4 x s8>) = G_BITCAST %0:_(<2 x s16>)
        //
        // =>
        //
        // %2:_(s16), %3:_(s16) = G_UNMERGE_VALUES %0
        // %4:_(<2 x s8>) = G_BITCAST %2
        // %5:_(<2 x s8>) = G_BITCAST %3
        // %1:_(<4 x s8>) = G_CONCAT_VECTORS %4, %5
        DstCastTy = LLT::fixed_vector(NumDstElt / NumSrcElt, DstEltTy);
        SrcPartTy = SrcEltTy;
      } else if (NumSrcElt > NumDstElt) { // Source element type is smaller.
        //
        // %1:_(<2 x s16>) = G_BITCAST %0:_(<4 x s8>)
        //
        // =>
        //
        // %2:_(<2 x s8>), %3:_(<2 x s8>) = G_UNMERGE_VALUES %0
        // %4:_(s16) = G_BITCAST %2
        // %5:_(s16) = G_BITCAST %3
        // %1:_(<2 x s16>) = G_BUILD_VECTOR %4, %5
        SrcPartTy = LLT::fixed_vector(NumSrcElt / NumDstElt, SrcEltTy);
        DstCastTy = DstEltTy;
      }

      getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcPartTy);
      for (Register &SrcReg : SrcRegs)
        SrcReg = MIRBuilder.buildBitcast(DstCastTy, SrcReg).getReg(0);
    } else
      getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcEltTy);

    MIRBuilder.buildMerge(Dst, SrcRegs);
    MI.eraseFromParent();
    return Legalized;
  }

  if (DstTy.isVector()) {
    SmallVector<Register, 8> SrcRegs;
    getUnmergePieces(SrcRegs, MIRBuilder, Src, DstTy.getElementType());
    MIRBuilder.buildMerge(Dst, SrcRegs);
    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}

/// Figure out the bit offset into a register when coercing a vector index for
/// the wide element type. This is only for the case when promoting vector to
/// one with larger elements.
///
/// %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
/// %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
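/// For example, with NewEltSize = 32 and OldEltSize = 8, Log2EltRatio is 2,
/// so %offset_bits = (%idx & 3) * 8.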
static Register getBitcastWiderVectorElementOffset(MachineIRBuilder &B,
                                                   Register Idx,
                                                   unsigned NewEltSize,
                                                   unsigned OldEltSize) {
  const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
  LLT IdxTy = B.getMRI()->getType(Idx);

  // Now figure out the amount we need to shift to get the target bits.
  auto OffsetMask = B.buildConstant(
      IdxTy, ~(APInt::getAllOnes(IdxTy.getSizeInBits()) << Log2EltRatio));
  auto OffsetIdx = B.buildAnd(IdxTy, Idx, OffsetMask);
  return B.buildShl(IdxTy, OffsetIdx,
                    B.buildConstant(IdxTy, Log2_32(OldEltSize))).getReg(0);
}

/// Perform a G_EXTRACT_VECTOR_ELT in a different sized vector element. If
/// this is casting to a vector with a smaller element size, perform multiple
/// element extracts and merge the results. If this is coercing to a vector
/// with larger elements, index the bitcasted vector and extract the target
/// element with bit operations. This is intended to force the indexing in the
/// native register size for architectures that can dynamically index the
/// register file.
LegalizerHelper::LegalizeResult
LegalizerHelper::bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx,
                                         LLT CastTy) {
  if (TypeIdx != 1)
    return UnableToLegalize;

  Register Dst = MI.getOperand(0).getReg();
  Register SrcVec = MI.getOperand(1).getReg();
  Register Idx = MI.getOperand(2).getReg();
  LLT SrcVecTy = MRI.getType(SrcVec);
  LLT IdxTy = MRI.getType(Idx);

  LLT SrcEltTy = SrcVecTy.getElementType();
  unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
  unsigned OldNumElts = SrcVecTy.getNumElements();

  LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
  Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);

  const unsigned NewEltSize = NewEltTy.getSizeInBits();
  const unsigned OldEltSize = SrcEltTy.getSizeInBits();
  if (NewNumElts > OldNumElts) {
    // Decreasing the vector element size
    //
    // e.g. i64 = extract_vector_elt x:v2i64, y:i32
    //  =>
    //  v4i32:castx = bitcast x:v2i64
    //
    // i64 = bitcast
    //   (v2i32 build_vector (i32 (extract_vector_elt castx, (2 * y))),
    //                       (i32 (extract_vector_elt castx, (2 * y + 1))))
    //
    if (NewNumElts % OldNumElts != 0)
      return UnableToLegalize;

    // Type of the intermediate result vector.
    const unsigned NewEltsPerOldElt = NewNumElts / OldNumElts;
    LLT MidTy =
        LLT::scalarOrVector(ElementCount::getFixed(NewEltsPerOldElt), NewEltTy);

    auto NewEltsPerOldEltK = MIRBuilder.buildConstant(IdxTy, NewEltsPerOldElt);

    SmallVector<Register, 8> NewOps(NewEltsPerOldElt);
    auto NewBaseIdx = MIRBuilder.buildMul(IdxTy, Idx, NewEltsPerOldEltK);

    for (unsigned I = 0; I < NewEltsPerOldElt; ++I) {
      auto IdxOffset = MIRBuilder.buildConstant(IdxTy, I);
      auto TmpIdx = MIRBuilder.buildAdd(IdxTy, NewBaseIdx, IdxOffset);
      auto Elt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec, TmpIdx);
      NewOps[I] = Elt.getReg(0);
    }

    auto NewVec = MIRBuilder.buildBuildVector(MidTy, NewOps);
    MIRBuilder.buildBitcast(Dst, NewVec);
    MI.eraseFromParent();
    return Legalized;
  }

  if (NewNumElts < OldNumElts) {
    if (NewEltSize % OldEltSize != 0)
      return UnableToLegalize;

    // This only depends on powers of 2 because we use bit tricks to figure out
    // the bit offset we need to shift to get the target element. A general
    // expansion could emit division/multiply.
    if (!isPowerOf2_32(NewEltSize / OldEltSize))
      return UnableToLegalize;

    // Increasing the vector element size.
    // %elt:_(small_elt) = G_EXTRACT_VECTOR_ELT %vec:_(<N x small_elt>), %idx
    //
    //   =>
    //
    // %cast = G_BITCAST %vec
    // %scaled_idx = G_LSHR %idx, Log2(DstEltSize / SrcEltSize)
    // %wide_elt = G_EXTRACT_VECTOR_ELT %cast, %scaled_idx
    // %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
    // %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
    // %elt_bits = G_LSHR %wide_elt, %offset_bits
    // %elt = G_TRUNC %elt_bits

    const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
    auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);

    // Divide to get the index in the wider element type.
    auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);

    Register WideElt = CastVec;
    if (CastTy.isVector()) {
      WideElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
                                                     ScaledIdx).getReg(0);
    }

    // Compute the bit offset into the register of the target element.
    Register OffsetBits = getBitcastWiderVectorElementOffset(
        MIRBuilder, Idx, NewEltSize, OldEltSize);

    // Shift the wide element to get the target element.
    auto ExtractedBits = MIRBuilder.buildLShr(NewEltTy, WideElt, OffsetBits);
    MIRBuilder.buildTrunc(Dst, ExtractedBits);
    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}
/// Emit code to insert \p InsertReg into \p TargetReg at \p OffsetBits, while
/// preserving the other bits in \p TargetReg.
///
/// The result is:
///   (TargetReg & ~(LowBitsSet(InsertReg.size()) << OffsetBits)) |
///   (ZExt(InsertReg) << OffsetBits)
static Register buildBitFieldInsert(MachineIRBuilder &B,
                                    Register TargetReg, Register InsertReg,
                                    Register OffsetBits) {
  LLT TargetTy = B.getMRI()->getType(TargetReg);
  LLT InsertTy = B.getMRI()->getType(InsertReg);
  auto ZextVal = B.buildZExt(TargetTy, InsertReg);
  auto ShiftedInsertVal = B.buildShl(TargetTy, ZextVal, OffsetBits);

  // Produce a bitmask of the value to insert
  auto EltMask = B.buildConstant(
      TargetTy, APInt::getLowBitsSet(TargetTy.getSizeInBits(),
                                     InsertTy.getSizeInBits()));
  // Shift it into position
  auto ShiftedMask = B.buildShl(TargetTy, EltMask, OffsetBits);
  auto InvShiftedMask = B.buildNot(TargetTy, ShiftedMask);

  // Clear out the bits in the wide element
  auto MaskedOldElt = B.buildAnd(TargetTy, TargetReg, InvShiftedMask);

  // The value to insert has all zeros already, so stick it into the masked
  // wide element.
  return B.buildOr(TargetTy, MaskedOldElt, ShiftedInsertVal).getReg(0);
}

/// Perform a G_INSERT_VECTOR_ELT in a different sized vector element. If this
/// is increasing the element size, perform the indexing in the target element
/// type, and use bit operations to insert at the element position. This is
/// intended for architectures that can dynamically index the register file and
/// want to force indexing in the native register size.
LegalizerHelper::LegalizeResult
LegalizerHelper::bitcastInsertVectorElt(MachineInstr &MI, unsigned TypeIdx,
                                        LLT CastTy) {
  if (TypeIdx != 0)
    return UnableToLegalize;

  Register Dst = MI.getOperand(0).getReg();
  Register SrcVec = MI.getOperand(1).getReg();
  Register Val = MI.getOperand(2).getReg();
  Register Idx = MI.getOperand(3).getReg();

  LLT VecTy = MRI.getType(Dst);
  LLT IdxTy = MRI.getType(Idx);

  LLT VecEltTy = VecTy.getElementType();
  LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
  const unsigned NewEltSize = NewEltTy.getSizeInBits();
  const unsigned OldEltSize = VecEltTy.getSizeInBits();

  unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
  unsigned OldNumElts = VecTy.getNumElements();

  Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);
  if (NewNumElts < OldNumElts) {
    if (NewEltSize % OldEltSize != 0)
      return UnableToLegalize;

    // This only depends on powers of 2 because we use bit tricks to figure out
    // the bit offset we need to shift to get the target element. A general
    // expansion could emit division/multiply.
    if (!isPowerOf2_32(NewEltSize / OldEltSize))
      return UnableToLegalize;

    const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
    auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);

    // Divide to get the index in the wider element type.
    auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);

    Register ExtractedElt = CastVec;
    if (CastTy.isVector()) {
      ExtractedElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
                                                          ScaledIdx).getReg(0);
    }

    // Compute the bit offset into the register of the target element.
    Register OffsetBits = getBitcastWiderVectorElementOffset(
        MIRBuilder, Idx, NewEltSize, OldEltSize);

    Register InsertedElt = buildBitFieldInsert(MIRBuilder, ExtractedElt,
                                               Val, OffsetBits);
    if (CastTy.isVector()) {
      InsertedElt = MIRBuilder.buildInsertVectorElement(
          CastTy, CastVec, InsertedElt, ScaledIdx).getReg(0);
    }

    MIRBuilder.buildBitcast(Dst, InsertedElt);
    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}
LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) {
  // Lower to a memory-width G_LOAD and a G_SEXT/G_ZEXT/G_ANYEXT
  Register DstReg = LoadMI.getDstReg();
  Register PtrReg = LoadMI.getPointerReg();
  LLT DstTy = MRI.getType(DstReg);
  MachineMemOperand &MMO = LoadMI.getMMO();
  LLT MemTy = MMO.getMemoryType();
  MachineFunction &MF = MIRBuilder.getMF();

  unsigned MemSizeInBits = MemTy.getSizeInBits();
  unsigned MemStoreSizeInBits = 8 * MemTy.getSizeInBytes();

  if (MemSizeInBits != MemStoreSizeInBits) {
    if (MemTy.isVector())
      return UnableToLegalize;

    // Promote to a byte-sized load if not loading an integral number of
    // bytes. For example, promote EXTLOAD:i20 -> EXTLOAD:i24.
    LLT WideMemTy = LLT::scalar(MemStoreSizeInBits);
    MachineMemOperand *NewMMO =
        MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideMemTy);

    Register LoadReg = DstReg;
    LLT LoadTy = DstTy;

    // If this wasn't already an extending load, we need to widen the result
    // register to avoid creating a load with a result narrower than the
    // memory type.
    if (MemStoreSizeInBits > DstTy.getSizeInBits()) {
      LoadTy = WideMemTy;
      LoadReg = MRI.createGenericVirtualRegister(WideMemTy);
    }

    if (isa<GSExtLoad>(LoadMI)) {
      auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO);
      MIRBuilder.buildSExtInReg(LoadReg, NewLoad, MemSizeInBits);
    } else if (isa<GZExtLoad>(LoadMI) || WideMemTy == DstTy) {
      auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO);
      // The extra bits are guaranteed to be zero, since we stored them that
      // way. A zext load from Wide thus automatically gives zext from MemVT.
      MIRBuilder.buildAssertZExt(LoadReg, NewLoad, MemSizeInBits);
    } else {
      MIRBuilder.buildLoad(LoadReg, PtrReg, *NewMMO);
    }

    if (DstTy != LoadTy)
      MIRBuilder.buildTrunc(DstReg, LoadReg);

    LoadMI.eraseFromParent();
    return Legalized;
  }

  // Big endian lowering not implemented.
  if (MIRBuilder.getDataLayout().isBigEndian())
    return UnableToLegalize;

  // This load needs splitting into power of 2 sized loads.
  //
  // Our strategy here is to generate anyextending loads for the smaller
  // types up to the next power-of-2 result type, and then combine the two
  // larger result values together, before truncating back down to the
  // non-pow-2 type.
  // E.g. v1 = i24 load =>
  //   v2 = i32 zextload (2 byte)
  //   v3 = i32 load (1 byte)
  //   v4 = i32 shl v3, 16
  //   v5 = i32 or v4, v2
  //   v1 = i24 trunc v5
  // By doing this we generate the correct truncate which should get
  // combined away as an artifact with a matching extend.

  uint64_t LargeSplitSize, SmallSplitSize;

  if (!isPowerOf2_32(MemSizeInBits)) {
    // This load needs splitting into power of 2 sized loads.
    LargeSplitSize = PowerOf2Floor(MemSizeInBits);
    SmallSplitSize = MemSizeInBits - LargeSplitSize;
  } else {
    // This is already a power of 2, but we still need to split this in half.
    //
    // Assume we're being asked to decompose an unaligned load.
    // TODO: If this requires multiple splits, handle them all at once.
    auto &Ctx = MF.getFunction().getContext();
    if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
      return UnableToLegalize;

    SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
  }

  if (MemTy.isVector()) {
    // TODO: Handle vector extloads
    if (MemTy != DstTy)
      return UnableToLegalize;

    // TODO: We can do better than scalarizing the vector and at least split it
    // in half.
    return reduceLoadStoreWidth(LoadMI, 0, DstTy.getElementType());
  }

  MachineMemOperand *LargeMMO =
      MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
  MachineMemOperand *SmallMMO =
      MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);

  LLT PtrTy = MRI.getType(PtrReg);
  unsigned AnyExtSize = PowerOf2Ceil(DstTy.getSizeInBits());
  LLT AnyExtTy = LLT::scalar(AnyExtSize);
  auto LargeLoad = MIRBuilder.buildLoadInstr(TargetOpcode::G_ZEXTLOAD, AnyExtTy,
                                             PtrReg, *LargeMMO);

  auto OffsetCst = MIRBuilder.buildConstant(LLT::scalar(PtrTy.getSizeInBits()),
                                            LargeSplitSize / 8);
  Register PtrAddReg = MRI.createGenericVirtualRegister(PtrTy);
  auto SmallPtr = MIRBuilder.buildPtrAdd(PtrAddReg, PtrReg, OffsetCst);
  auto SmallLoad = MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), AnyExtTy,
                                             SmallPtr, *SmallMMO);

  auto ShiftAmt = MIRBuilder.buildConstant(AnyExtTy, LargeSplitSize);
  auto Shift = MIRBuilder.buildShl(AnyExtTy, SmallLoad, ShiftAmt);

  if (AnyExtTy == DstTy)
    MIRBuilder.buildOr(DstReg, Shift, LargeLoad);
  else if (AnyExtTy.getSizeInBits() != DstTy.getSizeInBits()) {
    auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
    MIRBuilder.buildTrunc(DstReg, {Or});
  } else {
    assert(DstTy.isPointer() && "expected pointer");
    auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);

    // FIXME: We currently consider this to be illegal for non-integral address
    // spaces, but we still need a way to reinterpret the bits.
    MIRBuilder.buildIntToPtr(DstReg, Or);
  }

  LoadMI.eraseFromParent();
  return Legalized;
}
LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
  // Lower a non-power of 2 store into multiple pow-2 stores.
  // E.g. split an i24 store into an i16 store + i8 store.
  // We do this by first extending the stored value to the next largest power
  // of 2 type, and then using truncating stores to store the components.
  // As with G_LOAD, this generates an extend that can be artifact-combined
  // away instead of leaving behind extracts.
  Register SrcReg = StoreMI.getValueReg();
  Register PtrReg = StoreMI.getPointerReg();
  LLT SrcTy = MRI.getType(SrcReg);
  MachineFunction &MF = MIRBuilder.getMF();
  MachineMemOperand &MMO = **StoreMI.memoperands_begin();
  LLT MemTy = MMO.getMemoryType();

  unsigned StoreWidth = MemTy.getSizeInBits();
  unsigned StoreSizeInBits = 8 * MemTy.getSizeInBytes();

  if (StoreWidth != StoreSizeInBits) {
    if (SrcTy.isVector())
      return UnableToLegalize;

    // Promote to a byte-sized store with upper bits zero if not
    // storing an integral number of bytes. For example, promote
    // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1)
    LLT WideTy = LLT::scalar(StoreSizeInBits);

    if (StoreSizeInBits > SrcTy.getSizeInBits()) {
      // Avoid creating a store with a narrower source than result.
      SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
      SrcTy = WideTy;
    }

    auto ZextInReg = MIRBuilder.buildZExtInReg(SrcTy, SrcReg, StoreWidth);

    MachineMemOperand *NewMMO =
        MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideTy);
    MIRBuilder.buildStore(ZextInReg, PtrReg, *NewMMO);
    StoreMI.eraseFromParent();
    return Legalized;
  }

  if (MemTy.isVector()) {
    // TODO: Handle vector trunc stores
    if (MemTy != SrcTy)
      return UnableToLegalize;

    // TODO: We can do better than scalarizing the vector and at least split it
    // in half.
    return reduceLoadStoreWidth(StoreMI, 0, SrcTy.getElementType());
  }

  unsigned MemSizeInBits = MemTy.getSizeInBits();
  uint64_t LargeSplitSize, SmallSplitSize;

  if (!isPowerOf2_32(MemSizeInBits)) {
    LargeSplitSize = PowerOf2Floor(MemTy.getSizeInBits());
    SmallSplitSize = MemTy.getSizeInBits() - LargeSplitSize;
  } else {
    auto &Ctx = MF.getFunction().getContext();
    if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
      return UnableToLegalize; // Don't know what we're being asked to do.

    SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
  }

  // Extend to the next pow-2. If this store was itself the result of lowering,
  // e.g. an s56 store being broken into s32 + s24, we might have a stored type
  // that's wider than the stored size.
  unsigned AnyExtSize = PowerOf2Ceil(MemTy.getSizeInBits());
  const LLT NewSrcTy = LLT::scalar(AnyExtSize);

  if (SrcTy.isPointer()) {
    const LLT IntPtrTy = LLT::scalar(SrcTy.getSizeInBits());
    SrcReg = MIRBuilder.buildPtrToInt(IntPtrTy, SrcReg).getReg(0);
  }

  auto ExtVal = MIRBuilder.buildAnyExtOrTrunc(NewSrcTy, SrcReg);

  // Obtain the smaller value by shifting away the larger value.
  auto ShiftAmt = MIRBuilder.buildConstant(NewSrcTy, LargeSplitSize);
  auto SmallVal = MIRBuilder.buildLShr(NewSrcTy, ExtVal, ShiftAmt);

  // Generate the PtrAdd and truncating stores.
  LLT PtrTy = MRI.getType(PtrReg);
  auto OffsetCst = MIRBuilder.buildConstant(
      LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8);
  auto SmallPtr =
      MIRBuilder.buildPtrAdd(PtrTy, PtrReg, OffsetCst);

  MachineMemOperand *LargeMMO =
      MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
  MachineMemOperand *SmallMMO =
      MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);
  MIRBuilder.buildStore(ExtVal, PtrReg, *LargeMMO);
  MIRBuilder.buildStore(SmallVal, SmallPtr, *SmallMMO);
  StoreMI.eraseFromParent();
  return Legalized;
}
LegalizerHelper::LegalizeResult
LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_LOAD: {
    if (TypeIdx != 0)
      return UnableToLegalize;
    MachineMemOperand &MMO = **MI.memoperands_begin();

    // Not sure how to interpret a bitcast of an extending load.
    if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
      return UnableToLegalize;

    Observer.changingInstr(MI);
    bitcastDst(MI, CastTy, 0);
    MMO.setType(CastTy);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_STORE: {
    if (TypeIdx != 0)
      return UnableToLegalize;

    MachineMemOperand &MMO = **MI.memoperands_begin();

    // Not sure how to interpret a bitcast of a truncating store.
    if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
      return UnableToLegalize;

    Observer.changingInstr(MI);
    bitcastSrc(MI, CastTy, 0);
    MMO.setType(CastTy);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_SELECT: {
    if (TypeIdx != 0)
      return UnableToLegalize;

    if (MRI.getType(MI.getOperand(1).getReg()).isVector()) {
      LLVM_DEBUG(
          dbgs() << "bitcast action not implemented for vector select\n");
      return UnableToLegalize;
    }

    Observer.changingInstr(MI);
    bitcastSrc(MI, CastTy, 2);
    bitcastSrc(MI, CastTy, 3);
    bitcastDst(MI, CastTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    Observer.changingInstr(MI);
    bitcastSrc(MI, CastTy, 1);
    bitcastSrc(MI, CastTy, 2);
    bitcastDst(MI, CastTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return bitcastExtractVectorElt(MI, TypeIdx, CastTy);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return bitcastInsertVectorElt(MI, TypeIdx, CastTy);
  default:
    return UnableToLegalize;
  }
}

// Legalize an instruction by changing the opcode in place.
void LegalizerHelper::changeOpcode(MachineInstr &MI, unsigned NewOpcode) {
  Observer.changingInstr(MI);
  MI.setDesc(MIRBuilder.getTII().get(NewOpcode));
  Observer.changedInstr(MI);
}
LegalizerHelper::LegalizeResult
LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
  using namespace TargetOpcode;

  switch(MI.getOpcode()) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_BITCAST:
    return lowerBitcast(MI);
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_UREM: {
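    // Lower the remainder with the identity rem = x - (x / y) * y, using the
    // division with matching signedness.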
    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
    auto Quot =
        MIRBuilder.buildInstr(MI.getOpcode() == G_SREM ? G_SDIV : G_UDIV, {Ty},
                              {MI.getOperand(1), MI.getOperand(2)});

    auto Prod = MIRBuilder.buildMul(Ty, Quot, MI.getOperand(2));
    MIRBuilder.buildSub(MI.getOperand(0), MI.getOperand(1), Prod);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_SADDO:
  case TargetOpcode::G_SSUBO:
    return lowerSADDO_SSUBO(MI);
  case TargetOpcode::G_UMULH:
  case TargetOpcode::G_SMULH:
    return lowerSMULH_UMULH(MI);
  case TargetOpcode::G_SMULO:
  case TargetOpcode::G_UMULO: {
    // Generate G_UMULH/G_SMULH to check for overflow and a normal G_MUL for
    // the result.
    Register Res = MI.getOperand(0).getReg();
    Register Overflow = MI.getOperand(1).getReg();
    Register LHS = MI.getOperand(2).getReg();
    Register RHS = MI.getOperand(3).getReg();
    LLT Ty = MRI.getType(Res);

    unsigned Opcode = MI.getOpcode() == TargetOpcode::G_SMULO
                          ? TargetOpcode::G_SMULH
                          : TargetOpcode::G_UMULH;

    Observer.changingInstr(MI);
    const auto &TII = MIRBuilder.getTII();
    MI.setDesc(TII.get(TargetOpcode::G_MUL));
    MI.RemoveOperand(1);
    Observer.changedInstr(MI);

    auto HiPart = MIRBuilder.buildInstr(Opcode, {Ty}, {LHS, RHS});
    auto Zero = MIRBuilder.buildConstant(Ty, 0);

    // Move insert point forward so we can use the Res register if needed.
    MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());

    // For *signed* multiply, overflow is detected by checking:
    // (hi != (lo >> bitwidth-1))
    if (Opcode == TargetOpcode::G_SMULH) {
      auto ShiftAmt = MIRBuilder.buildConstant(Ty, Ty.getSizeInBits() - 1);
      auto Shifted = MIRBuilder.buildAShr(Ty, Res, ShiftAmt);
      MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Shifted);
    } else {
      MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Zero);
    }
    return Legalized;
  }
  case TargetOpcode::G_FNEG: {
    Register Res = MI.getOperand(0).getReg();
    LLT Ty = MRI.getType(Res);

    // TODO: Handle vector types once we are able to
    // represent them.
    if (Ty.isVector())
      return UnableToLegalize;
    auto SignMask =
        MIRBuilder.buildConstant(Ty, APInt::getSignMask(Ty.getSizeInBits()));
    Register SubByReg = MI.getOperand(1).getReg();
    MIRBuilder.buildXor(Res, SubByReg, SignMask);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_FSUB: {
    Register Res = MI.getOperand(0).getReg();
    LLT Ty = MRI.getType(Res);

    // Lower (G_FSUB LHS, RHS) to (G_FADD LHS, (G_FNEG RHS)).
    // First, check if G_FNEG is marked as Lower. If so, we may
    // end up with an infinite loop as G_FSUB is used to legalize G_FNEG.
    if (LI.getAction({G_FNEG, {Ty}}).Action == Lower)
      return UnableToLegalize;
    Register LHS = MI.getOperand(1).getReg();
    Register RHS = MI.getOperand(2).getReg();
    Register Neg = MRI.createGenericVirtualRegister(Ty);
    MIRBuilder.buildFNeg(Neg, RHS);
    MIRBuilder.buildFAdd(Res, LHS, Neg, MI.getFlags());
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_FMAD:
    return lowerFMad(MI);
  case TargetOpcode::G_FFLOOR:
    return lowerFFloor(MI);
  case TargetOpcode::G_INTRINSIC_ROUND:
    return lowerIntrinsicRound(MI);
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
    // Since round even is the assumed rounding mode for unconstrained FP
    // operations, rint and roundeven are the same operation.
    changeOpcode(MI, TargetOpcode::G_FRINT);
    return Legalized;
  }
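  // Lower to a plain G_ATOMIC_CMPXCHG and recompute the success flag by
  // comparing the old value against the expected one.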
  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
    Register OldValRes = MI.getOperand(0).getReg();
    Register SuccessRes = MI.getOperand(1).getReg();
    Register Addr = MI.getOperand(2).getReg();
    Register CmpVal = MI.getOperand(3).getReg();
    Register NewVal = MI.getOperand(4).getReg();
    MIRBuilder.buildAtomicCmpXchg(OldValRes, Addr, CmpVal, NewVal,
                                  **MI.memoperands_begin());
    MIRBuilder.buildICmp(CmpInst::ICMP_EQ, SuccessRes, OldValRes, CmpVal);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
    return lowerLoad(cast<GAnyLoad>(MI));
  case TargetOpcode::G_STORE:
    return lowerStore(cast<GStore>(MI));
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
  case TargetOpcode::G_CTTZ_ZERO_UNDEF:
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTTZ:
  case TargetOpcode::G_CTPOP:
    return lowerBitCount(MI);
  case G_UADDO: {
    Register Res = MI.getOperand(0).getReg();
    Register CarryOut = MI.getOperand(1).getReg();
    Register LHS = MI.getOperand(2).getReg();
    Register RHS = MI.getOperand(3).getReg();

    MIRBuilder.buildAdd(Res, LHS, RHS);
    MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CarryOut, Res, RHS);

    MI.eraseFromParent();
    return Legalized;
  }
  case G_UADDE: {
    Register Res = MI.getOperand(0).getReg();
    Register CarryOut = MI.getOperand(1).getReg();
    Register LHS = MI.getOperand(2).getReg();
    Register RHS = MI.getOperand(3).getReg();
    Register CarryIn = MI.getOperand(4).getReg();
    LLT Ty = MRI.getType(Res);

    auto TmpRes = MIRBuilder.buildAdd(Ty, LHS, RHS);
    auto ZExtCarryIn = MIRBuilder.buildZExt(Ty, CarryIn);
    MIRBuilder.buildAdd(Res, TmpRes, ZExtCarryIn);
    MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CarryOut, Res, LHS);

    MI.eraseFromParent();
    return Legalized;
  }
  case G_USUBO: {
    Register Res = MI.getOperand(0).getReg();
    Register BorrowOut = MI.getOperand(1).getReg();
    Register LHS = MI.getOperand(2).getReg();
    Register RHS = MI.getOperand(3).getReg();

    MIRBuilder.buildSub(Res, LHS, RHS);
    MIRBuilder.buildICmp(CmpInst::ICMP_ULT, BorrowOut, LHS, RHS);

    MI.eraseFromParent();
    return Legalized;
  }
  case G_USUBE: {
    Register Res = MI.getOperand(0).getReg();
    Register BorrowOut = MI.getOperand(1).getReg();
    Register LHS = MI.getOperand(2).getReg();
    Register RHS = MI.getOperand(3).getReg();
    Register BorrowIn = MI.getOperand(4).getReg();
    const LLT CondTy = MRI.getType(BorrowOut);
    const LLT Ty = MRI.getType(Res);

    auto TmpRes = MIRBuilder.buildSub(Ty, LHS, RHS);
    auto ZExtBorrowIn = MIRBuilder.buildZExt(Ty, BorrowIn);
    MIRBuilder.buildSub(Res, TmpRes, ZExtBorrowIn);
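    // Compute the borrow-out: if the operands are equal, the operand
    // subtraction is zero, so the borrow comes entirely from BorrowIn;
    // otherwise a borrow happens exactly when LHS u< RHS.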
G_SDIV : G_UDIV, {Ty}, 3216 {MI.getOperand(1), MI.getOperand(2)}); 3217 3218 auto Prod = MIRBuilder.buildMul(Ty, Quot, MI.getOperand(2)); 3219 MIRBuilder.buildSub(MI.getOperand(0), MI.getOperand(1), Prod); 3220 MI.eraseFromParent(); 3221 return Legalized; 3222 } 3223 case TargetOpcode::G_SADDO: 3224 case TargetOpcode::G_SSUBO: 3225 return lowerSADDO_SSUBO(MI); 3226 case TargetOpcode::G_UMULH: 3227 case TargetOpcode::G_SMULH: 3228 return lowerSMULH_UMULH(MI); 3229 case TargetOpcode::G_SMULO: 3230 case TargetOpcode::G_UMULO: { 3231 // Generate G_UMULH/G_SMULH to check for overflow and a normal G_MUL for the 3232 // result. 3233 Register Res = MI.getOperand(0).getReg(); 3234 Register Overflow = MI.getOperand(1).getReg(); 3235 Register LHS = MI.getOperand(2).getReg(); 3236 Register RHS = MI.getOperand(3).getReg(); 3237 LLT Ty = MRI.getType(Res); 3238 3239 unsigned Opcode = MI.getOpcode() == TargetOpcode::G_SMULO 3240 ? TargetOpcode::G_SMULH 3241 : TargetOpcode::G_UMULH; 3242 3243 Observer.changingInstr(MI); 3244 const auto &TII = MIRBuilder.getTII(); 3245 MI.setDesc(TII.get(TargetOpcode::G_MUL)); 3246 MI.RemoveOperand(1); 3247 Observer.changedInstr(MI); 3248 3249 auto HiPart = MIRBuilder.buildInstr(Opcode, {Ty}, {LHS, RHS}); 3250 auto Zero = MIRBuilder.buildConstant(Ty, 0); 3251 3252 // Move insert point forward so we can use the Res register if needed. 3253 MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt()); 3254 3255 // For *signed* multiply, overflow is detected by checking: 3256 // (hi != (lo >> bitwidth-1)) 3257 if (Opcode == TargetOpcode::G_SMULH) { 3258 auto ShiftAmt = MIRBuilder.buildConstant(Ty, Ty.getSizeInBits() - 1); 3259 auto Shifted = MIRBuilder.buildAShr(Ty, Res, ShiftAmt); 3260 MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Shifted); 3261 } else { 3262 MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Zero); 3263 } 3264 return Legalized; 3265 } 3266 case TargetOpcode::G_FNEG: { 3267 Register Res = MI.getOperand(0).getReg(); 3268 LLT Ty = MRI.getType(Res); 3269 3270 // TODO: Handle vector types once we are able to 3271 // represent them. 3272 if (Ty.isVector()) 3273 return UnableToLegalize; 3274 auto SignMask = 3275 MIRBuilder.buildConstant(Ty, APInt::getSignMask(Ty.getSizeInBits())); 3276 Register SubByReg = MI.getOperand(1).getReg(); 3277 MIRBuilder.buildXor(Res, SubByReg, SignMask); 3278 MI.eraseFromParent(); 3279 return Legalized; 3280 } 3281 case TargetOpcode::G_FSUB: { 3282 Register Res = MI.getOperand(0).getReg(); 3283 LLT Ty = MRI.getType(Res); 3284 3285 // Lower (G_FSUB LHS, RHS) to (G_FADD LHS, (G_FNEG RHS)). 3286 // First, check if G_FNEG is marked as Lower. If so, we may 3287 // end up with an infinite loop as G_FSUB is used to legalize G_FNEG. 3288 if (LI.getAction({G_FNEG, {Ty}}).Action == Lower) 3289 return UnableToLegalize; 3290 Register LHS = MI.getOperand(1).getReg(); 3291 Register RHS = MI.getOperand(2).getReg(); 3292 Register Neg = MRI.createGenericVirtualRegister(Ty); 3293 MIRBuilder.buildFNeg(Neg, RHS); 3294 MIRBuilder.buildFAdd(Res, LHS, Neg, MI.getFlags()); 3295 MI.eraseFromParent(); 3296 return Legalized; 3297 } 3298 case TargetOpcode::G_FMAD: 3299 return lowerFMad(MI); 3300 case TargetOpcode::G_FFLOOR: 3301 return lowerFFloor(MI); 3302 case TargetOpcode::G_INTRINSIC_ROUND: 3303 return lowerIntrinsicRound(MI); 3304 case TargetOpcode::G_INTRINSIC_ROUNDEVEN: { 3305 // Since round even is the assumed rounding mode for unconstrained FP 3306 // operations, rint and roundeven are the same operation. 
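  // Only the opcode needs to change; e.g.
  //   %0:_(s32) = G_INTRINSIC_ROUNDEVEN %1
  // becomes
  //   %0:_(s32) = G_FRINT %1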
    changeOpcode(MI, TargetOpcode::G_FRINT);
    return Legalized;
  }
  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
    Register OldValRes = MI.getOperand(0).getReg();
    Register SuccessRes = MI.getOperand(1).getReg();
    Register Addr = MI.getOperand(2).getReg();
    Register CmpVal = MI.getOperand(3).getReg();
    Register NewVal = MI.getOperand(4).getReg();
    MIRBuilder.buildAtomicCmpXchg(OldValRes, Addr, CmpVal, NewVal,
                                  **MI.memoperands_begin());
    MIRBuilder.buildICmp(CmpInst::ICMP_EQ, SuccessRes, OldValRes, CmpVal);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
    return lowerLoad(cast<GAnyLoad>(MI));
  case TargetOpcode::G_STORE:
    return lowerStore(cast<GStore>(MI));
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
  case TargetOpcode::G_CTTZ_ZERO_UNDEF:
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTTZ:
  case TargetOpcode::G_CTPOP:
    return lowerBitCount(MI);
  case G_UADDO: {
    Register Res = MI.getOperand(0).getReg();
    Register CarryOut = MI.getOperand(1).getReg();
    Register LHS = MI.getOperand(2).getReg();
    Register RHS = MI.getOperand(3).getReg();

    // The sum wrapped iff it compares unsigned-less-than either operand.
    MIRBuilder.buildAdd(Res, LHS, RHS);
    MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CarryOut, Res, RHS);

    MI.eraseFromParent();
    return Legalized;
  }
  case G_UADDE: {
    Register Res = MI.getOperand(0).getReg();
    Register CarryOut = MI.getOperand(1).getReg();
    Register LHS = MI.getOperand(2).getReg();
    Register RHS = MI.getOperand(3).getReg();
    Register CarryIn = MI.getOperand(4).getReg();
    const LLT CondTy = MRI.getType(CarryOut);
    LLT Ty = MRI.getType(Res);

    // Initial add of the two operands.
    auto TmpRes = MIRBuilder.buildAdd(Ty, LHS, RHS);

    // Initial check for carry.
    auto Carry = MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CondTy, TmpRes, LHS);

    // Add the sum and the carry.
    auto ZExtCarryIn = MIRBuilder.buildZExt(Ty, CarryIn);
    MIRBuilder.buildAdd(Res, TmpRes, ZExtCarryIn);

    // Second check for carry. Adding the carry-in can only overflow if the
    // initial sum is all 1s and the carry-in is set, in which case the new
    // sum is 0. Note that checking Res < LHS here instead would miss the
    // carry when RHS is all 1s and the carry-in is set.
    auto Zero = MIRBuilder.buildConstant(Ty, 0);
    auto ResEqZero = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, CondTy, Res, Zero);
    auto Carry2 = MIRBuilder.buildAnd(CondTy, ResEqZero, CarryIn);
    MIRBuilder.buildOr(CarryOut, Carry, Carry2);

    MI.eraseFromParent();
    return Legalized;
  }
  case G_USUBO: {
    Register Res = MI.getOperand(0).getReg();
    Register BorrowOut = MI.getOperand(1).getReg();
    Register LHS = MI.getOperand(2).getReg();
    Register RHS = MI.getOperand(3).getReg();

    // The difference borrowed iff LHS compares unsigned-less-than RHS.
    MIRBuilder.buildSub(Res, LHS, RHS);
    MIRBuilder.buildICmp(CmpInst::ICMP_ULT, BorrowOut, LHS, RHS);

    MI.eraseFromParent();
    return Legalized;
  }
  case G_USUBE: {
    Register Res = MI.getOperand(0).getReg();
    Register BorrowOut = MI.getOperand(1).getReg();
    Register LHS = MI.getOperand(2).getReg();
    Register RHS = MI.getOperand(3).getReg();
    Register BorrowIn = MI.getOperand(4).getReg();
    const LLT CondTy = MRI.getType(BorrowOut);
    const LLT Ty = MRI.getType(Res);

    auto TmpRes = MIRBuilder.buildSub(Ty, LHS, RHS);
    auto ZExtBorrowIn = MIRBuilder.buildZExt(Ty, BorrowIn);
    MIRBuilder.buildSub(Res, TmpRes, ZExtBorrowIn);

    // We borrow iff LHS < RHS + BorrowIn: when LHS == RHS the borrow-in
    // decides, otherwise LHS < RHS does.
    auto LHS_EQ_RHS = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, CondTy, LHS, RHS);
    auto LHS_ULT_RHS =
        MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CondTy, LHS, RHS);
    MIRBuilder.buildSelect(BorrowOut, LHS_EQ_RHS, BorrowIn, LHS_ULT_RHS);

    MI.eraseFromParent();
    return Legalized;
  }
  case G_UITOFP:
    return lowerUITOFP(MI);
  case G_SITOFP:
    return lowerSITOFP(MI);
  case G_FPTOUI:
    return lowerFPTOUI(MI);
  case G_FPTOSI:
    return lowerFPTOSI(MI);
  case G_FPTRUNC:
    return lowerFPTRUNC(MI);
  case G_FPOWI:
    return
lowerFPOWI(MI); 3406 case G_SMIN: 3407 case G_SMAX: 3408 case G_UMIN: 3409 case G_UMAX: 3410 return lowerMinMax(MI); 3411 case G_FCOPYSIGN: 3412 return lowerFCopySign(MI); 3413 case G_FMINNUM: 3414 case G_FMAXNUM: 3415 return lowerFMinNumMaxNum(MI); 3416 case G_MERGE_VALUES: 3417 return lowerMergeValues(MI); 3418 case G_UNMERGE_VALUES: 3419 return lowerUnmergeValues(MI); 3420 case TargetOpcode::G_SEXT_INREG: { 3421 assert(MI.getOperand(2).isImm() && "Expected immediate"); 3422 int64_t SizeInBits = MI.getOperand(2).getImm(); 3423 3424 Register DstReg = MI.getOperand(0).getReg(); 3425 Register SrcReg = MI.getOperand(1).getReg(); 3426 LLT DstTy = MRI.getType(DstReg); 3427 Register TmpRes = MRI.createGenericVirtualRegister(DstTy); 3428 3429 auto MIBSz = MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - SizeInBits); 3430 MIRBuilder.buildShl(TmpRes, SrcReg, MIBSz->getOperand(0)); 3431 MIRBuilder.buildAShr(DstReg, TmpRes, MIBSz->getOperand(0)); 3432 MI.eraseFromParent(); 3433 return Legalized; 3434 } 3435 case G_EXTRACT_VECTOR_ELT: 3436 case G_INSERT_VECTOR_ELT: 3437 return lowerExtractInsertVectorElt(MI); 3438 case G_SHUFFLE_VECTOR: 3439 return lowerShuffleVector(MI); 3440 case G_DYN_STACKALLOC: 3441 return lowerDynStackAlloc(MI); 3442 case G_EXTRACT: 3443 return lowerExtract(MI); 3444 case G_INSERT: 3445 return lowerInsert(MI); 3446 case G_BSWAP: 3447 return lowerBswap(MI); 3448 case G_BITREVERSE: 3449 return lowerBitreverse(MI); 3450 case G_READ_REGISTER: 3451 case G_WRITE_REGISTER: 3452 return lowerReadWriteRegister(MI); 3453 case G_UADDSAT: 3454 case G_USUBSAT: { 3455 // Try to make a reasonable guess about which lowering strategy to use. The 3456 // target can override this with custom lowering and calling the 3457 // implementation functions. 3458 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3459 if (LI.isLegalOrCustom({G_UMIN, Ty})) 3460 return lowerAddSubSatToMinMax(MI); 3461 return lowerAddSubSatToAddoSubo(MI); 3462 } 3463 case G_SADDSAT: 3464 case G_SSUBSAT: { 3465 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3466 3467 // FIXME: It would probably make more sense to see if G_SADDO is preferred, 3468 // since it's a shorter expansion. However, we would need to figure out the 3469 // preferred boolean type for the carry out for the query. 3470 if (LI.isLegalOrCustom({G_SMIN, Ty}) && LI.isLegalOrCustom({G_SMAX, Ty})) 3471 return lowerAddSubSatToMinMax(MI); 3472 return lowerAddSubSatToAddoSubo(MI); 3473 } 3474 case G_SSHLSAT: 3475 case G_USHLSAT: 3476 return lowerShlSat(MI); 3477 case G_ABS: 3478 return lowerAbsToAddXor(MI); 3479 case G_SELECT: 3480 return lowerSelect(MI); 3481 case G_SDIVREM: 3482 case G_UDIVREM: 3483 return lowerDIVREM(MI); 3484 case G_FSHL: 3485 case G_FSHR: 3486 return lowerFunnelShift(MI); 3487 case G_ROTL: 3488 case G_ROTR: 3489 return lowerRotate(MI); 3490 case G_MEMSET: 3491 case G_MEMCPY: 3492 case G_MEMMOVE: 3493 return lowerMemCpyFamily(MI); 3494 case G_MEMCPY_INLINE: 3495 return lowerMemcpyInline(MI); 3496 GISEL_VECREDUCE_CASES_NONSEQ 3497 return lowerVectorReduction(MI); 3498 } 3499 } 3500 3501 Align LegalizerHelper::getStackTemporaryAlignment(LLT Ty, 3502 Align MinAlign) const { 3503 // FIXME: We're missing a way to go back from LLT to llvm::Type to query the 3504 // datalayout for the preferred alignment. Also there should be a target hook 3505 // for this to allow targets to reduce the alignment and ignore the 3506 // datalayout. e.g. AMDGPU should always use a 4-byte alignment, regardless of 3507 // the type. 
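  // e.g. under the current scheme an s24 temporary (3 bytes) gets
  // PowerOf2Ceil(3) == 4 byte alignment and an s64 temporary gets 8 bytes,
  // unless MinAlign is larger.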
  return std::max(Align(PowerOf2Ceil(Ty.getSizeInBytes())), MinAlign);
}

MachineInstrBuilder
LegalizerHelper::createStackTemporary(TypeSize Bytes, Align Alignment,
                                      MachinePointerInfo &PtrInfo) {
  MachineFunction &MF = MIRBuilder.getMF();
  const DataLayout &DL = MIRBuilder.getDataLayout();
  int FrameIdx = MF.getFrameInfo().CreateStackObject(Bytes, Alignment, false);

  unsigned AddrSpace = DL.getAllocaAddrSpace();
  LLT FramePtrTy = LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace));

  PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIdx);
  return MIRBuilder.buildFrameIndex(FramePtrTy, FrameIdx);
}

static Register clampDynamicVectorIndex(MachineIRBuilder &B, Register IdxReg,
                                        LLT VecTy) {
  int64_t IdxVal;
  if (mi_match(IdxReg, *B.getMRI(), m_ICst(IdxVal)))
    return IdxReg;

  LLT IdxTy = B.getMRI()->getType(IdxReg);
  unsigned NElts = VecTy.getNumElements();
  if (isPowerOf2_32(NElts)) {
    APInt Imm = APInt::getLowBitsSet(IdxTy.getSizeInBits(), Log2_32(NElts));
    return B.buildAnd(IdxTy, IdxReg, B.buildConstant(IdxTy, Imm)).getReg(0);
  }

  return B.buildUMin(IdxTy, IdxReg, B.buildConstant(IdxTy, NElts - 1))
      .getReg(0);
}

Register LegalizerHelper::getVectorElementPointer(Register VecPtr, LLT VecTy,
                                                  Register Index) {
  LLT EltTy = VecTy.getElementType();

  // Calculate the element offset and add it to the pointer.
  unsigned EltSize = EltTy.getSizeInBits() / 8; // FIXME: should be ABI size.
  assert(EltSize * 8 == EltTy.getSizeInBits() &&
         "Converting bits to bytes lost precision");

  // Clamp a dynamic index first, so the computed address stays within the
  // vector in memory.
  Index = clampDynamicVectorIndex(MIRBuilder, Index, VecTy);

  LLT IdxTy = MRI.getType(Index);
  auto Mul = MIRBuilder.buildMul(IdxTy, Index,
                                 MIRBuilder.buildConstant(IdxTy, EltSize));

  LLT PtrTy = MRI.getType(VecPtr);
  return MIRBuilder.buildPtrAdd(PtrTy, VecPtr, Mul).getReg(0);
}

LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorImplicitDef(
    MachineInstr &MI, unsigned TypeIdx, LLT NarrowTy) {
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT LCMTy = getLCMType(DstTy, NarrowTy);

  unsigned NumParts = LCMTy.getSizeInBits() / NarrowTy.getSizeInBits();

  auto NewUndef = MIRBuilder.buildUndef(NarrowTy);
  SmallVector<Register, 8> Parts(NumParts, NewUndef.getReg(0));

  buildWidenedRemergeToDst(DstReg, LCMTy, Parts);
  MI.eraseFromParent();
  return Legalized;
}

// Handle splitting vector operations which need to have the same number of
// elements in each type index, but each type index may have a different
// element type.
//
// e.g. <4 x s64> = G_SHL <4 x s64>, <4 x s32> ->
//     <2 x s64> = G_SHL <2 x s64>, <2 x s32>
//     <2 x s64> = G_SHL <2 x s64>, <2 x s32>
//
// Also handles some irregular breakdown cases,
// e.g.
<3 x s64> = G_SHL <3 x s64>, <3 x s32> -> 3587 // <2 x s64> = G_SHL <2 x s64>, <2 x s32> 3588 // s64 = G_SHL s64, s32 3589 LegalizerHelper::LegalizeResult 3590 LegalizerHelper::fewerElementsVectorMultiEltType( 3591 MachineInstr &MI, unsigned TypeIdx, LLT NarrowTyArg) { 3592 if (TypeIdx != 0) 3593 return UnableToLegalize; 3594 3595 const LLT NarrowTy0 = NarrowTyArg; 3596 const Register DstReg = MI.getOperand(0).getReg(); 3597 LLT DstTy = MRI.getType(DstReg); 3598 LLT LeftoverTy0; 3599 3600 // All of the operands need to have the same number of elements, so if we can 3601 // determine a type breakdown for the result type, we can for all of the 3602 // source types. 3603 int NumParts = getNarrowTypeBreakDown(DstTy, NarrowTy0, LeftoverTy0).first; 3604 if (NumParts < 0) 3605 return UnableToLegalize; 3606 3607 SmallVector<MachineInstrBuilder, 4> NewInsts; 3608 3609 SmallVector<Register, 4> DstRegs, LeftoverDstRegs; 3610 SmallVector<Register, 4> PartRegs, LeftoverRegs; 3611 3612 for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I) { 3613 Register SrcReg = MI.getOperand(I).getReg(); 3614 LLT SrcTyI = MRI.getType(SrcReg); 3615 const auto NewEC = NarrowTy0.isVector() ? NarrowTy0.getElementCount() 3616 : ElementCount::getFixed(1); 3617 LLT NarrowTyI = LLT::scalarOrVector(NewEC, SrcTyI.getScalarType()); 3618 LLT LeftoverTyI; 3619 3620 // Split this operand into the requested typed registers, and any leftover 3621 // required to reproduce the original type. 3622 if (!extractParts(SrcReg, SrcTyI, NarrowTyI, LeftoverTyI, PartRegs, 3623 LeftoverRegs)) 3624 return UnableToLegalize; 3625 3626 if (I == 1) { 3627 // For the first operand, create an instruction for each part and setup 3628 // the result. 3629 for (Register PartReg : PartRegs) { 3630 Register PartDstReg = MRI.createGenericVirtualRegister(NarrowTy0); 3631 NewInsts.push_back(MIRBuilder.buildInstrNoInsert(MI.getOpcode()) 3632 .addDef(PartDstReg) 3633 .addUse(PartReg)); 3634 DstRegs.push_back(PartDstReg); 3635 } 3636 3637 for (Register LeftoverReg : LeftoverRegs) { 3638 Register PartDstReg = MRI.createGenericVirtualRegister(LeftoverTy0); 3639 NewInsts.push_back(MIRBuilder.buildInstrNoInsert(MI.getOpcode()) 3640 .addDef(PartDstReg) 3641 .addUse(LeftoverReg)); 3642 LeftoverDstRegs.push_back(PartDstReg); 3643 } 3644 } else { 3645 assert(NewInsts.size() == PartRegs.size() + LeftoverRegs.size()); 3646 3647 // Add the newly created operand splits to the existing instructions. The 3648 // odd-sized pieces are ordered after the requested NarrowTyArg sized 3649 // pieces. 3650 unsigned InstCount = 0; 3651 for (unsigned J = 0, JE = PartRegs.size(); J != JE; ++J) 3652 NewInsts[InstCount++].addUse(PartRegs[J]); 3653 for (unsigned J = 0, JE = LeftoverRegs.size(); J != JE; ++J) 3654 NewInsts[InstCount++].addUse(LeftoverRegs[J]); 3655 } 3656 3657 PartRegs.clear(); 3658 LeftoverRegs.clear(); 3659 } 3660 3661 // Insert the newly built operations and rebuild the result register. 
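  // E.g. for the <3 x s64> shift in the comment above, NewInsts holds a
  // <2 x s64> G_SHL and an s64 G_SHL at this point; insertParts then stitches
  // their results back together into the original <3 x s64> def.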
3662 for (auto &MIB : NewInsts) 3663 MIRBuilder.insertInstr(MIB); 3664 3665 insertParts(DstReg, DstTy, NarrowTy0, DstRegs, LeftoverTy0, LeftoverDstRegs); 3666 3667 MI.eraseFromParent(); 3668 return Legalized; 3669 } 3670 3671 LegalizerHelper::LegalizeResult 3672 LegalizerHelper::fewerElementsVectorCasts(MachineInstr &MI, unsigned TypeIdx, 3673 LLT NarrowTy) { 3674 if (TypeIdx != 0) 3675 return UnableToLegalize; 3676 3677 Register DstReg = MI.getOperand(0).getReg(); 3678 Register SrcReg = MI.getOperand(1).getReg(); 3679 LLT DstTy = MRI.getType(DstReg); 3680 LLT SrcTy = MRI.getType(SrcReg); 3681 3682 LLT NarrowTy0 = NarrowTy; 3683 LLT NarrowTy1; 3684 unsigned NumParts; 3685 3686 if (NarrowTy.isVector()) { 3687 // Uneven breakdown not handled. 3688 NumParts = DstTy.getNumElements() / NarrowTy.getNumElements(); 3689 if (NumParts * NarrowTy.getNumElements() != DstTy.getNumElements()) 3690 return UnableToLegalize; 3691 3692 NarrowTy1 = LLT::vector(NarrowTy.getElementCount(), SrcTy.getElementType()); 3693 } else { 3694 NumParts = DstTy.getNumElements(); 3695 NarrowTy1 = SrcTy.getElementType(); 3696 } 3697 3698 SmallVector<Register, 4> SrcRegs, DstRegs; 3699 extractParts(SrcReg, NarrowTy1, NumParts, SrcRegs); 3700 3701 for (unsigned I = 0; I < NumParts; ++I) { 3702 Register DstReg = MRI.createGenericVirtualRegister(NarrowTy0); 3703 MachineInstr *NewInst = 3704 MIRBuilder.buildInstr(MI.getOpcode(), {DstReg}, {SrcRegs[I]}); 3705 3706 NewInst->setFlags(MI.getFlags()); 3707 DstRegs.push_back(DstReg); 3708 } 3709 3710 if (NarrowTy.isVector()) 3711 MIRBuilder.buildConcatVectors(DstReg, DstRegs); 3712 else 3713 MIRBuilder.buildBuildVector(DstReg, DstRegs); 3714 3715 MI.eraseFromParent(); 3716 return Legalized; 3717 } 3718 3719 LegalizerHelper::LegalizeResult 3720 LegalizerHelper::fewerElementsVectorCmp(MachineInstr &MI, unsigned TypeIdx, 3721 LLT NarrowTy) { 3722 Register DstReg = MI.getOperand(0).getReg(); 3723 Register Src0Reg = MI.getOperand(2).getReg(); 3724 LLT DstTy = MRI.getType(DstReg); 3725 LLT SrcTy = MRI.getType(Src0Reg); 3726 3727 unsigned NumParts; 3728 LLT NarrowTy0, NarrowTy1; 3729 3730 if (TypeIdx == 0) { 3731 unsigned NewElts = NarrowTy.isVector() ? NarrowTy.getNumElements() : 1; 3732 unsigned OldElts = DstTy.getNumElements(); 3733 3734 NarrowTy0 = NarrowTy; 3735 NumParts = NarrowTy.isVector() ? (OldElts / NewElts) : DstTy.getNumElements(); 3736 NarrowTy1 = NarrowTy.isVector() ? LLT::vector(NarrowTy.getElementCount(), 3737 SrcTy.getScalarSizeInBits()) 3738 : SrcTy.getElementType(); 3739 3740 } else { 3741 unsigned NewElts = NarrowTy.isVector() ? NarrowTy.getNumElements() : 1; 3742 unsigned OldElts = SrcTy.getNumElements(); 3743 3744 NumParts = NarrowTy.isVector() ? (OldElts / NewElts) : 3745 NarrowTy.getNumElements(); 3746 NarrowTy0 = 3747 LLT::vector(NarrowTy.getElementCount(), DstTy.getScalarSizeInBits()); 3748 NarrowTy1 = NarrowTy; 3749 } 3750 3751 // FIXME: Don't know how to handle the situation where the small vectors 3752 // aren't all the same size yet. 
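  // E.g. a 3-element compare split into 2-element pieces would also need a
  // trailing scalar piece, which the code below doesn't create.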
  if (NarrowTy1.isVector() &&
      NarrowTy1.getNumElements() * NumParts != DstTy.getNumElements())
    return UnableToLegalize;

  CmpInst::Predicate Pred =
      static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());

  SmallVector<Register, 2> Src1Regs, Src2Regs, DstRegs;
  extractParts(MI.getOperand(2).getReg(), NarrowTy1, NumParts, Src1Regs);
  extractParts(MI.getOperand(3).getReg(), NarrowTy1, NumParts, Src2Regs);

  for (unsigned I = 0; I < NumParts; ++I) {
    Register DstReg = MRI.createGenericVirtualRegister(NarrowTy0);
    DstRegs.push_back(DstReg);

    if (MI.getOpcode() == TargetOpcode::G_ICMP)
      MIRBuilder.buildICmp(Pred, DstReg, Src1Regs[I], Src2Regs[I]);
    else {
      MachineInstr *NewCmp =
          MIRBuilder.buildFCmp(Pred, DstReg, Src1Regs[I], Src2Regs[I]);
      NewCmp->setFlags(MI.getFlags());
    }
  }

  if (NarrowTy1.isVector())
    MIRBuilder.buildConcatVectors(DstReg, DstRegs);
  else
    MIRBuilder.buildBuildVector(DstReg, DstRegs);

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorSelect(MachineInstr &MI, unsigned TypeIdx,
                                           LLT NarrowTy) {
  Register DstReg = MI.getOperand(0).getReg();
  Register CondReg = MI.getOperand(1).getReg();

  unsigned NumParts = 0;
  LLT NarrowTy0, NarrowTy1;

  LLT DstTy = MRI.getType(DstReg);
  LLT CondTy = MRI.getType(CondReg);
  unsigned Size = DstTy.getSizeInBits();

  assert(TypeIdx == 0 || CondTy.isVector());

  if (TypeIdx == 0) {
    NarrowTy0 = NarrowTy;
    NarrowTy1 = CondTy;

    unsigned NarrowSize = NarrowTy0.getSizeInBits();
    // FIXME: Don't know how to handle the situation where the small vectors
    // aren't all the same size yet.
    if (Size % NarrowSize != 0)
      return UnableToLegalize;

    NumParts = Size / NarrowSize;

    // Need to break down the condition type
    if (CondTy.isVector()) {
      if (CondTy.getNumElements() == NumParts)
        NarrowTy1 = CondTy.getElementType();
      else
        NarrowTy1 =
            LLT::vector(CondTy.getElementCount().divideCoefficientBy(NumParts),
                        CondTy.getScalarSizeInBits());
    }
  } else {
    NumParts = CondTy.getNumElements();
    if (NarrowTy.isVector()) {
      // TODO: Breaking the condition down into vector pieces (including the
      // uneven breakdown case) isn't implemented yet.
      return UnableToLegalize;
    }

    NarrowTy0 = DstTy.getElementType();
    NarrowTy1 = NarrowTy;
  }

  SmallVector<Register, 2> DstRegs, Src0Regs, Src1Regs, Src2Regs;
  if (CondTy.isVector())
    extractParts(MI.getOperand(1).getReg(), NarrowTy1, NumParts, Src0Regs);

  extractParts(MI.getOperand(2).getReg(), NarrowTy0, NumParts, Src1Regs);
  extractParts(MI.getOperand(3).getReg(), NarrowTy0, NumParts, Src2Regs);

  for (unsigned i = 0; i < NumParts; ++i) {
    Register DstReg = MRI.createGenericVirtualRegister(NarrowTy0);
    MIRBuilder.buildSelect(DstReg, CondTy.isVector() ?
Src0Regs[i] : CondReg, 3846 Src1Regs[i], Src2Regs[i]); 3847 DstRegs.push_back(DstReg); 3848 } 3849 3850 if (NarrowTy0.isVector()) 3851 MIRBuilder.buildConcatVectors(DstReg, DstRegs); 3852 else 3853 MIRBuilder.buildBuildVector(DstReg, DstRegs); 3854 3855 MI.eraseFromParent(); 3856 return Legalized; 3857 } 3858 3859 LegalizerHelper::LegalizeResult 3860 LegalizerHelper::fewerElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx, 3861 LLT NarrowTy) { 3862 const Register DstReg = MI.getOperand(0).getReg(); 3863 LLT PhiTy = MRI.getType(DstReg); 3864 LLT LeftoverTy; 3865 3866 // All of the operands need to have the same number of elements, so if we can 3867 // determine a type breakdown for the result type, we can for all of the 3868 // source types. 3869 int NumParts, NumLeftover; 3870 std::tie(NumParts, NumLeftover) 3871 = getNarrowTypeBreakDown(PhiTy, NarrowTy, LeftoverTy); 3872 if (NumParts < 0) 3873 return UnableToLegalize; 3874 3875 SmallVector<Register, 4> DstRegs, LeftoverDstRegs; 3876 SmallVector<MachineInstrBuilder, 4> NewInsts; 3877 3878 const int TotalNumParts = NumParts + NumLeftover; 3879 3880 // Insert the new phis in the result block first. 3881 for (int I = 0; I != TotalNumParts; ++I) { 3882 LLT Ty = I < NumParts ? NarrowTy : LeftoverTy; 3883 Register PartDstReg = MRI.createGenericVirtualRegister(Ty); 3884 NewInsts.push_back(MIRBuilder.buildInstr(TargetOpcode::G_PHI) 3885 .addDef(PartDstReg)); 3886 if (I < NumParts) 3887 DstRegs.push_back(PartDstReg); 3888 else 3889 LeftoverDstRegs.push_back(PartDstReg); 3890 } 3891 3892 MachineBasicBlock *MBB = MI.getParent(); 3893 MIRBuilder.setInsertPt(*MBB, MBB->getFirstNonPHI()); 3894 insertParts(DstReg, PhiTy, NarrowTy, DstRegs, LeftoverTy, LeftoverDstRegs); 3895 3896 SmallVector<Register, 4> PartRegs, LeftoverRegs; 3897 3898 // Insert code to extract the incoming values in each predecessor block. 3899 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 3900 PartRegs.clear(); 3901 LeftoverRegs.clear(); 3902 3903 Register SrcReg = MI.getOperand(I).getReg(); 3904 MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB(); 3905 MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator()); 3906 3907 LLT Unused; 3908 if (!extractParts(SrcReg, PhiTy, NarrowTy, Unused, PartRegs, 3909 LeftoverRegs)) 3910 return UnableToLegalize; 3911 3912 // Add the newly created operand splits to the existing instructions. The 3913 // odd-sized pieces are ordered after the requested NarrowTyArg sized 3914 // pieces. 3915 for (int J = 0; J != TotalNumParts; ++J) { 3916 MachineInstrBuilder MIB = NewInsts[J]; 3917 MIB.addUse(J < NumParts ? PartRegs[J] : LeftoverRegs[J - NumParts]); 3918 MIB.addMBB(&OpMBB); 3919 } 3920 } 3921 3922 MI.eraseFromParent(); 3923 return Legalized; 3924 } 3925 3926 LegalizerHelper::LegalizeResult 3927 LegalizerHelper::fewerElementsVectorUnmergeValues(MachineInstr &MI, 3928 unsigned TypeIdx, 3929 LLT NarrowTy) { 3930 if (TypeIdx != 1) 3931 return UnableToLegalize; 3932 3933 const int NumDst = MI.getNumOperands() - 1; 3934 const Register SrcReg = MI.getOperand(NumDst).getReg(); 3935 LLT SrcTy = MRI.getType(SrcReg); 3936 3937 LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); 3938 3939 // TODO: Create sequence of extracts. 3940 if (DstTy == NarrowTy) 3941 return UnableToLegalize; 3942 3943 LLT GCDTy = getGCDType(SrcTy, NarrowTy); 3944 if (DstTy == GCDTy) { 3945 // This would just be a copy of the same unmerge. 3946 // TODO: Create extracts, pad with undef and create intermediate merges. 
3947 return UnableToLegalize; 3948 } 3949 3950 auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg); 3951 const int NumUnmerge = Unmerge->getNumOperands() - 1; 3952 const int PartsPerUnmerge = NumDst / NumUnmerge; 3953 3954 for (int I = 0; I != NumUnmerge; ++I) { 3955 auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES); 3956 3957 for (int J = 0; J != PartsPerUnmerge; ++J) 3958 MIB.addDef(MI.getOperand(I * PartsPerUnmerge + J).getReg()); 3959 MIB.addUse(Unmerge.getReg(I)); 3960 } 3961 3962 MI.eraseFromParent(); 3963 return Legalized; 3964 } 3965 3966 LegalizerHelper::LegalizeResult 3967 LegalizerHelper::fewerElementsVectorMulo(MachineInstr &MI, unsigned TypeIdx, 3968 LLT NarrowTy) { 3969 Register Result = MI.getOperand(0).getReg(); 3970 Register Overflow = MI.getOperand(1).getReg(); 3971 Register LHS = MI.getOperand(2).getReg(); 3972 Register RHS = MI.getOperand(3).getReg(); 3973 3974 LLT SrcTy = MRI.getType(LHS); 3975 if (!SrcTy.isVector()) 3976 return UnableToLegalize; 3977 3978 LLT ElementType = SrcTy.getElementType(); 3979 LLT OverflowElementTy = MRI.getType(Overflow).getElementType(); 3980 const ElementCount NumResult = SrcTy.getElementCount(); 3981 LLT GCDTy = getGCDType(SrcTy, NarrowTy); 3982 3983 // Unmerge the operands to smaller parts of GCD type. 3984 auto UnmergeLHS = MIRBuilder.buildUnmerge(GCDTy, LHS); 3985 auto UnmergeRHS = MIRBuilder.buildUnmerge(GCDTy, RHS); 3986 3987 const int NumOps = UnmergeLHS->getNumOperands() - 1; 3988 const ElementCount PartsPerUnmerge = NumResult.divideCoefficientBy(NumOps); 3989 LLT OverflowTy = LLT::scalarOrVector(PartsPerUnmerge, OverflowElementTy); 3990 LLT ResultTy = LLT::scalarOrVector(PartsPerUnmerge, ElementType); 3991 3992 // Perform the operation over unmerged parts. 3993 SmallVector<Register, 8> ResultParts; 3994 SmallVector<Register, 8> OverflowParts; 3995 for (int I = 0; I != NumOps; ++I) { 3996 Register Operand1 = UnmergeLHS->getOperand(I).getReg(); 3997 Register Operand2 = UnmergeRHS->getOperand(I).getReg(); 3998 auto PartMul = MIRBuilder.buildInstr(MI.getOpcode(), {ResultTy, OverflowTy}, 3999 {Operand1, Operand2}); 4000 ResultParts.push_back(PartMul->getOperand(0).getReg()); 4001 OverflowParts.push_back(PartMul->getOperand(1).getReg()); 4002 } 4003 4004 LLT ResultLCMTy = buildLCMMergePieces(SrcTy, NarrowTy, GCDTy, ResultParts); 4005 LLT OverflowLCMTy = 4006 LLT::scalarOrVector(ResultLCMTy.getElementCount(), OverflowElementTy); 4007 4008 // Recombine the pieces to the original result and overflow registers. 4009 buildWidenedRemergeToDst(Result, ResultLCMTy, ResultParts); 4010 buildWidenedRemergeToDst(Overflow, OverflowLCMTy, OverflowParts); 4011 MI.eraseFromParent(); 4012 return Legalized; 4013 } 4014 4015 // Handle FewerElementsVector a G_BUILD_VECTOR or G_CONCAT_VECTORS that produces 4016 // a vector 4017 // 4018 // Create a G_BUILD_VECTOR or G_CONCAT_VECTORS of NarrowTy pieces, padding with 4019 // undef as necessary. 
//
// %3:_(<3 x s16>) = G_BUILD_VECTOR %0, %1, %2
//   -> <2 x s16>
//
// %4:_(s16) = G_IMPLICIT_DEF
// %5:_(<2 x s16>) = G_BUILD_VECTOR %0, %1
// %6:_(<2 x s16>) = G_BUILD_VECTOR %2, %4
// %7:_(<2 x s16>) = G_IMPLICIT_DEF
// %8:_(<6 x s16>) = G_CONCAT_VECTORS %5, %6, %7
// %3:_(<3 x s16>), %9:_(<3 x s16>) = G_UNMERGE_VALUES %8
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorMerge(MachineInstr &MI, unsigned TypeIdx,
                                          LLT NarrowTy) {
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
  LLT GCDTy = getGCDType(getGCDType(SrcTy, NarrowTy), DstTy);

  // Break into a common type
  SmallVector<Register, 16> Parts;
  for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
    extractGCDType(Parts, GCDTy, MI.getOperand(I).getReg());

  // Build the requested new merge, padding with undef.
  LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts,
                                  TargetOpcode::G_ANYEXT);

  // Pack into the original result register.
  buildWidenedRemergeToDst(DstReg, LCMTy, Parts);

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI,
                                                           unsigned TypeIdx,
                                                           LLT NarrowVecTy) {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcVec = MI.getOperand(1).getReg();
  Register InsertVal;
  bool IsInsert = MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT;

  assert((IsInsert ? TypeIdx == 0 : TypeIdx == 1) && "not a vector type index");
  if (IsInsert)
    InsertVal = MI.getOperand(2).getReg();

  Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();

  // TODO: Handle total scalarization case.
  if (!NarrowVecTy.isVector())
    return UnableToLegalize;

  LLT VecTy = MRI.getType(SrcVec);

  // If the index is a constant, we can really break this down as you would
  // expect, and index into the target size pieces.
  int64_t IdxVal;
  auto MaybeCst = getIConstantVRegValWithLookThrough(Idx, MRI);
  if (MaybeCst) {
    IdxVal = MaybeCst->Value.getSExtValue();
    // Avoid out of bounds indexing the pieces.
    if (IdxVal >= VecTy.getNumElements()) {
      MIRBuilder.buildUndef(DstReg);
      MI.eraseFromParent();
      return Legalized;
    }

    SmallVector<Register, 8> VecParts;
    LLT GCDTy = extractGCDType(VecParts, VecTy, NarrowVecTy, SrcVec);

    // Build a sequence of NarrowTy pieces in VecParts for this operand.
    LLT LCMTy = buildLCMMergePieces(VecTy, NarrowVecTy, GCDTy, VecParts,
                                    TargetOpcode::G_ANYEXT);

    unsigned NewNumElts = NarrowVecTy.getNumElements();

    LLT IdxTy = MRI.getType(Idx);
    int64_t PartIdx = IdxVal / NewNumElts;
    auto NewIdx =
        MIRBuilder.buildConstant(IdxTy, IdxVal - NewNumElts * PartIdx);

    if (IsInsert) {
      LLT PartTy = MRI.getType(VecParts[PartIdx]);

      // Use the adjusted index to insert into one of the subvectors.
      auto InsertPart = MIRBuilder.buildInsertVectorElement(
          PartTy, VecParts[PartIdx], InsertVal, NewIdx);
      VecParts[PartIdx] = InsertPart.getReg(0);

      // Recombine the inserted subvector with the others to reform the result
      // vector.
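      // E.g. for an insert into element 5 of a <8 x s16> handled as
      // <4 x s16> pieces (piece 1, re-based index 1), this merges the two
      // pieces back into the original <8 x s16> def.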
4112 buildWidenedRemergeToDst(DstReg, LCMTy, VecParts); 4113 } else { 4114 MIRBuilder.buildExtractVectorElement(DstReg, VecParts[PartIdx], NewIdx); 4115 } 4116 4117 MI.eraseFromParent(); 4118 return Legalized; 4119 } 4120 4121 // With a variable index, we can't perform the operation in a smaller type, so 4122 // we're forced to expand this. 4123 // 4124 // TODO: We could emit a chain of compare/select to figure out which piece to 4125 // index. 4126 return lowerExtractInsertVectorElt(MI); 4127 } 4128 4129 LegalizerHelper::LegalizeResult 4130 LegalizerHelper::reduceLoadStoreWidth(GLoadStore &LdStMI, unsigned TypeIdx, 4131 LLT NarrowTy) { 4132 // FIXME: Don't know how to handle secondary types yet. 4133 if (TypeIdx != 0) 4134 return UnableToLegalize; 4135 4136 // This implementation doesn't work for atomics. Give up instead of doing 4137 // something invalid. 4138 if (LdStMI.isAtomic()) 4139 return UnableToLegalize; 4140 4141 bool IsLoad = isa<GLoad>(LdStMI); 4142 Register ValReg = LdStMI.getReg(0); 4143 Register AddrReg = LdStMI.getPointerReg(); 4144 LLT ValTy = MRI.getType(ValReg); 4145 4146 // FIXME: Do we need a distinct NarrowMemory legalize action? 4147 if (ValTy.getSizeInBits() != 8 * LdStMI.getMemSize()) { 4148 LLVM_DEBUG(dbgs() << "Can't narrow extload/truncstore\n"); 4149 return UnableToLegalize; 4150 } 4151 4152 int NumParts = -1; 4153 int NumLeftover = -1; 4154 LLT LeftoverTy; 4155 SmallVector<Register, 8> NarrowRegs, NarrowLeftoverRegs; 4156 if (IsLoad) { 4157 std::tie(NumParts, NumLeftover) = getNarrowTypeBreakDown(ValTy, NarrowTy, LeftoverTy); 4158 } else { 4159 if (extractParts(ValReg, ValTy, NarrowTy, LeftoverTy, NarrowRegs, 4160 NarrowLeftoverRegs)) { 4161 NumParts = NarrowRegs.size(); 4162 NumLeftover = NarrowLeftoverRegs.size(); 4163 } 4164 } 4165 4166 if (NumParts == -1) 4167 return UnableToLegalize; 4168 4169 LLT PtrTy = MRI.getType(AddrReg); 4170 const LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits()); 4171 4172 unsigned TotalSize = ValTy.getSizeInBits(); 4173 4174 // Split the load/store into PartTy sized pieces starting at Offset. If this 4175 // is a load, return the new registers in ValRegs. For a store, each elements 4176 // of ValRegs should be PartTy. Returns the next offset that needs to be 4177 // handled. 4178 auto MMO = LdStMI.getMMO(); 4179 auto splitTypePieces = [=](LLT PartTy, SmallVectorImpl<Register> &ValRegs, 4180 unsigned Offset) -> unsigned { 4181 MachineFunction &MF = MIRBuilder.getMF(); 4182 unsigned PartSize = PartTy.getSizeInBits(); 4183 for (unsigned Idx = 0, E = NumParts; Idx != E && Offset < TotalSize; 4184 Offset += PartSize, ++Idx) { 4185 unsigned ByteOffset = Offset / 8; 4186 Register NewAddrReg; 4187 4188 MIRBuilder.materializePtrAdd(NewAddrReg, AddrReg, OffsetTy, ByteOffset); 4189 4190 MachineMemOperand *NewMMO = 4191 MF.getMachineMemOperand(&MMO, ByteOffset, PartTy); 4192 4193 if (IsLoad) { 4194 Register Dst = MRI.createGenericVirtualRegister(PartTy); 4195 ValRegs.push_back(Dst); 4196 MIRBuilder.buildLoad(Dst, NewAddrReg, *NewMMO); 4197 } else { 4198 MIRBuilder.buildStore(ValRegs[Idx], NewAddrReg, *NewMMO); 4199 } 4200 } 4201 4202 return Offset; 4203 }; 4204 4205 unsigned HandledOffset = splitTypePieces(NarrowTy, NarrowRegs, 0); 4206 4207 // Handle the rest of the register if this isn't an even type breakdown. 
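  // E.g. an s96 value split with NarrowTy s64 leaves an s32 leftover, which
  // the second call accesses at byte offset 8.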
4208 if (LeftoverTy.isValid()) 4209 splitTypePieces(LeftoverTy, NarrowLeftoverRegs, HandledOffset); 4210 4211 if (IsLoad) { 4212 insertParts(ValReg, ValTy, NarrowTy, NarrowRegs, 4213 LeftoverTy, NarrowLeftoverRegs); 4214 } 4215 4216 LdStMI.eraseFromParent(); 4217 return Legalized; 4218 } 4219 4220 LegalizerHelper::LegalizeResult 4221 LegalizerHelper::reduceOperationWidth(MachineInstr &MI, unsigned int TypeIdx, 4222 LLT NarrowTy) { 4223 assert(TypeIdx == 0 && "only one type index expected"); 4224 4225 const unsigned Opc = MI.getOpcode(); 4226 const int NumDefOps = MI.getNumExplicitDefs(); 4227 const int NumSrcOps = MI.getNumOperands() - NumDefOps; 4228 const unsigned Flags = MI.getFlags(); 4229 const unsigned NarrowSize = NarrowTy.getSizeInBits(); 4230 const LLT NarrowScalarTy = LLT::scalar(NarrowSize); 4231 4232 assert(MI.getNumOperands() <= 4 && "expected instruction with either 1 " 4233 "result and 1-3 sources or 2 results and " 4234 "1-2 sources"); 4235 4236 SmallVector<Register, 2> DstRegs; 4237 for (int I = 0; I < NumDefOps; ++I) 4238 DstRegs.push_back(MI.getOperand(I).getReg()); 4239 4240 // First of all check whether we are narrowing (changing the element type) 4241 // or reducing the vector elements 4242 const LLT DstTy = MRI.getType(DstRegs[0]); 4243 const bool IsNarrow = NarrowTy.getScalarType() != DstTy.getScalarType(); 4244 4245 SmallVector<Register, 8> ExtractedRegs[3]; 4246 SmallVector<Register, 8> Parts; 4247 4248 // Break down all the sources into NarrowTy pieces we can operate on. This may 4249 // involve creating merges to a wider type, padded with undef. 4250 for (int I = 0; I != NumSrcOps; ++I) { 4251 Register SrcReg = MI.getOperand(I + NumDefOps).getReg(); 4252 LLT SrcTy = MRI.getType(SrcReg); 4253 4254 // The type to narrow SrcReg to. For narrowing, this is a smaller scalar. 4255 // For fewerElements, this is a smaller vector with the same element type. 4256 LLT OpNarrowTy; 4257 if (IsNarrow) { 4258 OpNarrowTy = NarrowScalarTy; 4259 4260 // In case of narrowing, we need to cast vectors to scalars for this to 4261 // work properly 4262 // FIXME: Can we do without the bitcast here if we're narrowing? 4263 if (SrcTy.isVector()) { 4264 SrcTy = LLT::scalar(SrcTy.getSizeInBits()); 4265 SrcReg = MIRBuilder.buildBitcast(SrcTy, SrcReg).getReg(0); 4266 } 4267 } else { 4268 auto NarrowEC = NarrowTy.isVector() ? NarrowTy.getElementCount() 4269 : ElementCount::getFixed(1); 4270 OpNarrowTy = LLT::scalarOrVector(NarrowEC, SrcTy.getScalarType()); 4271 } 4272 4273 LLT GCDTy = extractGCDType(ExtractedRegs[I], SrcTy, OpNarrowTy, SrcReg); 4274 4275 // Build a sequence of NarrowTy pieces in ExtractedRegs for this operand. 4276 buildLCMMergePieces(SrcTy, OpNarrowTy, GCDTy, ExtractedRegs[I], 4277 TargetOpcode::G_ANYEXT); 4278 } 4279 4280 SmallVector<Register, 8> ResultRegs[2]; 4281 4282 // Input operands for each sub-instruction. 4283 SmallVector<SrcOp, 4> InputRegs(NumSrcOps, Register()); 4284 4285 int NumParts = ExtractedRegs[0].size(); 4286 const unsigned DstSize = DstTy.getSizeInBits(); 4287 const LLT DstScalarTy = LLT::scalar(DstSize); 4288 4289 // Narrowing needs to use scalar types 4290 LLT DstLCMTy, NarrowDstTy; 4291 if (IsNarrow) { 4292 DstLCMTy = getLCMType(DstScalarTy, NarrowScalarTy); 4293 NarrowDstTy = NarrowScalarTy; 4294 } else { 4295 DstLCMTy = getLCMType(DstTy, NarrowTy); 4296 NarrowDstTy = NarrowTy; 4297 } 4298 4299 // We widened the source registers to satisfy merge/unmerge size 4300 // constraints. We'll have some extra fully undef parts. 
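  // E.g. a <3 x s16> operation processed as <2 x s16> pieces is widened to
  // <4 x s16>: two real pieces are emitted here and one undef piece is
  // appended below.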
4301 const int NumRealParts = (DstSize + NarrowSize - 1) / NarrowSize; 4302 4303 for (int I = 0; I != NumRealParts; ++I) { 4304 // Emit this instruction on each of the split pieces. 4305 for (int J = 0; J != NumSrcOps; ++J) 4306 InputRegs[J] = ExtractedRegs[J][I]; 4307 4308 MachineInstrBuilder Inst; 4309 if (NumDefOps == 1) 4310 Inst = MIRBuilder.buildInstr(Opc, {NarrowDstTy}, InputRegs, Flags); 4311 else 4312 Inst = MIRBuilder.buildInstr(Opc, {NarrowDstTy, NarrowDstTy}, InputRegs, 4313 Flags); 4314 4315 for (int J = 0; J != NumDefOps; ++J) 4316 ResultRegs[J].push_back(Inst.getReg(J)); 4317 } 4318 4319 // Fill out the widened result with undef instead of creating instructions 4320 // with undef inputs. 4321 int NumUndefParts = NumParts - NumRealParts; 4322 if (NumUndefParts != 0) { 4323 Register Undef = MIRBuilder.buildUndef(NarrowDstTy).getReg(0); 4324 for (int I = 0; I != NumDefOps; ++I) 4325 ResultRegs[I].append(NumUndefParts, Undef); 4326 } 4327 4328 // Extract the possibly padded result. Use a scratch register if we need to do 4329 // a final bitcast, otherwise use the original result register. 4330 Register MergeDstReg; 4331 for (int I = 0; I != NumDefOps; ++I) { 4332 if (IsNarrow && DstTy.isVector()) 4333 MergeDstReg = MRI.createGenericVirtualRegister(DstScalarTy); 4334 else 4335 MergeDstReg = DstRegs[I]; 4336 4337 buildWidenedRemergeToDst(MergeDstReg, DstLCMTy, ResultRegs[I]); 4338 4339 // Recast to vector if we narrowed a vector 4340 if (IsNarrow && DstTy.isVector()) 4341 MIRBuilder.buildBitcast(DstRegs[I], MergeDstReg); 4342 } 4343 4344 MI.eraseFromParent(); 4345 return Legalized; 4346 } 4347 4348 LegalizerHelper::LegalizeResult 4349 LegalizerHelper::fewerElementsVectorSextInReg(MachineInstr &MI, unsigned TypeIdx, 4350 LLT NarrowTy) { 4351 Register DstReg = MI.getOperand(0).getReg(); 4352 Register SrcReg = MI.getOperand(1).getReg(); 4353 int64_t Imm = MI.getOperand(2).getImm(); 4354 4355 LLT DstTy = MRI.getType(DstReg); 4356 4357 SmallVector<Register, 8> Parts; 4358 LLT GCDTy = extractGCDType(Parts, DstTy, NarrowTy, SrcReg); 4359 LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts); 4360 4361 for (Register &R : Parts) 4362 R = MIRBuilder.buildSExtInReg(NarrowTy, R, Imm).getReg(0); 4363 4364 buildWidenedRemergeToDst(DstReg, LCMTy, Parts); 4365 4366 MI.eraseFromParent(); 4367 return Legalized; 4368 } 4369 4370 LegalizerHelper::LegalizeResult 4371 LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, 4372 LLT NarrowTy) { 4373 using namespace TargetOpcode; 4374 4375 switch (MI.getOpcode()) { 4376 case G_IMPLICIT_DEF: 4377 return fewerElementsVectorImplicitDef(MI, TypeIdx, NarrowTy); 4378 case G_TRUNC: 4379 case G_AND: 4380 case G_OR: 4381 case G_XOR: 4382 case G_ADD: 4383 case G_SUB: 4384 case G_MUL: 4385 case G_PTR_ADD: 4386 case G_SMULH: 4387 case G_UMULH: 4388 case G_FADD: 4389 case G_FMUL: 4390 case G_FSUB: 4391 case G_FNEG: 4392 case G_FABS: 4393 case G_FCANONICALIZE: 4394 case G_FDIV: 4395 case G_FREM: 4396 case G_FMA: 4397 case G_FMAD: 4398 case G_FPOW: 4399 case G_FEXP: 4400 case G_FEXP2: 4401 case G_FLOG: 4402 case G_FLOG2: 4403 case G_FLOG10: 4404 case G_FNEARBYINT: 4405 case G_FCEIL: 4406 case G_FFLOOR: 4407 case G_FRINT: 4408 case G_INTRINSIC_ROUND: 4409 case G_INTRINSIC_ROUNDEVEN: 4410 case G_INTRINSIC_TRUNC: 4411 case G_FCOS: 4412 case G_FSIN: 4413 case G_FSQRT: 4414 case G_BSWAP: 4415 case G_BITREVERSE: 4416 case G_SDIV: 4417 case G_UDIV: 4418 case G_SREM: 4419 case G_UREM: 4420 case G_SDIVREM: 4421 case G_UDIVREM: 4422 case G_SMIN: 4423 
  case G_SMAX:
  case G_UMIN:
  case G_UMAX:
  case G_ABS:
  case G_FMINNUM:
  case G_FMAXNUM:
  case G_FMINNUM_IEEE:
  case G_FMAXNUM_IEEE:
  case G_FMINIMUM:
  case G_FMAXIMUM:
  case G_FSHL:
  case G_FSHR:
  case G_ROTL:
  case G_ROTR:
  case G_FREEZE:
  case G_SADDSAT:
  case G_SSUBSAT:
  case G_UADDSAT:
  case G_USUBSAT:
    return reduceOperationWidth(MI, TypeIdx, NarrowTy);
  case G_UMULO:
  case G_SMULO:
    return fewerElementsVectorMulo(MI, TypeIdx, NarrowTy);
  case G_SHL:
  case G_LSHR:
  case G_ASHR:
  case G_SSHLSAT:
  case G_USHLSAT:
  case G_CTLZ:
  case G_CTLZ_ZERO_UNDEF:
  case G_CTTZ:
  case G_CTTZ_ZERO_UNDEF:
  case G_CTPOP:
  case G_FCOPYSIGN:
    return fewerElementsVectorMultiEltType(MI, TypeIdx, NarrowTy);
  case G_ZEXT:
  case G_SEXT:
  case G_ANYEXT:
  case G_FPEXT:
  case G_FPTRUNC:
  case G_SITOFP:
  case G_UITOFP:
  case G_FPTOSI:
  case G_FPTOUI:
  case G_INTTOPTR:
  case G_PTRTOINT:
  case G_ADDRSPACE_CAST:
    return fewerElementsVectorCasts(MI, TypeIdx, NarrowTy);
  case G_ICMP:
  case G_FCMP:
    return fewerElementsVectorCmp(MI, TypeIdx, NarrowTy);
  case G_SELECT:
    return fewerElementsVectorSelect(MI, TypeIdx, NarrowTy);
  case G_PHI:
    return fewerElementsVectorPhi(MI, TypeIdx, NarrowTy);
  case G_UNMERGE_VALUES:
    return fewerElementsVectorUnmergeValues(MI, TypeIdx, NarrowTy);
  case G_BUILD_VECTOR:
    assert(TypeIdx == 0 && "not a vector type index");
    return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
  case G_CONCAT_VECTORS:
    if (TypeIdx != 1) // TODO: This probably does work as expected already.
      return UnableToLegalize;
    return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
  case G_EXTRACT_VECTOR_ELT:
  case G_INSERT_VECTOR_ELT:
    return fewerElementsVectorExtractInsertVectorElt(MI, TypeIdx, NarrowTy);
  case G_LOAD:
  case G_STORE:
    return reduceLoadStoreWidth(cast<GLoadStore>(MI), TypeIdx, NarrowTy);
  case G_SEXT_INREG:
    return fewerElementsVectorSextInReg(MI, TypeIdx, NarrowTy);
  GISEL_VECREDUCE_CASES_NONSEQ
    return fewerElementsVectorReductions(MI, TypeIdx, NarrowTy);
  case G_SHUFFLE_VECTOR:
    return fewerElementsVectorShuffle(MI, TypeIdx, NarrowTy);
  default:
    return UnableToLegalize;
  }
}

LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorShuffle(
    MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
  assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
  if (TypeIdx != 0)
    return UnableToLegalize;

  Register DstReg = MI.getOperand(0).getReg();
  Register Src1Reg = MI.getOperand(1).getReg();
  Register Src2Reg = MI.getOperand(2).getReg();
  ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
  LLT DstTy = MRI.getType(DstReg);
  LLT Src1Ty = MRI.getType(Src1Reg);
  LLT Src2Ty = MRI.getType(Src2Reg);
  // The shuffle should be canonicalized by now.
  if (DstTy != Src1Ty)
    return UnableToLegalize;
  if (DstTy != Src2Ty)
    return UnableToLegalize;

  if (!isPowerOf2_32(DstTy.getNumElements()))
    return UnableToLegalize;

  // We only support splitting a shuffle into 2, so adjust NarrowTy
  // accordingly. Further legalization attempts will be needed to split it
  // further.
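  // E.g. a <8 x s16> shuffle is split into two <4 x s16> halves here,
  // whatever NarrowTy was requested; each half may be legalized again later.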
4528 NarrowTy = 4529 DstTy.changeElementCount(DstTy.getElementCount().divideCoefficientBy(2)); 4530 unsigned NewElts = NarrowTy.getNumElements(); 4531 4532 SmallVector<Register> SplitSrc1Regs, SplitSrc2Regs; 4533 extractParts(Src1Reg, NarrowTy, 2, SplitSrc1Regs); 4534 extractParts(Src2Reg, NarrowTy, 2, SplitSrc2Regs); 4535 Register Inputs[4] = {SplitSrc1Regs[0], SplitSrc1Regs[1], SplitSrc2Regs[0], 4536 SplitSrc2Regs[1]}; 4537 4538 Register Hi, Lo; 4539 4540 // If Lo or Hi uses elements from at most two of the four input vectors, then 4541 // express it as a vector shuffle of those two inputs. Otherwise extract the 4542 // input elements by hand and construct the Lo/Hi output using a BUILD_VECTOR. 4543 SmallVector<int, 16> Ops; 4544 for (unsigned High = 0; High < 2; ++High) { 4545 Register &Output = High ? Hi : Lo; 4546 4547 // Build a shuffle mask for the output, discovering on the fly which 4548 // input vectors to use as shuffle operands (recorded in InputUsed). 4549 // If building a suitable shuffle vector proves too hard, then bail 4550 // out with useBuildVector set. 4551 unsigned InputUsed[2] = {-1U, -1U}; // Not yet discovered. 4552 unsigned FirstMaskIdx = High * NewElts; 4553 bool UseBuildVector = false; 4554 for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) { 4555 // The mask element. This indexes into the input. 4556 int Idx = Mask[FirstMaskIdx + MaskOffset]; 4557 4558 // The input vector this mask element indexes into. 4559 unsigned Input = (unsigned)Idx / NewElts; 4560 4561 if (Input >= array_lengthof(Inputs)) { 4562 // The mask element does not index into any input vector. 4563 Ops.push_back(-1); 4564 continue; 4565 } 4566 4567 // Turn the index into an offset from the start of the input vector. 4568 Idx -= Input * NewElts; 4569 4570 // Find or create a shuffle vector operand to hold this input. 4571 unsigned OpNo; 4572 for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) { 4573 if (InputUsed[OpNo] == Input) { 4574 // This input vector is already an operand. 4575 break; 4576 } else if (InputUsed[OpNo] == -1U) { 4577 // Create a new operand for this input vector. 4578 InputUsed[OpNo] = Input; 4579 break; 4580 } 4581 } 4582 4583 if (OpNo >= array_lengthof(InputUsed)) { 4584 // More than two input vectors used! Give up on trying to create a 4585 // shuffle vector. Insert all elements into a BUILD_VECTOR instead. 4586 UseBuildVector = true; 4587 break; 4588 } 4589 4590 // Add the mask index for the new shuffle vector. 4591 Ops.push_back(Idx + OpNo * NewElts); 4592 } 4593 4594 if (UseBuildVector) { 4595 LLT EltTy = NarrowTy.getElementType(); 4596 SmallVector<Register, 16> SVOps; 4597 4598 // Extract the input elements by hand. 4599 for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) { 4600 // The mask element. This indexes into the input. 4601 int Idx = Mask[FirstMaskIdx + MaskOffset]; 4602 4603 // The input vector this mask element indexes into. 4604 unsigned Input = (unsigned)Idx / NewElts; 4605 4606 if (Input >= array_lengthof(Inputs)) { 4607 // The mask element is "undef" or indexes off the end of the input. 4608 SVOps.push_back(MIRBuilder.buildUndef(EltTy).getReg(0)); 4609 continue; 4610 } 4611 4612 // Turn the index into an offset from the start of the input vector. 4613 Idx -= Input * NewElts; 4614 4615 // Extract the vector element by hand. 
4616 SVOps.push_back(MIRBuilder 4617 .buildExtractVectorElement( 4618 EltTy, Inputs[Input], 4619 MIRBuilder.buildConstant(LLT::scalar(32), Idx)) 4620 .getReg(0)); 4621 } 4622 4623 // Construct the Lo/Hi output using a G_BUILD_VECTOR. 4624 Output = MIRBuilder.buildBuildVector(NarrowTy, SVOps).getReg(0); 4625 } else if (InputUsed[0] == -1U) { 4626 // No input vectors were used! The result is undefined. 4627 Output = MIRBuilder.buildUndef(NarrowTy).getReg(0); 4628 } else { 4629 Register Op0 = Inputs[InputUsed[0]]; 4630 // If only one input was used, use an undefined vector for the other. 4631 Register Op1 = InputUsed[1] == -1U 4632 ? MIRBuilder.buildUndef(NarrowTy).getReg(0) 4633 : Inputs[InputUsed[1]]; 4634 // At least one input vector was used. Create a new shuffle vector. 4635 Output = MIRBuilder.buildShuffleVector(NarrowTy, Op0, Op1, Ops).getReg(0); 4636 } 4637 4638 Ops.clear(); 4639 } 4640 4641 MIRBuilder.buildConcatVectors(DstReg, {Lo, Hi}); 4642 MI.eraseFromParent(); 4643 return Legalized; 4644 } 4645 4646 static unsigned getScalarOpcForReduction(unsigned Opc) { 4647 unsigned ScalarOpc; 4648 switch (Opc) { 4649 case TargetOpcode::G_VECREDUCE_FADD: 4650 ScalarOpc = TargetOpcode::G_FADD; 4651 break; 4652 case TargetOpcode::G_VECREDUCE_FMUL: 4653 ScalarOpc = TargetOpcode::G_FMUL; 4654 break; 4655 case TargetOpcode::G_VECREDUCE_FMAX: 4656 ScalarOpc = TargetOpcode::G_FMAXNUM; 4657 break; 4658 case TargetOpcode::G_VECREDUCE_FMIN: 4659 ScalarOpc = TargetOpcode::G_FMINNUM; 4660 break; 4661 case TargetOpcode::G_VECREDUCE_ADD: 4662 ScalarOpc = TargetOpcode::G_ADD; 4663 break; 4664 case TargetOpcode::G_VECREDUCE_MUL: 4665 ScalarOpc = TargetOpcode::G_MUL; 4666 break; 4667 case TargetOpcode::G_VECREDUCE_AND: 4668 ScalarOpc = TargetOpcode::G_AND; 4669 break; 4670 case TargetOpcode::G_VECREDUCE_OR: 4671 ScalarOpc = TargetOpcode::G_OR; 4672 break; 4673 case TargetOpcode::G_VECREDUCE_XOR: 4674 ScalarOpc = TargetOpcode::G_XOR; 4675 break; 4676 case TargetOpcode::G_VECREDUCE_SMAX: 4677 ScalarOpc = TargetOpcode::G_SMAX; 4678 break; 4679 case TargetOpcode::G_VECREDUCE_SMIN: 4680 ScalarOpc = TargetOpcode::G_SMIN; 4681 break; 4682 case TargetOpcode::G_VECREDUCE_UMAX: 4683 ScalarOpc = TargetOpcode::G_UMAX; 4684 break; 4685 case TargetOpcode::G_VECREDUCE_UMIN: 4686 ScalarOpc = TargetOpcode::G_UMIN; 4687 break; 4688 default: 4689 llvm_unreachable("Unhandled reduction"); 4690 } 4691 return ScalarOpc; 4692 } 4693 4694 LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorReductions( 4695 MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) { 4696 unsigned Opc = MI.getOpcode(); 4697 assert(Opc != TargetOpcode::G_VECREDUCE_SEQ_FADD && 4698 Opc != TargetOpcode::G_VECREDUCE_SEQ_FMUL && 4699 "Sequential reductions not expected"); 4700 4701 if (TypeIdx != 1) 4702 return UnableToLegalize; 4703 4704 // The semantics of the normal non-sequential reductions allow us to freely 4705 // re-associate the operation. 4706 Register SrcReg = MI.getOperand(1).getReg(); 4707 LLT SrcTy = MRI.getType(SrcReg); 4708 Register DstReg = MI.getOperand(0).getReg(); 4709 LLT DstTy = MRI.getType(DstReg); 4710 4711 if (NarrowTy.isVector() && 4712 (SrcTy.getNumElements() % NarrowTy.getNumElements() != 0)) 4713 return UnableToLegalize; 4714 4715 unsigned ScalarOpc = getScalarOpcForReduction(Opc); 4716 SmallVector<Register> SplitSrcs; 4717 // If NarrowTy is a scalar then we're being asked to scalarize. 4718 const unsigned NumParts = 4719 NarrowTy.isVector() ? 
SrcTy.getNumElements() / NarrowTy.getNumElements() 4720 : SrcTy.getNumElements(); 4721 4722 extractParts(SrcReg, NarrowTy, NumParts, SplitSrcs); 4723 if (NarrowTy.isScalar()) { 4724 if (DstTy != NarrowTy) 4725 return UnableToLegalize; // FIXME: handle implicit extensions. 4726 4727 if (isPowerOf2_32(NumParts)) { 4728 // Generate a tree of scalar operations to reduce the critical path. 4729 SmallVector<Register> PartialResults; 4730 unsigned NumPartsLeft = NumParts; 4731 while (NumPartsLeft > 1) { 4732 for (unsigned Idx = 0; Idx < NumPartsLeft - 1; Idx += 2) { 4733 PartialResults.emplace_back( 4734 MIRBuilder 4735 .buildInstr(ScalarOpc, {NarrowTy}, 4736 {SplitSrcs[Idx], SplitSrcs[Idx + 1]}) 4737 .getReg(0)); 4738 } 4739 SplitSrcs = PartialResults; 4740 PartialResults.clear(); 4741 NumPartsLeft = SplitSrcs.size(); 4742 } 4743 assert(SplitSrcs.size() == 1); 4744 MIRBuilder.buildCopy(DstReg, SplitSrcs[0]); 4745 MI.eraseFromParent(); 4746 return Legalized; 4747 } 4748 // If we can't generate a tree, then just do sequential operations. 4749 Register Acc = SplitSrcs[0]; 4750 for (unsigned Idx = 1; Idx < NumParts; ++Idx) 4751 Acc = MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {Acc, SplitSrcs[Idx]}) 4752 .getReg(0); 4753 MIRBuilder.buildCopy(DstReg, Acc); 4754 MI.eraseFromParent(); 4755 return Legalized; 4756 } 4757 SmallVector<Register> PartialReductions; 4758 for (unsigned Part = 0; Part < NumParts; ++Part) { 4759 PartialReductions.push_back( 4760 MIRBuilder.buildInstr(Opc, {DstTy}, {SplitSrcs[Part]}).getReg(0)); 4761 } 4762 4763 4764 // If the types involved are powers of 2, we can generate intermediate vector 4765 // ops, before generating a final reduction operation. 4766 if (isPowerOf2_32(SrcTy.getNumElements()) && 4767 isPowerOf2_32(NarrowTy.getNumElements())) { 4768 return tryNarrowPow2Reduction(MI, SrcReg, SrcTy, NarrowTy, ScalarOpc); 4769 } 4770 4771 Register Acc = PartialReductions[0]; 4772 for (unsigned Part = 1; Part < NumParts; ++Part) { 4773 if (Part == NumParts - 1) { 4774 MIRBuilder.buildInstr(ScalarOpc, {DstReg}, 4775 {Acc, PartialReductions[Part]}); 4776 } else { 4777 Acc = MIRBuilder 4778 .buildInstr(ScalarOpc, {DstTy}, {Acc, PartialReductions[Part]}) 4779 .getReg(0); 4780 } 4781 } 4782 MI.eraseFromParent(); 4783 return Legalized; 4784 } 4785 4786 LegalizerHelper::LegalizeResult 4787 LegalizerHelper::tryNarrowPow2Reduction(MachineInstr &MI, Register SrcReg, 4788 LLT SrcTy, LLT NarrowTy, 4789 unsigned ScalarOpc) { 4790 SmallVector<Register> SplitSrcs; 4791 // Split the sources into NarrowTy size pieces. 4792 extractParts(SrcReg, NarrowTy, 4793 SrcTy.getNumElements() / NarrowTy.getNumElements(), SplitSrcs); 4794 // We're going to do a tree reduction using vector operations until we have 4795 // one NarrowTy size value left. 4796 while (SplitSrcs.size() > 1) { 4797 SmallVector<Register> PartialRdxs; 4798 for (unsigned Idx = 0; Idx < SplitSrcs.size()-1; Idx += 2) { 4799 Register LHS = SplitSrcs[Idx]; 4800 Register RHS = SplitSrcs[Idx + 1]; 4801 // Create the intermediate vector op. 4802 Register Res = 4803 MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {LHS, RHS}).getReg(0); 4804 PartialRdxs.push_back(Res); 4805 } 4806 SplitSrcs = std::move(PartialRdxs); 4807 } 4808 // Finally generate the requested NarrowTy based reduction. 
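  // E.g. for a <8 x s32> G_VECREDUCE_ADD with NarrowTy <2 x s32>, the loop
  // above has combined the four <2 x s32> pieces into one with vector G_ADDs;
  // the original reduction opcode is then rewritten to run on that single
  // piece.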
4809 Observer.changingInstr(MI); 4810 MI.getOperand(1).setReg(SplitSrcs[0]); 4811 Observer.changedInstr(MI); 4812 return Legalized; 4813 } 4814 4815 LegalizerHelper::LegalizeResult 4816 LegalizerHelper::narrowScalarShiftByConstant(MachineInstr &MI, const APInt &Amt, 4817 const LLT HalfTy, const LLT AmtTy) { 4818 4819 Register InL = MRI.createGenericVirtualRegister(HalfTy); 4820 Register InH = MRI.createGenericVirtualRegister(HalfTy); 4821 MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1)); 4822 4823 if (Amt.isZero()) { 4824 MIRBuilder.buildMerge(MI.getOperand(0), {InL, InH}); 4825 MI.eraseFromParent(); 4826 return Legalized; 4827 } 4828 4829 LLT NVT = HalfTy; 4830 unsigned NVTBits = HalfTy.getSizeInBits(); 4831 unsigned VTBits = 2 * NVTBits; 4832 4833 SrcOp Lo(Register(0)), Hi(Register(0)); 4834 if (MI.getOpcode() == TargetOpcode::G_SHL) { 4835 if (Amt.ugt(VTBits)) { 4836 Lo = Hi = MIRBuilder.buildConstant(NVT, 0); 4837 } else if (Amt.ugt(NVTBits)) { 4838 Lo = MIRBuilder.buildConstant(NVT, 0); 4839 Hi = MIRBuilder.buildShl(NVT, InL, 4840 MIRBuilder.buildConstant(AmtTy, Amt - NVTBits)); 4841 } else if (Amt == NVTBits) { 4842 Lo = MIRBuilder.buildConstant(NVT, 0); 4843 Hi = InL; 4844 } else { 4845 Lo = MIRBuilder.buildShl(NVT, InL, MIRBuilder.buildConstant(AmtTy, Amt)); 4846 auto OrLHS = 4847 MIRBuilder.buildShl(NVT, InH, MIRBuilder.buildConstant(AmtTy, Amt)); 4848 auto OrRHS = MIRBuilder.buildLShr( 4849 NVT, InL, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits)); 4850 Hi = MIRBuilder.buildOr(NVT, OrLHS, OrRHS); 4851 } 4852 } else if (MI.getOpcode() == TargetOpcode::G_LSHR) { 4853 if (Amt.ugt(VTBits)) { 4854 Lo = Hi = MIRBuilder.buildConstant(NVT, 0); 4855 } else if (Amt.ugt(NVTBits)) { 4856 Lo = MIRBuilder.buildLShr(NVT, InH, 4857 MIRBuilder.buildConstant(AmtTy, Amt - NVTBits)); 4858 Hi = MIRBuilder.buildConstant(NVT, 0); 4859 } else if (Amt == NVTBits) { 4860 Lo = InH; 4861 Hi = MIRBuilder.buildConstant(NVT, 0); 4862 } else { 4863 auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt); 4864 4865 auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst); 4866 auto OrRHS = MIRBuilder.buildShl( 4867 NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits)); 4868 4869 Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS); 4870 Hi = MIRBuilder.buildLShr(NVT, InH, ShiftAmtConst); 4871 } 4872 } else { 4873 if (Amt.ugt(VTBits)) { 4874 Hi = Lo = MIRBuilder.buildAShr( 4875 NVT, InH, MIRBuilder.buildConstant(AmtTy, NVTBits - 1)); 4876 } else if (Amt.ugt(NVTBits)) { 4877 Lo = MIRBuilder.buildAShr(NVT, InH, 4878 MIRBuilder.buildConstant(AmtTy, Amt - NVTBits)); 4879 Hi = MIRBuilder.buildAShr(NVT, InH, 4880 MIRBuilder.buildConstant(AmtTy, NVTBits - 1)); 4881 } else if (Amt == NVTBits) { 4882 Lo = InH; 4883 Hi = MIRBuilder.buildAShr(NVT, InH, 4884 MIRBuilder.buildConstant(AmtTy, NVTBits - 1)); 4885 } else { 4886 auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt); 4887 4888 auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst); 4889 auto OrRHS = MIRBuilder.buildShl( 4890 NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits)); 4891 4892 Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS); 4893 Hi = MIRBuilder.buildAShr(NVT, InH, ShiftAmtConst); 4894 } 4895 } 4896 4897 MIRBuilder.buildMerge(MI.getOperand(0), {Lo, Hi}); 4898 MI.eraseFromParent(); 4899 4900 return Legalized; 4901 } 4902 4903 // TODO: Optimize if constant shift amount. 
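// A sketch of the general split that follows, for an s64 G_SHL in s32 halves
// by an unknown amount Amt (pseudocode; names match the code below):
//   IsShort = Amt < 32;  IsZero = Amt == 0
//   Lo = IsShort ? InL << Amt : 0
//   Hi = IsZero ? InH
//               : IsShort ? (InH << Amt) | (InL >> (32 - Amt))
//                         : InL << (Amt - 32)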
// TODO: Optimize if constant shift amount.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx,
                                   LLT RequestedTy) {
  if (TypeIdx == 1) {
    Observer.changingInstr(MI);
    narrowScalarSrc(MI, RequestedTy, 2);
    Observer.changedInstr(MI);
    return Legalized;
  }

  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  if (DstTy.isVector())
    return UnableToLegalize;

  Register Amt = MI.getOperand(2).getReg();
  LLT ShiftAmtTy = MRI.getType(Amt);
  const unsigned DstEltSize = DstTy.getScalarSizeInBits();
  if (DstEltSize % 2 != 0)
    return UnableToLegalize;

  // Ignore the input type. We can only go to exactly half the size of the
  // input. If that isn't small enough, the resulting pieces will be further
  // legalized.
  const unsigned NewBitSize = DstEltSize / 2;
  const LLT HalfTy = LLT::scalar(NewBitSize);
  const LLT CondTy = LLT::scalar(1);

  if (auto VRegAndVal = getIConstantVRegValWithLookThrough(Amt, MRI)) {
    return narrowScalarShiftByConstant(MI, VRegAndVal->Value, HalfTy,
                                       ShiftAmtTy);
  }

  // TODO: Expand with known bits.

  // Handle the fully general expansion by an unknown amount.
  auto NewBits = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize);

  Register InL = MRI.createGenericVirtualRegister(HalfTy);
  Register InH = MRI.createGenericVirtualRegister(HalfTy);
  MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1));

  auto AmtExcess = MIRBuilder.buildSub(ShiftAmtTy, Amt, NewBits);
  auto AmtLack = MIRBuilder.buildSub(ShiftAmtTy, NewBits, Amt);

  auto Zero = MIRBuilder.buildConstant(ShiftAmtTy, 0);
  auto IsShort = MIRBuilder.buildICmp(ICmpInst::ICMP_ULT, CondTy, Amt, NewBits);
  auto IsZero = MIRBuilder.buildICmp(ICmpInst::ICMP_EQ, CondTy, Amt, Zero);

  Register ResultRegs[2];
  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL: {
    // Short: ShAmt < NewBitSize
    auto LoS = MIRBuilder.buildShl(HalfTy, InL, Amt);

    auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, AmtLack);
    auto HiOr = MIRBuilder.buildShl(HalfTy, InH, Amt);
    auto HiS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);

    // Long: ShAmt >= NewBitSize
    auto LoL = MIRBuilder.buildConstant(HalfTy, 0);         // Lo part is zero.
    auto HiL = MIRBuilder.buildShl(HalfTy, InL, AmtExcess); // Hi from Lo part.

    auto Lo = MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL);
    auto Hi = MIRBuilder.buildSelect(
        HalfTy, IsZero, InH, MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL));

    ResultRegs[0] = Lo.getReg(0);
    ResultRegs[1] = Hi.getReg(0);
    break;
  }
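  // E.g. for an s64 G_SHL narrowed to s32 halves (illustrative widths):
  // Amt < 32 gives { InL << Amt, (InH << Amt) | (InL >> (32 - Amt)) }, while
  // Amt >= 32 gives { 0, InL << (Amt - 32) }. The selects above pick between
  // the two forms, passing InH through unchanged when Amt == 0 (where the
  // InL >> (32 - Amt) term would otherwise shift by the full bit width).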
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR: {
    // Short: ShAmt < NewBitSize
    auto HiS = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy}, {InH, Amt});

    auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, Amt);
    auto HiOr = MIRBuilder.buildShl(HalfTy, InH, AmtLack);
    auto LoS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);

    // Long: ShAmt >= NewBitSize
    MachineInstrBuilder HiL;
    if (MI.getOpcode() == TargetOpcode::G_LSHR) {
      HiL = MIRBuilder.buildConstant(HalfTy, 0);         // Hi part is zero.
    } else {
      auto ShiftAmt = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize - 1);
      HiL = MIRBuilder.buildAShr(HalfTy, InH, ShiftAmt); // Sign of Hi part.
    }
    auto LoL = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy},
                                     {InH, AmtExcess});  // Lo from Hi part.

    auto Lo = MIRBuilder.buildSelect(
        HalfTy, IsZero, InL, MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL));

    auto Hi = MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL);

    ResultRegs[0] = Lo.getReg(0);
    ResultRegs[1] = Hi.getReg(0);
    break;
  }
  default:
    llvm_unreachable("not a shift");
  }

  MIRBuilder.buildMerge(DstReg, ResultRegs);
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::moreElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx,
                                       LLT MoreTy) {
  assert(TypeIdx == 0 && "Expecting only Idx 0");

  Observer.changingInstr(MI);
  for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
    MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
    MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
    moreElementsVectorSrc(MI, MoreTy, I);
  }

  MachineBasicBlock &MBB = *MI.getParent();
  MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
  moreElementsVectorDst(MI, MoreTy, 0);
  Observer.changedInstr(MI);
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
                                    LLT MoreTy) {
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_IMPLICIT_DEF:
  case TargetOpcode::G_LOAD: {
    if (TypeIdx != 0)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_STORE:
    if (TypeIdx != 0)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR:
  case TargetOpcode::G_SMIN:
  case TargetOpcode::G_SMAX:
  case TargetOpcode::G_UMIN:
  case TargetOpcode::G_UMAX:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
  case TargetOpcode::G_FMINIMUM:
  case TargetOpcode::G_FMAXIMUM: {
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 1);
    moreElementsVectorSrc(MI, MoreTy, 2);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_EXTRACT:
    if (TypeIdx != 1)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 1);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_INSERT:
  case TargetOpcode::G_FREEZE:
    if (TypeIdx != 0)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 1);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_SELECT:
    if (TypeIdx != 0)
      return UnableToLegalize;
    if (MRI.getType(MI.getOperand(1).getReg()).isVector())
      return UnableToLegalize;

    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 2);
    moreElementsVectorSrc(MI, MoreTy, 3);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_UNMERGE_VALUES: {
    if (TypeIdx != 1)
      return UnableToLegalize;

    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
    int NumDst = MI.getNumOperands() - 1;
    moreElementsVectorSrc(MI, MoreTy, NumDst);

    auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);
    for (int I = 0; I != NumDst; ++I)
      MIB.addDef(MI.getOperand(I).getReg());

    int NewNumDst = MoreTy.getSizeInBits() / DstTy.getSizeInBits();
    for (int I = NumDst; I != NewNumDst; ++I)
      MIB.addDef(MRI.createGenericVirtualRegister(DstTy));

    MIB.addUse(MI.getOperand(NumDst).getReg());
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_PHI:
    return moreElementsVectorPhi(MI, TypeIdx, MoreTy);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return moreElementsVectorShuffle(MI, TypeIdx, MoreTy);
  default:
    return UnableToLegalize;
  }
}

LegalizerHelper::LegalizeResult
LegalizerHelper::moreElementsVectorShuffle(MachineInstr &MI,
                                           unsigned int TypeIdx, LLT MoreTy) {
  if (TypeIdx != 0)
    return UnableToLegalize;

  Register DstReg = MI.getOperand(0).getReg();
  Register Src1Reg = MI.getOperand(1).getReg();
  Register Src2Reg = MI.getOperand(2).getReg();
  ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
  LLT DstTy = MRI.getType(DstReg);
  LLT Src1Ty = MRI.getType(Src1Reg);
  LLT Src2Ty = MRI.getType(Src2Reg);
  unsigned NumElts = DstTy.getNumElements();
  unsigned WidenNumElts = MoreTy.getNumElements();

  // Expect a canonicalized shuffle.
  if (DstTy != Src1Ty || DstTy != Src2Ty)
    return UnableToLegalize;

  moreElementsVectorSrc(MI, MoreTy, 1);
  moreElementsVectorSrc(MI, MoreTy, 2);

  // Adjust mask based on new input vector length.
  SmallVector<int, 16> NewMask;
  for (unsigned I = 0; I != NumElts; ++I) {
    int Idx = Mask[I];
    if (Idx < static_cast<int>(NumElts))
      NewMask.push_back(Idx);
    else
      NewMask.push_back(Idx - NumElts + WidenNumElts);
  }
  for (unsigned I = NumElts; I != WidenNumElts; ++I)
    NewMask.push_back(-1);
  moreElementsVectorDst(MI, MoreTy, 0);
  MIRBuilder.setInstrAndDebugLoc(MI);
  MIRBuilder.buildShuffleVector(MI.getOperand(0).getReg(),
                                MI.getOperand(1).getReg(),
                                MI.getOperand(2).getReg(), NewMask);
  MI.eraseFromParent();
  return Legalized;
}
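// multiplyRegisters below performs schoolbook long multiplication on
// NarrowTy-sized digits. E.g. for two 2-part operands (a1:a0) * (b1:b0) it
// emits, illustratively:
//   Dst0 = mul(a0, b0)
//   Dst1 = mul(a1, b0) + mul(a0, b1) + umulh(a0, b0)
// where G_UMULH supplies the high half of each partial product and
// G_UADDO/G_ZEXT/G_ADD accumulate the carries between digit positions.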
void LegalizerHelper::multiplyRegisters(SmallVectorImpl<Register> &DstRegs,
                                        ArrayRef<Register> Src1Regs,
                                        ArrayRef<Register> Src2Regs,
                                        LLT NarrowTy) {
  MachineIRBuilder &B = MIRBuilder;
  unsigned SrcParts = Src1Regs.size();
  unsigned DstParts = DstRegs.size();

  unsigned DstIdx = 0; // Low bits of the result.
  Register FactorSum =
      B.buildMul(NarrowTy, Src1Regs[DstIdx], Src2Regs[DstIdx]).getReg(0);
  DstRegs[DstIdx] = FactorSum;

  unsigned CarrySumPrevDstIdx;
  SmallVector<Register, 4> Factors;

  for (DstIdx = 1; DstIdx < DstParts; DstIdx++) {
    // Collect low parts of muls for DstIdx.
    for (unsigned i = DstIdx + 1 < SrcParts ? 0 : DstIdx - SrcParts + 1;
         i <= std::min(DstIdx, SrcParts - 1); ++i) {
      MachineInstrBuilder Mul =
          B.buildMul(NarrowTy, Src1Regs[DstIdx - i], Src2Regs[i]);
      Factors.push_back(Mul.getReg(0));
    }
    // Collect high parts of muls from previous DstIdx.
    for (unsigned i = DstIdx < SrcParts ? 0 : DstIdx - SrcParts;
         i <= std::min(DstIdx - 1, SrcParts - 1); ++i) {
      MachineInstrBuilder Umulh =
          B.buildUMulH(NarrowTy, Src1Regs[DstIdx - 1 - i], Src2Regs[i]);
      Factors.push_back(Umulh.getReg(0));
    }
    // Add CarrySum from additions calculated for previous DstIdx.
    if (DstIdx != 1) {
      Factors.push_back(CarrySumPrevDstIdx);
    }

    Register CarrySum;
    // Add all factors and accumulate all carries into CarrySum.
    if (DstIdx != DstParts - 1) {
      MachineInstrBuilder Uaddo =
          B.buildUAddo(NarrowTy, LLT::scalar(1), Factors[0], Factors[1]);
      FactorSum = Uaddo.getReg(0);
      CarrySum = B.buildZExt(NarrowTy, Uaddo.getReg(1)).getReg(0);
      for (unsigned i = 2; i < Factors.size(); ++i) {
        MachineInstrBuilder Uaddo =
            B.buildUAddo(NarrowTy, LLT::scalar(1), FactorSum, Factors[i]);
        FactorSum = Uaddo.getReg(0);
        MachineInstrBuilder Carry = B.buildZExt(NarrowTy, Uaddo.getReg(1));
        CarrySum = B.buildAdd(NarrowTy, CarrySum, Carry).getReg(0);
      }
    } else {
      // Since value for the next index is not calculated, neither is CarrySum.
      FactorSum = B.buildAdd(NarrowTy, Factors[0], Factors[1]).getReg(0);
      for (unsigned i = 2; i < Factors.size(); ++i)
        FactorSum = B.buildAdd(NarrowTy, FactorSum, Factors[i]).getReg(0);
    }

    CarrySumPrevDstIdx = CarrySum;
    DstRegs[DstIdx] = FactorSum;
    Factors.clear();
  }
}
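// A sketch of what narrowScalarAddSub below produces (assuming an s128 G_ADD
// narrowed to s32): the lowest pair of parts is combined with G_UADDO and the
// remaining three pairs with a chain of G_UADDE, each consuming the previous
// part's carry-out.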
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarAddSub(MachineInstr &MI, unsigned TypeIdx,
                                    LLT NarrowTy) {
  if (TypeIdx != 0)
    return UnableToLegalize;

  Register DstReg = MI.getOperand(0).getReg();
  LLT DstType = MRI.getType(DstReg);
  // FIXME: add support for vector types
  if (DstType.isVector())
    return UnableToLegalize;

  unsigned Opcode = MI.getOpcode();
  unsigned OpO, OpE, OpF;
  switch (Opcode) {
  case TargetOpcode::G_SADDO:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_UADDO:
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_ADD:
    OpO = TargetOpcode::G_UADDO;
    OpE = TargetOpcode::G_UADDE;
    OpF = TargetOpcode::G_UADDE;
    if (Opcode == TargetOpcode::G_SADDO || Opcode == TargetOpcode::G_SADDE)
      OpF = TargetOpcode::G_SADDE;
    break;
  case TargetOpcode::G_SSUBO:
  case TargetOpcode::G_SSUBE:
  case TargetOpcode::G_USUBO:
  case TargetOpcode::G_USUBE:
  case TargetOpcode::G_SUB:
    OpO = TargetOpcode::G_USUBO;
    OpE = TargetOpcode::G_USUBE;
    OpF = TargetOpcode::G_USUBE;
    if (Opcode == TargetOpcode::G_SSUBO || Opcode == TargetOpcode::G_SSUBE)
      OpF = TargetOpcode::G_SSUBE;
    break;
  default:
    llvm_unreachable("Unexpected add/sub opcode!");
  }

  // 1 for a plain add/sub, 2 if this is an operation with a carry-out.
  unsigned NumDefs = MI.getNumExplicitDefs();
  Register Src1 = MI.getOperand(NumDefs).getReg();
  Register Src2 = MI.getOperand(NumDefs + 1).getReg();
  Register CarryDst, CarryIn;
  if (NumDefs == 2)
    CarryDst = MI.getOperand(1).getReg();
  if (MI.getNumOperands() == NumDefs + 3)
    CarryIn = MI.getOperand(NumDefs + 2).getReg();

  LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
  LLT LeftoverTy, DummyTy;
  SmallVector<Register, 2> Src1Regs, Src2Regs, Src1Left, Src2Left, DstRegs;
  extractParts(Src1, RegTy, NarrowTy, LeftoverTy, Src1Regs, Src1Left);
  extractParts(Src2, RegTy, NarrowTy, DummyTy, Src2Regs, Src2Left);

  int NarrowParts = Src1Regs.size();
  for (int I = 0, E = Src1Left.size(); I != E; ++I) {
    Src1Regs.push_back(Src1Left[I]);
    Src2Regs.push_back(Src2Left[I]);
  }
  DstRegs.reserve(Src1Regs.size());

  for (int i = 0, e = Src1Regs.size(); i != e; ++i) {
    Register DstReg =
        MRI.createGenericVirtualRegister(MRI.getType(Src1Regs[i]));
    Register CarryOut = MRI.createGenericVirtualRegister(LLT::scalar(1));
    // Forward the final carry-out to the destination register
    if (i == e - 1 && CarryDst)
      CarryOut = CarryDst;

    if (!CarryIn) {
      MIRBuilder.buildInstr(OpO, {DstReg, CarryOut},
                            {Src1Regs[i], Src2Regs[i]});
    } else if (i == e - 1) {
      MIRBuilder.buildInstr(OpF, {DstReg, CarryOut},
                            {Src1Regs[i], Src2Regs[i], CarryIn});
    } else {
      MIRBuilder.buildInstr(OpE, {DstReg, CarryOut},
                            {Src1Regs[i], Src2Regs[i], CarryIn});
    }

    DstRegs.push_back(DstReg);
    CarryIn = CarryOut;
  }
  insertParts(MI.getOperand(0).getReg(), RegTy, NarrowTy,
              makeArrayRef(DstRegs).take_front(NarrowParts), LeftoverTy,
              makeArrayRef(DstRegs).drop_front(NarrowParts));

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarMul(MachineInstr &MI, LLT NarrowTy) {
  Register DstReg = MI.getOperand(0).getReg();
  Register Src1 = MI.getOperand(1).getReg();
  Register Src2 = MI.getOperand(2).getReg();

  LLT Ty = MRI.getType(DstReg);
  if (Ty.isVector())
    return UnableToLegalize;

  unsigned Size = Ty.getSizeInBits();
  unsigned NarrowSize = NarrowTy.getSizeInBits();
  if (Size % NarrowSize != 0)
    return UnableToLegalize;

  unsigned NumParts = Size / NarrowSize;
  bool IsMulHigh = MI.getOpcode() == TargetOpcode::G_UMULH;
  unsigned DstTmpParts = NumParts * (IsMulHigh ? 2 : 1);

  SmallVector<Register, 2> Src1Parts, Src2Parts;
  SmallVector<Register, 2> DstTmpRegs(DstTmpParts);
  extractParts(Src1, NarrowTy, NumParts, Src1Parts);
  extractParts(Src2, NarrowTy, NumParts, Src2Parts);
  multiplyRegisters(DstTmpRegs, Src1Parts, Src2Parts, NarrowTy);

  // Take only high half of registers if this is high mul.
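  // (E.g. for an s64 G_UMULH narrowed to s32, DstTmpParts is 4 and the two
  // high parts of the full 128-bit product below form the result.)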
  ArrayRef<Register> DstRegs(&DstTmpRegs[DstTmpParts - NumParts], NumParts);
  MIRBuilder.buildMerge(DstReg, DstRegs);
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarFPTOI(MachineInstr &MI, unsigned TypeIdx,
                                   LLT NarrowTy) {
  if (TypeIdx != 0)
    return UnableToLegalize;

  bool IsSigned = MI.getOpcode() == TargetOpcode::G_FPTOSI;

  Register Src = MI.getOperand(1).getReg();
  LLT SrcTy = MRI.getType(Src);

  // If all finite floats fit into the narrowed integer type, we can just swap
  // out the result type. This is practically only useful for conversions from
  // half to at least 16-bits, so just handle the one case.
  if (SrcTy.getScalarType() != LLT::scalar(16) ||
      NarrowTy.getScalarSizeInBits() < (IsSigned ? 17u : 16u))
    return UnableToLegalize;

  Observer.changingInstr(MI);
  narrowScalarDst(MI, NarrowTy, 0,
                  IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT);
  Observer.changedInstr(MI);
  return Legalized;
}
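// (The largest finite half value is 65504, so an unsigned result fits in s16
// and a signed result in s17; the conversion above can therefore be performed
// at the narrow width and then zero- or sign-extended to the original
// destination type.)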
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarExtract(MachineInstr &MI, unsigned TypeIdx,
                                     LLT NarrowTy) {
  if (TypeIdx != 1)
    return UnableToLegalize;

  uint64_t NarrowSize = NarrowTy.getSizeInBits();

  int64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
  // FIXME: add support for when SizeOp1 isn't an exact multiple of
  // NarrowSize.
  if (SizeOp1 % NarrowSize != 0)
    return UnableToLegalize;
  int NumParts = SizeOp1 / NarrowSize;

  SmallVector<Register, 2> SrcRegs, DstRegs;
  SmallVector<uint64_t, 2> Indexes;
  extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs);

  Register OpReg = MI.getOperand(0).getReg();
  uint64_t OpStart = MI.getOperand(2).getImm();
  uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
  for (int i = 0; i < NumParts; ++i) {
    unsigned SrcStart = i * NarrowSize;

    if (SrcStart + NarrowSize <= OpStart || SrcStart >= OpStart + OpSize) {
      // No part of the extract uses this subregister, ignore it.
      continue;
    } else if (SrcStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
      // The entire subregister is extracted, forward the value.
      DstRegs.push_back(SrcRegs[i]);
      continue;
    }

    // OpSegStart is where this destination segment would start in OpReg if it
    // extended infinitely in both directions.
    int64_t ExtractOffset;
    uint64_t SegSize;
    if (OpStart < SrcStart) {
      ExtractOffset = 0;
      SegSize = std::min(NarrowSize, OpStart + OpSize - SrcStart);
    } else {
      ExtractOffset = OpStart - SrcStart;
      SegSize = std::min(SrcStart + NarrowSize - OpStart, OpSize);
    }

    Register SegReg = SrcRegs[i];
    if (ExtractOffset != 0 || SegSize != NarrowSize) {
      // A genuine extract is needed.
      SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
      MIRBuilder.buildExtract(SegReg, SrcRegs[i], ExtractOffset);
    }

    DstRegs.push_back(SegReg);
  }

  Register DstReg = MI.getOperand(0).getReg();
  if (MRI.getType(DstReg).isVector())
    MIRBuilder.buildBuildVector(DstReg, DstRegs);
  else if (DstRegs.size() > 1)
    MIRBuilder.buildMerge(DstReg, DstRegs);
  else
    MIRBuilder.buildCopy(DstReg, DstRegs[0]);
  MI.eraseFromParent();
  return Legalized;
}
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarInsert(MachineInstr &MI, unsigned TypeIdx,
                                    LLT NarrowTy) {
  // FIXME: Don't know how to handle secondary types yet.
  if (TypeIdx != 0)
    return UnableToLegalize;

  SmallVector<Register, 2> SrcRegs, LeftoverRegs, DstRegs;
  SmallVector<uint64_t, 2> Indexes;
  LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
  LLT LeftoverTy;
  extractParts(MI.getOperand(1).getReg(), RegTy, NarrowTy, LeftoverTy, SrcRegs,
               LeftoverRegs);

  for (Register Reg : LeftoverRegs)
    SrcRegs.push_back(Reg);

  uint64_t NarrowSize = NarrowTy.getSizeInBits();
  Register OpReg = MI.getOperand(2).getReg();
  uint64_t OpStart = MI.getOperand(3).getImm();
  uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
  for (int I = 0, E = SrcRegs.size(); I != E; ++I) {
    unsigned DstStart = I * NarrowSize;

    if (DstStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
      // The entire subregister is defined by this insert, forward the new
      // value.
      DstRegs.push_back(OpReg);
      continue;
    }

    Register SrcReg = SrcRegs[I];
    if (MRI.getType(SrcRegs[I]) == LeftoverTy) {
      // The leftover reg is smaller than NarrowTy, so we need to extend it.
      SrcReg = MRI.createGenericVirtualRegister(NarrowTy);
      MIRBuilder.buildAnyExt(SrcReg, SrcRegs[I]);
    }

    if (DstStart + NarrowSize <= OpStart || DstStart >= OpStart + OpSize) {
      // No part of the insert affects this subregister, forward the original.
      DstRegs.push_back(SrcReg);
      continue;
    }

    // OpSegStart is where this destination segment would start in OpReg if it
    // extended infinitely in both directions.
    int64_t ExtractOffset, InsertOffset;
    uint64_t SegSize;
    if (OpStart < DstStart) {
      InsertOffset = 0;
      ExtractOffset = DstStart - OpStart;
      SegSize = std::min(NarrowSize, OpStart + OpSize - DstStart);
    } else {
      InsertOffset = OpStart - DstStart;
      ExtractOffset = 0;
      SegSize =
          std::min(NarrowSize - InsertOffset, OpStart + OpSize - DstStart);
    }

    Register SegReg = OpReg;
    if (ExtractOffset != 0 || SegSize != OpSize) {
      // A genuine extract is needed.
      SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
      MIRBuilder.buildExtract(SegReg, OpReg, ExtractOffset);
    }

    Register DstReg = MRI.createGenericVirtualRegister(NarrowTy);
    MIRBuilder.buildInsert(DstReg, SrcReg, SegReg, InsertOffset);
    DstRegs.push_back(DstReg);
  }

  uint64_t WideSize = DstRegs.size() * NarrowSize;
  Register DstReg = MI.getOperand(0).getReg();
  if (WideSize > RegTy.getSizeInBits()) {
    Register MergeReg = MRI.createGenericVirtualRegister(LLT::scalar(WideSize));
    MIRBuilder.buildMerge(MergeReg, DstRegs);
    MIRBuilder.buildTrunc(DstReg, MergeReg);
  } else
    MIRBuilder.buildMerge(DstReg, DstRegs);

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarBasic(MachineInstr &MI, unsigned TypeIdx,
                                   LLT NarrowTy) {
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);

  assert(MI.getNumOperands() == 3 && TypeIdx == 0);

  SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
  SmallVector<Register, 4> Src0Regs, Src0LeftoverRegs;
  SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
  LLT LeftoverTy;
  if (!extractParts(MI.getOperand(1).getReg(), DstTy, NarrowTy, LeftoverTy,
                    Src0Regs, Src0LeftoverRegs))
    return UnableToLegalize;

  LLT Unused;
  if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, Unused,
                    Src1Regs, Src1LeftoverRegs))
    llvm_unreachable("inconsistent extractParts result");

  for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
    auto Inst = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
                                      {Src0Regs[I], Src1Regs[I]});
    DstRegs.push_back(Inst.getReg(0));
  }

  for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
    auto Inst = MIRBuilder.buildInstr(
        MI.getOpcode(),
        {LeftoverTy}, {Src0LeftoverRegs[I], Src1LeftoverRegs[I]});
    DstLeftoverRegs.push_back(Inst.getReg(0));
  }

  insertParts(DstReg, DstTy, NarrowTy, DstRegs,
              LeftoverTy, DstLeftoverRegs);

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarExt(MachineInstr &MI, unsigned TypeIdx,
                                 LLT NarrowTy) {
  if (TypeIdx != 0)
    return UnableToLegalize;

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(DstReg);
  if (DstTy.isVector())
    return UnableToLegalize;

  SmallVector<Register, 8> Parts;
  LLT GCDTy = extractGCDType(Parts, DstTy, NarrowTy, SrcReg);
  LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts, MI.getOpcode());
  buildWidenedRemergeToDst(DstReg, LCMTy, Parts);

  MI.eraseFromParent();
  return Legalized;
}
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarSelect(MachineInstr &MI, unsigned TypeIdx,
                                    LLT NarrowTy) {
  if (TypeIdx != 0)
    return UnableToLegalize;

  Register CondReg = MI.getOperand(1).getReg();
  LLT CondTy = MRI.getType(CondReg);
  if (CondTy.isVector()) // TODO: Handle vselect
    return UnableToLegalize;

  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);

  SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
  SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
  SmallVector<Register, 4> Src2Regs, Src2LeftoverRegs;

  LLT LeftoverTy;
  if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, LeftoverTy,
                    Src1Regs, Src1LeftoverRegs))
    return UnableToLegalize;

  LLT Unused;
  if (!extractParts(MI.getOperand(3).getReg(), DstTy, NarrowTy, Unused,
                    Src2Regs, Src2LeftoverRegs))
    llvm_unreachable("inconsistent extractParts result");

  for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
    auto Select = MIRBuilder.buildSelect(NarrowTy,
                                         CondReg, Src1Regs[I], Src2Regs[I]);
    DstRegs.push_back(Select.getReg(0));
  }

  for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
    auto Select = MIRBuilder.buildSelect(
        LeftoverTy, CondReg, Src1LeftoverRegs[I], Src2LeftoverRegs[I]);
    DstLeftoverRegs.push_back(Select.getReg(0));
  }

  insertParts(DstReg, DstTy, NarrowTy, DstRegs,
              LeftoverTy, DstLeftoverRegs);

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarCTLZ(MachineInstr &MI, unsigned TypeIdx,
                                  LLT NarrowTy) {
  if (TypeIdx != 1)
    return UnableToLegalize;

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  unsigned NarrowSize = NarrowTy.getSizeInBits();

  if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
    const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF;

    MachineIRBuilder &B = MIRBuilder;
    auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
    // ctlz(Hi:Lo) -> Hi == 0 ? (NarrowSize + ctlz(Lo)) : ctlz(Hi)
    auto C_0 = B.buildConstant(NarrowTy, 0);
    auto HiIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
                                UnmergeSrc.getReg(1), C_0);
    auto LoCTLZ = IsUndef ?
      B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0)) :
      B.buildCTLZ(DstTy, UnmergeSrc.getReg(0));
    auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
    auto HiIsZeroCTLZ = B.buildAdd(DstTy, LoCTLZ, C_NarrowSize);
    auto HiCTLZ = B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1));
    B.buildSelect(DstReg, HiIsZero, HiIsZeroCTLZ, HiCTLZ);

    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}
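// (E.g. narrowing a 64-bit G_CTLZ to 32-bit halves: for 0x0000000000001234
// the Hi half is zero, so the result is 32 + ctlz32(0x1234) = 32 + 19 = 51.)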
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarCTTZ(MachineInstr &MI, unsigned TypeIdx,
                                  LLT NarrowTy) {
  if (TypeIdx != 1)
    return UnableToLegalize;

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  unsigned NarrowSize = NarrowTy.getSizeInBits();

  if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
    const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF;

    MachineIRBuilder &B = MIRBuilder;
    auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
    // cttz(Hi:Lo) -> Lo == 0 ? (cttz(Hi) + NarrowSize) : cttz(Lo)
    auto C_0 = B.buildConstant(NarrowTy, 0);
    auto LoIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
                                UnmergeSrc.getReg(0), C_0);
    auto HiCTTZ = IsUndef ?
      B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1)) :
      B.buildCTTZ(DstTy, UnmergeSrc.getReg(1));
    auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
    auto LoIsZeroCTTZ = B.buildAdd(DstTy, HiCTTZ, C_NarrowSize);
    auto LoCTTZ = B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0));
    B.buildSelect(DstReg, LoIsZero, LoIsZeroCTTZ, LoCTTZ);

    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx,
                                   LLT NarrowTy) {
  if (TypeIdx != 1)
    return UnableToLegalize;

  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
  unsigned NarrowSize = NarrowTy.getSizeInBits();

  if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
    auto UnmergeSrc = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));

    auto LoCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(0));
    auto HiCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(1));
    MIRBuilder.buildAdd(DstReg, HiCTPOP, LoCTPOP);

    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}
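// (Unlike ctlz/cttz above, narrowScalarCTPOP needs no select: population
// count distributes over concatenation, ctpop(Hi:Lo) == ctpop(Hi) + ctpop(Lo).)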
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerBitCount(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  const auto &TII = MIRBuilder.getTII();
  auto isSupported = [this](const LegalityQuery &Q) {
    auto QAction = LI.getAction(Q).Action;
    return QAction == Legal || QAction == Libcall || QAction == Custom;
  };
  switch (Opc) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
    // This trivially expands to CTLZ.
    Observer.changingInstr(MI);
    MI.setDesc(TII.get(TargetOpcode::G_CTLZ));
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_CTLZ: {
    Register DstReg = MI.getOperand(0).getReg();
    Register SrcReg = MI.getOperand(1).getReg();
    LLT DstTy = MRI.getType(DstReg);
    LLT SrcTy = MRI.getType(SrcReg);
    unsigned Len = SrcTy.getSizeInBits();

    if (isSupported({TargetOpcode::G_CTLZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
      // If CTLZ_ZERO_UNDEF is supported, emit that and a select for zero.
      auto CtlzZU = MIRBuilder.buildCTLZ_ZERO_UNDEF(DstTy, SrcReg);
      auto ZeroSrc = MIRBuilder.buildConstant(SrcTy, 0);
      auto ICmp = MIRBuilder.buildICmp(
          CmpInst::ICMP_EQ, SrcTy.changeElementSize(1), SrcReg, ZeroSrc);
      auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
      MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CtlzZU);
      MI.eraseFromParent();
      return Legalized;
    }
    // for now, we do this:
    // NewLen = NextPowerOf2(Len);
    // x = x | (x >> 1);
    // x = x | (x >> 2);
    // ...
    // x = x | (x >> 16);
    // x = x | (x >> 32); // for 64-bit input
    // Up to NewLen/2
    // return Len - popcount(x);
    //
    // Ref: "Hacker's Delight" by Henry Warren
    Register Op = SrcReg;
    unsigned NewLen = PowerOf2Ceil(Len);
    for (unsigned i = 0; (1U << i) <= (NewLen / 2); ++i) {
      auto MIBShiftAmt = MIRBuilder.buildConstant(SrcTy, 1ULL << i);
      auto MIBOp = MIRBuilder.buildOr(
          SrcTy, Op, MIRBuilder.buildLShr(SrcTy, Op, MIBShiftAmt));
      Op = MIBOp.getReg(0);
    }
    auto MIBPop = MIRBuilder.buildCTPOP(DstTy, Op);
    MIRBuilder.buildSub(MI.getOperand(0), MIRBuilder.buildConstant(DstTy, Len),
                        MIBPop);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_CTTZ_ZERO_UNDEF: {
    // This trivially expands to CTTZ.
    Observer.changingInstr(MI);
    MI.setDesc(TII.get(TargetOpcode::G_CTTZ));
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_CTTZ: {
    Register DstReg = MI.getOperand(0).getReg();
    Register SrcReg = MI.getOperand(1).getReg();
    LLT DstTy = MRI.getType(DstReg);
    LLT SrcTy = MRI.getType(SrcReg);

    unsigned Len = SrcTy.getSizeInBits();
    if (isSupported({TargetOpcode::G_CTTZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
      // If CTTZ_ZERO_UNDEF is legal or custom, emit that and a select with
      // zero.
      auto CttzZU = MIRBuilder.buildCTTZ_ZERO_UNDEF(DstTy, SrcReg);
      auto Zero = MIRBuilder.buildConstant(SrcTy, 0);
      auto ICmp = MIRBuilder.buildICmp(
          CmpInst::ICMP_EQ, DstTy.changeElementSize(1), SrcReg, Zero);
      auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
      MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CttzZU);
      MI.eraseFromParent();
      return Legalized;
    }
    // for now, we use: { return popcount(~x & (x - 1)); }
    // unless the target has ctlz but not ctpop, in which case we use:
    // { return 32 - nlz(~x & (x-1)); }
    // Ref: "Hacker's Delight" by Henry Warren
    auto MIBCstNeg1 = MIRBuilder.buildConstant(SrcTy, -1);
    auto MIBNot = MIRBuilder.buildXor(SrcTy, SrcReg, MIBCstNeg1);
    auto MIBTmp = MIRBuilder.buildAnd(
        SrcTy, MIBNot, MIRBuilder.buildAdd(SrcTy, SrcReg, MIBCstNeg1));
    if (!isSupported({TargetOpcode::G_CTPOP, {SrcTy, SrcTy}}) &&
        isSupported({TargetOpcode::G_CTLZ, {SrcTy, SrcTy}})) {
      auto MIBCstLen = MIRBuilder.buildConstant(SrcTy, Len);
      MIRBuilder.buildSub(MI.getOperand(0), MIBCstLen,
                          MIRBuilder.buildCTLZ(SrcTy, MIBTmp));
      MI.eraseFromParent();
      return Legalized;
    }
    MI.setDesc(TII.get(TargetOpcode::G_CTPOP));
    MI.getOperand(1).setReg(MIBTmp.getReg(0));
    return Legalized;
  }
  case TargetOpcode::G_CTPOP: {
    Register SrcReg = MI.getOperand(1).getReg();
    LLT Ty = MRI.getType(SrcReg);
    unsigned Size = Ty.getSizeInBits();
    MachineIRBuilder &B = MIRBuilder;

    // Count set bits in blocks of 2 bits. Default approach would be
    // B2Count = { val & 0x55555555 } + { (val >> 1) & 0x55555555 }
    // We use following formula instead:
    // B2Count = val - { (val >> 1) & 0x55555555 }
    // since it gives same result in blocks of 2 with one instruction less.
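    // E.g. for val = 0xDA (0b11011010, five set bits):
    // (val >> 1) & 0x55 == 0x45, and 0xDA - 0x45 == 0x95 == 0b10'01'01'01,
    // i.e. per-pair counts of 2, 1, 1, 1.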
    auto C_1 = B.buildConstant(Ty, 1);
    auto B2Set1LoTo1Hi = B.buildLShr(Ty, SrcReg, C_1);
    APInt B2Mask1HiTo0 = APInt::getSplat(Size, APInt(8, 0x55));
    auto C_B2Mask1HiTo0 = B.buildConstant(Ty, B2Mask1HiTo0);
    auto B2Count1Hi = B.buildAnd(Ty, B2Set1LoTo1Hi, C_B2Mask1HiTo0);
    auto B2Count = B.buildSub(Ty, SrcReg, B2Count1Hi);

    // In order to get count in blocks of 4 add values from adjacent block of 2.
    // B4Count = { B2Count & 0x33333333 } + { (B2Count >> 2) & 0x33333333 }
    auto C_2 = B.buildConstant(Ty, 2);
    auto B4Set2LoTo2Hi = B.buildLShr(Ty, B2Count, C_2);
    APInt B4Mask2HiTo0 = APInt::getSplat(Size, APInt(8, 0x33));
    auto C_B4Mask2HiTo0 = B.buildConstant(Ty, B4Mask2HiTo0);
    auto B4HiB2Count = B.buildAnd(Ty, B4Set2LoTo2Hi, C_B4Mask2HiTo0);
    auto B4LoB2Count = B.buildAnd(Ty, B2Count, C_B4Mask2HiTo0);
    auto B4Count = B.buildAdd(Ty, B4HiB2Count, B4LoB2Count);

    // For count in blocks of 8 bits we don't have to mask high 4 bits before
    // addition since count value sits in range {0,...,8} and 4 bits are enough
    // to hold such binary values. After addition high 4 bits still hold count
    // of set bits in high 4 bit block, set them to zero and get 8 bit result.
    // B8Count = { B4Count + (B4Count >> 4) } & 0x0F0F0F0F
    auto C_4 = B.buildConstant(Ty, 4);
    auto B8HiB4Count = B.buildLShr(Ty, B4Count, C_4);
    auto B8CountDirty4Hi = B.buildAdd(Ty, B8HiB4Count, B4Count);
    APInt B8Mask4HiTo0 = APInt::getSplat(Size, APInt(8, 0x0F));
    auto C_B8Mask4HiTo0 = B.buildConstant(Ty, B8Mask4HiTo0);
    auto B8Count = B.buildAnd(Ty, B8CountDirty4Hi, C_B8Mask4HiTo0);

    assert(Size <= 128 && "Scalar size is too large for CTPOP lower algorithm");
    // 8 bits can hold CTPOP result of 128 bit int or smaller. Mul with this
    // bitmask will set 8 msb in ResTmp to sum of all B8Counts in 8 bit blocks.
    auto MulMask = B.buildConstant(Ty, APInt::getSplat(Size, APInt(8, 0x01)));
    auto ResTmp = B.buildMul(Ty, B8Count, MulMask);

    // Shift count result from 8 high bits to low bits.
    auto C_SizeM8 = B.buildConstant(Ty, Size - 8);
    B.buildLShr(MI.getOperand(0).getReg(), ResTmp, C_SizeM8);

    MI.eraseFromParent();
    return Legalized;
  }
  }
}

// Check that (every element of) Reg is undef or not an exact multiple of BW.
static bool isNonZeroModBitWidthOrUndef(const MachineRegisterInfo &MRI,
                                        Register Reg, unsigned BW) {
  return matchUnaryPredicate(
      MRI, Reg,
      [=](const Constant *C) {
        // Null constant here means an undef.
        const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(C);
        return !CI || CI->getValue().urem(BW) != 0;
      },
      /*AllowUndefs*/ true);
}
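// The rewrite below relies on the identity fshl(X, Y, Z) == fshr(X, Y, BW - Z)
// (and vice versa). When Z is known not to be a multiple of BW, the negated
// amount can be used directly; otherwise a funnel shift by 1 pre-positions the
// operands so that ~Z (i.e. BW - 1 - Z) is always a safe amount.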
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFunnelShiftWithInverse(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register X = MI.getOperand(1).getReg();
  Register Y = MI.getOperand(2).getReg();
  Register Z = MI.getOperand(3).getReg();
  LLT Ty = MRI.getType(Dst);
  LLT ShTy = MRI.getType(Z);

  unsigned BW = Ty.getScalarSizeInBits();

  if (!isPowerOf2_32(BW))
    return UnableToLegalize;

  const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
  unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;

  if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
    // fshl X, Y, Z -> fshr X, Y, -Z
    // fshr X, Y, Z -> fshl X, Y, -Z
    auto Zero = MIRBuilder.buildConstant(ShTy, 0);
    Z = MIRBuilder.buildSub(Ty, Zero, Z).getReg(0);
  } else {
    // fshl X, Y, Z -> fshr (srl X, 1), (fshr X, Y, 1), ~Z
    // fshr X, Y, Z -> fshl (fshl X, Y, 1), (shl Y, 1), ~Z
    auto One = MIRBuilder.buildConstant(ShTy, 1);
    if (IsFSHL) {
      Y = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
      X = MIRBuilder.buildLShr(Ty, X, One).getReg(0);
    } else {
      X = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
      Y = MIRBuilder.buildShl(Ty, Y, One).getReg(0);
    }

    Z = MIRBuilder.buildNot(ShTy, Z).getReg(0);
  }

  MIRBuilder.buildInstr(RevOpcode, {Dst}, {X, Y, Z});
  MI.eraseFromParent();
  return Legalized;
}
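// E.g. for BW = 32 and an amount known to be Z = 8 (illustrative values),
// fshl(X, Y, 8) lowers through the first branch below to (X << 8) | (Y >> 24);
// only amounts that may be a multiple of BW need the extra shift-by-one dance
// to avoid an undefined shift by BW.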
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFunnelShiftAsShifts(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register X = MI.getOperand(1).getReg();
  Register Y = MI.getOperand(2).getReg();
  Register Z = MI.getOperand(3).getReg();
  LLT Ty = MRI.getType(Dst);
  LLT ShTy = MRI.getType(Z);

  const unsigned BW = Ty.getScalarSizeInBits();
  const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;

  Register ShX, ShY;
  Register ShAmt, InvShAmt;

  // FIXME: Emit optimized urem by constant instead of letting it expand later.
  if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
    // fshl: X << C | Y >> (BW - C)
    // fshr: X << (BW - C) | Y >> C
    // where C = Z % BW is not zero
    auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
    ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
    InvShAmt = MIRBuilder.buildSub(ShTy, BitWidthC, ShAmt).getReg(0);
    ShX = MIRBuilder.buildShl(Ty, X, IsFSHL ? ShAmt : InvShAmt).getReg(0);
    ShY = MIRBuilder.buildLShr(Ty, Y, IsFSHL ? InvShAmt : ShAmt).getReg(0);
  } else {
    // fshl: X << (Z % BW) | Y >> 1 >> (BW - 1 - (Z % BW))
    // fshr: X << 1 << (BW - 1 - (Z % BW)) | Y >> (Z % BW)
    auto Mask = MIRBuilder.buildConstant(ShTy, BW - 1);
    if (isPowerOf2_32(BW)) {
      // Z % BW -> Z & (BW - 1)
      ShAmt = MIRBuilder.buildAnd(ShTy, Z, Mask).getReg(0);
      // (BW - 1) - (Z % BW) -> ~Z & (BW - 1)
      auto NotZ = MIRBuilder.buildNot(ShTy, Z);
      InvShAmt = MIRBuilder.buildAnd(ShTy, NotZ, Mask).getReg(0);
    } else {
      auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
      ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
      InvShAmt = MIRBuilder.buildSub(ShTy, Mask, ShAmt).getReg(0);
    }

    auto One = MIRBuilder.buildConstant(ShTy, 1);
    if (IsFSHL) {
      ShX = MIRBuilder.buildShl(Ty, X, ShAmt).getReg(0);
      auto ShY1 = MIRBuilder.buildLShr(Ty, Y, One);
      ShY = MIRBuilder.buildLShr(Ty, ShY1, InvShAmt).getReg(0);
    } else {
      auto ShX1 = MIRBuilder.buildShl(Ty, X, One);
      ShX = MIRBuilder.buildShl(Ty, ShX1, InvShAmt).getReg(0);
      ShY = MIRBuilder.buildLShr(Ty, Y, ShAmt).getReg(0);
    }
  }

  MIRBuilder.buildOr(Dst, ShX, ShY);
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFunnelShift(MachineInstr &MI) {
  // These operations approximately do the following (while avoiding undefined
  // shifts by BW):
  // G_FSHL: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
  // G_FSHR: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(Dst);
  LLT ShTy = MRI.getType(MI.getOperand(3).getReg());

  bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
  unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;

  // TODO: Use smarter heuristic that accounts for vector legalization.
  if (LI.getAction({RevOpcode, {Ty, ShTy}}).Action == Lower)
    return lowerFunnelShiftAsShifts(MI);

  // This only works for powers of 2, fall back to shifts if it fails.
  LegalizerHelper::LegalizeResult Result = lowerFunnelShiftWithInverse(MI);
  if (Result == UnableToLegalize)
    return lowerFunnelShiftAsShifts(MI);
  return Result;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerRotateWithReverseRotate(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  Register Amt = MI.getOperand(2).getReg();
  LLT AmtTy = MRI.getType(Amt);
  auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
  bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
  unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
  auto Neg = MIRBuilder.buildSub(AmtTy, Zero, Amt);
  MIRBuilder.buildInstr(RevRot, {Dst}, {Src, Neg});
  MI.eraseFromParent();
  return Legalized;
}
LegalizerHelper::LegalizeResult LegalizerHelper::lowerRotate(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  Register Amt = MI.getOperand(2).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  LLT AmtTy = MRI.getType(Amt);

  unsigned EltSizeInBits = DstTy.getScalarSizeInBits();
  bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;

  MIRBuilder.setInstrAndDebugLoc(MI);

  // If a rotate in the other direction is supported, use it.
  unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
  if (LI.isLegalOrCustom({RevRot, {DstTy, SrcTy}}) &&
      isPowerOf2_32(EltSizeInBits))
    return lowerRotateWithReverseRotate(MI);

  // If a funnel shift is supported, use it.
  unsigned FShOpc = IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
  unsigned RevFsh = !IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
  bool IsFShLegal = false;
  if ((IsFShLegal = LI.isLegalOrCustom({FShOpc, {DstTy, AmtTy}})) ||
      LI.isLegalOrCustom({RevFsh, {DstTy, AmtTy}})) {
    auto buildFunnelShift = [&](unsigned Opc, Register R1, Register R2,
                                Register R3) {
      MIRBuilder.buildInstr(Opc, {R1}, {R2, R2, R3});
      MI.eraseFromParent();
      return Legalized;
    };
    // If a funnel shift in the other direction is supported, use it.
    if (IsFShLegal) {
      return buildFunnelShift(FShOpc, Dst, Src, Amt);
    } else if (isPowerOf2_32(EltSizeInBits)) {
      Amt = MIRBuilder.buildNeg(DstTy, Amt).getReg(0);
      return buildFunnelShift(RevFsh, Dst, Src, Amt);
    }
  }

  auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
  unsigned ShOpc = IsLeft ? TargetOpcode::G_SHL : TargetOpcode::G_LSHR;
  unsigned RevShiftOpc = IsLeft ? TargetOpcode::G_LSHR : TargetOpcode::G_SHL;
  auto BitWidthMinusOneC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits - 1);
  Register ShVal;
  Register RevShiftVal;
  if (isPowerOf2_32(EltSizeInBits)) {
    // (rotl x, c) -> x << (c & (w - 1)) | x >> (-c & (w - 1))
    // (rotr x, c) -> x >> (c & (w - 1)) | x << (-c & (w - 1))
    auto NegAmt = MIRBuilder.buildSub(AmtTy, Zero, Amt);
    auto ShAmt = MIRBuilder.buildAnd(AmtTy, Amt, BitWidthMinusOneC);
    ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
    auto RevAmt = MIRBuilder.buildAnd(AmtTy, NegAmt, BitWidthMinusOneC);
    RevShiftVal =
        MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, RevAmt}).getReg(0);
  } else {
    // (rotl x, c) -> x << (c % w) | x >> 1 >> (w - 1 - (c % w))
    // (rotr x, c) -> x >> (c % w) | x << 1 << (w - 1 - (c % w))
    auto BitWidthC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits);
    auto ShAmt = MIRBuilder.buildURem(AmtTy, Amt, BitWidthC);
    ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
    auto RevAmt = MIRBuilder.buildSub(AmtTy, BitWidthMinusOneC, ShAmt);
    auto One = MIRBuilder.buildConstant(AmtTy, 1);
    auto Inner = MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, One});
    RevShiftVal =
        MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Inner, RevAmt}).getReg(0);
  }
  MIRBuilder.buildOr(Dst, ShVal, RevShiftVal);
  MI.eraseFromParent();
  return Legalized;
}
// Expand s32 = G_UITOFP s64 using bit operations to an IEEE float
// representation.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerU64ToF32BitOps(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);
  const LLT S1 = LLT::scalar(1);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S32);

  // unsigned cul2f(ulong u) {
  //   uint lz = clz(u);
  //   uint e = (u != 0) ? 127U + 63U - lz : 0;
  //   u = (u << lz) & 0x7fffffffffffffffUL;
  //   ulong t = u & 0xffffffffffUL;
  //   uint v = (e << 23) | (uint)(u >> 40);
  //   uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
  //   return as_float(v + r);
  // }

  auto Zero32 = MIRBuilder.buildConstant(S32, 0);
  auto Zero64 = MIRBuilder.buildConstant(S64, 0);

  auto LZ = MIRBuilder.buildCTLZ_ZERO_UNDEF(S32, Src);

  auto K = MIRBuilder.buildConstant(S32, 127U + 63U);
  auto Sub = MIRBuilder.buildSub(S32, K, LZ);

  auto NotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, Src, Zero64);
  auto E = MIRBuilder.buildSelect(S32, NotZero, Sub, Zero32);

  auto Mask0 = MIRBuilder.buildConstant(S64, (-1ULL) >> 1);
  auto ShlLZ = MIRBuilder.buildShl(S64, Src, LZ);

  auto U = MIRBuilder.buildAnd(S64, ShlLZ, Mask0);

  auto Mask1 = MIRBuilder.buildConstant(S64, 0xffffffffffULL);
  auto T = MIRBuilder.buildAnd(S64, U, Mask1);

  auto UShl = MIRBuilder.buildLShr(S64, U, MIRBuilder.buildConstant(S64, 40));
  auto ShlE = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 23));
  auto V = MIRBuilder.buildOr(S32, ShlE, MIRBuilder.buildTrunc(S32, UShl));

  auto C = MIRBuilder.buildConstant(S64, 0x8000000000ULL);
  auto RCmp = MIRBuilder.buildICmp(CmpInst::ICMP_UGT, S1, T, C);
  auto TCmp = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, T, C);
  auto One = MIRBuilder.buildConstant(S32, 1);

  auto VTrunc1 = MIRBuilder.buildAnd(S32, V, One);
  auto Select0 = MIRBuilder.buildSelect(S32, TCmp, VTrunc1, Zero32);
  auto R = MIRBuilder.buildSelect(S32, RCmp, One, Select0);
  MIRBuilder.buildAdd(Dst, V, R);

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult LegalizerHelper::lowerUITOFP(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  if (SrcTy == LLT::scalar(1)) {
    auto True = MIRBuilder.buildFConstant(DstTy, 1.0);
    auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
    MIRBuilder.buildSelect(Dst, Src, True, False);
    MI.eraseFromParent();
    return Legalized;
  }

  if (SrcTy != LLT::scalar(64))
    return UnableToLegalize;

  if (DstTy == LLT::scalar(32)) {
    // TODO: SelectionDAG has several alternative expansions to port which may
    // be more reasonable depending on the available instructions. If a target
    // has sitofp, does not have CTLZ, or can efficiently use f64 as an
    // intermediate type, this is probably worse.
    return lowerU64ToF32BitOps(MI);
  }

  return UnableToLegalize;
}
LegalizerHelper::LegalizeResult LegalizerHelper::lowerSITOFP(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);
  const LLT S1 = LLT::scalar(1);

  if (SrcTy == S1) {
    auto True = MIRBuilder.buildFConstant(DstTy, -1.0);
    auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
    MIRBuilder.buildSelect(Dst, Src, True, False);
    MI.eraseFromParent();
    return Legalized;
  }

  if (SrcTy != S64)
    return UnableToLegalize;

  if (DstTy == S32) {
    // signed cl2f(long l) {
    //   long s = l >> 63;
    //   float r = cul2f((l + s) ^ s);
    //   return s ? -r : r;
    // }
    Register L = Src;
    auto SignBit = MIRBuilder.buildConstant(S64, 63);
    auto S = MIRBuilder.buildAShr(S64, L, SignBit);

    auto LPlusS = MIRBuilder.buildAdd(S64, L, S);
    auto Xor = MIRBuilder.buildXor(S64, LPlusS, S);
    auto R = MIRBuilder.buildUITOFP(S32, Xor);

    auto RNeg = MIRBuilder.buildFNeg(S32, R);
    auto SignNotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, S,
                                            MIRBuilder.buildConstant(S64, 0));
    MIRBuilder.buildSelect(Dst, SignNotZero, RNeg, R);
    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}

LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOUI(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  if (SrcTy != S64 && SrcTy != S32)
    return UnableToLegalize;
  if (DstTy != S32 && DstTy != S64)
    return UnableToLegalize;

  // FPTOSI gives same result as FPTOUI for positive signed integers.
  // FPTOUI needs to deal with fp values that convert to unsigned integers
  // greater or equal to 2^31 for float or 2^63 for double. For brevity 2^Exp.

  APInt TwoPExpInt = APInt::getSignMask(DstTy.getSizeInBits());
  APFloat TwoPExpFP(SrcTy.getSizeInBits() == 32 ? APFloat::IEEEsingle()
                                                : APFloat::IEEEdouble(),
                    APInt::getZero(SrcTy.getSizeInBits()));
  TwoPExpFP.convertFromAPInt(TwoPExpInt, false, APFloat::rmNearestTiesToEven);

  MachineInstrBuilder FPTOSI = MIRBuilder.buildFPTOSI(DstTy, Src);

  MachineInstrBuilder Threshold = MIRBuilder.buildFConstant(SrcTy, TwoPExpFP);
  // For fp Value greater or equal to Threshold(2^Exp), we use FPTOSI on
  // (Value - 2^Exp) and add 2^Exp by setting highest bit in result to 1.
  MachineInstrBuilder FSub = MIRBuilder.buildFSub(SrcTy, Src, Threshold);
  MachineInstrBuilder ResLowBits = MIRBuilder.buildFPTOSI(DstTy, FSub);
  MachineInstrBuilder ResHighBit = MIRBuilder.buildConstant(DstTy, TwoPExpInt);
  MachineInstrBuilder Res = MIRBuilder.buildXor(DstTy, ResLowBits, ResHighBit);

  const LLT S1 = LLT::scalar(1);

  MachineInstrBuilder FCMP =
      MIRBuilder.buildFCmp(CmpInst::FCMP_ULT, S1, Src, Threshold);
  MIRBuilder.buildSelect(Dst, FCMP, FPTOSI, Res);

  MI.eraseFromParent();
  return Legalized;
}
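// Worked example for the expansion above (s32 destination, illustrative
// input): fptoui(3.0e9f) is above the 2^31 threshold, so the select takes
// fptosi(3.0e9f - 2^31) = 852516352 and XORs in the sign-mask constant
// 0x80000000, yielding 3000000000.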
LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOSI(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  // FIXME: Only f32 to i64 conversions are supported.
  if (SrcTy.getScalarType() != S32 || DstTy.getScalarType() != S64)
    return UnableToLegalize;

  // Expand f32 -> i64 conversion
  // This algorithm comes from compiler-rt's implementation of fixsfdi:
  // https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/builtins/fixsfdi.c

  unsigned SrcEltBits = SrcTy.getScalarSizeInBits();

  auto ExponentMask = MIRBuilder.buildConstant(SrcTy, 0x7F800000);
  auto ExponentLoBit = MIRBuilder.buildConstant(SrcTy, 23);

  auto AndExpMask = MIRBuilder.buildAnd(SrcTy, Src, ExponentMask);
  auto ExponentBits = MIRBuilder.buildLShr(SrcTy, AndExpMask, ExponentLoBit);

  auto SignMask = MIRBuilder.buildConstant(SrcTy,
                                           APInt::getSignMask(SrcEltBits));
  auto AndSignMask = MIRBuilder.buildAnd(SrcTy, Src, SignMask);
  auto SignLowBit = MIRBuilder.buildConstant(SrcTy, SrcEltBits - 1);
  auto Sign = MIRBuilder.buildAShr(SrcTy, AndSignMask, SignLowBit);
  Sign = MIRBuilder.buildSExt(DstTy, Sign);

  auto MantissaMask = MIRBuilder.buildConstant(SrcTy, 0x007FFFFF);
  auto AndMantissaMask = MIRBuilder.buildAnd(SrcTy, Src, MantissaMask);
  auto K = MIRBuilder.buildConstant(SrcTy, 0x00800000);

  auto R = MIRBuilder.buildOr(SrcTy, AndMantissaMask, K);
  R = MIRBuilder.buildZExt(DstTy, R);

  auto Bias = MIRBuilder.buildConstant(SrcTy, 127);
  auto Exponent = MIRBuilder.buildSub(SrcTy, ExponentBits, Bias);
  auto SubExponent = MIRBuilder.buildSub(SrcTy, Exponent, ExponentLoBit);
  auto ExponentSub = MIRBuilder.buildSub(SrcTy, ExponentLoBit, Exponent);

  auto Shl = MIRBuilder.buildShl(DstTy, R, SubExponent);
  auto Srl = MIRBuilder.buildLShr(DstTy, R, ExponentSub);

  const LLT S1 = LLT::scalar(1);
  auto CmpGt = MIRBuilder.buildICmp(CmpInst::ICMP_SGT,
                                    S1, Exponent, ExponentLoBit);

  R = MIRBuilder.buildSelect(DstTy, CmpGt, Shl, Srl);

  auto XorSign = MIRBuilder.buildXor(DstTy, R, Sign);
  auto Ret = MIRBuilder.buildSub(DstTy, XorSign, Sign);

  auto ZeroSrcTy = MIRBuilder.buildConstant(SrcTy, 0);

  auto ExponentLt0 = MIRBuilder.buildICmp(CmpInst::ICMP_SLT,
                                          S1, Exponent, ZeroSrcTy);

  auto ZeroDstTy = MIRBuilder.buildConstant(DstTy, 0);
  MIRBuilder.buildSelect(Dst, ExponentLt0, ZeroDstTy, Ret);

  MI.eraseFromParent();
  return Legalized;
}

// f64 -> f16 conversion using round-to-nearest-even rounding mode.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  if (MRI.getType(Src).isVector()) // TODO: Handle vectors directly.
    return UnableToLegalize;

  const unsigned ExpMask = 0x7ff;
  const unsigned ExpBiasf64 = 1023;
  const unsigned ExpBiasf16 = 15;
  const LLT S32 = LLT::scalar(32);
  const LLT S1 = LLT::scalar(1);

  auto Unmerge = MIRBuilder.buildUnmerge(S32, Src);
  Register U = Unmerge.getReg(0);
  Register UH = Unmerge.getReg(1);

  auto E = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 20));
  E = MIRBuilder.buildAnd(S32, E, MIRBuilder.buildConstant(S32, ExpMask));

  // Subtract the fp64 exponent bias (1023) to get the real exponent and
  // add the f16 bias (15) to get the biased exponent for the f16 format.
  E = MIRBuilder.buildAdd(
      S32, E, MIRBuilder.buildConstant(S32, -ExpBiasf64 + ExpBiasf16));

  auto M = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 8));
  M = MIRBuilder.buildAnd(S32, M, MIRBuilder.buildConstant(S32, 0xffe));

  auto MaskedSig = MIRBuilder.buildAnd(S32, UH,
                                       MIRBuilder.buildConstant(S32, 0x1ff));
  MaskedSig = MIRBuilder.buildOr(S32, MaskedSig, U);

  auto Zero = MIRBuilder.buildConstant(S32, 0);
  auto SigCmpNE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, MaskedSig, Zero);
  auto Lo40Set = MIRBuilder.buildZExt(S32, SigCmpNE0);
  M = MIRBuilder.buildOr(S32, M, Lo40Set);

  // (M != 0 ? 0x0200 : 0) | 0x7c00;
  auto Bits0x200 = MIRBuilder.buildConstant(S32, 0x0200);
  auto CmpM_NE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, M, Zero);
  auto SelectCC = MIRBuilder.buildSelect(S32, CmpM_NE0, Bits0x200, Zero);

  auto Bits0x7c00 = MIRBuilder.buildConstant(S32, 0x7c00);
  auto I = MIRBuilder.buildOr(S32, SelectCC, Bits0x7c00);

  // N = M | (E << 12);
  auto EShl12 = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 12));
  auto N = MIRBuilder.buildOr(S32, M, EShl12);

  // B = clamp(1-E, 0, 13);
  auto One = MIRBuilder.buildConstant(S32, 1);
  auto OneSubExp = MIRBuilder.buildSub(S32, One, E);
  auto B = MIRBuilder.buildSMax(S32, OneSubExp, Zero);
  B = MIRBuilder.buildSMin(S32, B, MIRBuilder.buildConstant(S32, 13));

  auto SigSetHigh = MIRBuilder.buildOr(S32, M,
                                       MIRBuilder.buildConstant(S32, 0x1000));

  auto D = MIRBuilder.buildLShr(S32, SigSetHigh, B);
  auto D0 = MIRBuilder.buildShl(S32, D, B);

  auto D0_NE_SigSetHigh = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1,
                                               D0, SigSetHigh);
  auto D1 = MIRBuilder.buildZExt(S32, D0_NE_SigSetHigh);
  D = MIRBuilder.buildOr(S32, D, D1);

  auto CmpELtOne = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, S1, E, One);
  auto V = MIRBuilder.buildSelect(S32, CmpELtOne, D, N);

  auto VLow3 = MIRBuilder.buildAnd(S32, V, MIRBuilder.buildConstant(S32, 7));
  V = MIRBuilder.buildLShr(S32, V, MIRBuilder.buildConstant(S32, 2));

  auto VLow3Eq3 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, VLow3,
                                       MIRBuilder.buildConstant(S32, 3));
  auto V0 = MIRBuilder.buildZExt(S32, VLow3Eq3);

  auto VLow3Gt5 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1, VLow3,
                                       MIRBuilder.buildConstant(S32, 5));
  auto V1 = MIRBuilder.buildZExt(S32, VLow3Gt5);

  V1 = MIRBuilder.buildOr(S32, V0, V1);
  V = MIRBuilder.buildAdd(S32, V, V1);

  auto CmpEGt30 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1,
                                       E, MIRBuilder.buildConstant(S32, 30));
  V = MIRBuilder.buildSelect(S32, CmpEGt30,
                             MIRBuilder.buildConstant(S32, 0x7c00), V);

  auto CmpEGt1039 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1,
                                         E, MIRBuilder.buildConstant(S32, 1039));
  V = MIRBuilder.buildSelect(S32, CmpEGt1039, I, V);

  // Extract the sign bit.
  auto Sign = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 16));
  Sign = MIRBuilder.buildAnd(S32, Sign, MIRBuilder.buildConstant(S32, 0x8000));

  // Insert the sign bit
  V = MIRBuilder.buildOr(S32, Sign, V);

  MIRBuilder.buildTrunc(Dst, V);
  MI.eraseFromParent();
  return Legalized;
}
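
// Worked example for the f64 -> f16 expansion above: for Src = 1.0 the high
// word is UH = 0x3FF00000, so E = 1023 - 1023 + 15 = 15 and all significand
// bits (M) are zero. N = E << 12 = 0xF000; E >= 1 selects N, the final
// right-shift by 2 leaves 0x3C00, and no rounding increment or sign bit is
// added, which is exactly the f16 encoding of 1.0.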

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFPTRUNC(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  const LLT S64 = LLT::scalar(64);
  const LLT S16 = LLT::scalar(16);

  if (DstTy.getScalarType() == S16 && SrcTy.getScalarType() == S64)
    return lowerFPTRUNC_F64_TO_F16(MI);

  return UnableToLegalize;
}

// TODO: If RHS is a constant, SelectionDAGBuilder expands this into a
// multiplication tree.
LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPOWI(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  LLT Ty = MRI.getType(Dst);

  auto CvtSrc1 = MIRBuilder.buildSITOFP(Ty, Src1);
  MIRBuilder.buildFPow(Dst, Src0, CvtSrc1, MI.getFlags());
  MI.eraseFromParent();
  return Legalized;
}

static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
  switch (Opc) {
  case TargetOpcode::G_SMIN:
    return CmpInst::ICMP_SLT;
  case TargetOpcode::G_SMAX:
    return CmpInst::ICMP_SGT;
  case TargetOpcode::G_UMIN:
    return CmpInst::ICMP_ULT;
  case TargetOpcode::G_UMAX:
    return CmpInst::ICMP_UGT;
  default:
    llvm_unreachable("not in integer min/max");
  }
}

LegalizerHelper::LegalizeResult LegalizerHelper::lowerMinMax(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();

  const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
  LLT CmpType = MRI.getType(Dst).changeElementSize(1);

  auto Cmp = MIRBuilder.buildICmp(Pred, CmpType, Src0, Src1);
  MIRBuilder.buildSelect(Dst, Cmp, Src0, Src1);

  MI.eraseFromParent();
  return Legalized;
}
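
// For example, the lowering above turns
//   %dst:_(s32) = G_SMAX %a, %b
// into
//   %cmp:_(s1) = G_ICMP intpred(sgt), %a(s32), %b
//   %dst:_(s32) = G_SELECT %cmp(s1), %a, %b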

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFCopySign(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();

  const LLT Src0Ty = MRI.getType(Src0);
  const LLT Src1Ty = MRI.getType(Src1);

  const int Src0Size = Src0Ty.getScalarSizeInBits();
  const int Src1Size = Src1Ty.getScalarSizeInBits();

  auto SignBitMask = MIRBuilder.buildConstant(
      Src0Ty, APInt::getSignMask(Src0Size));

  auto NotSignBitMask = MIRBuilder.buildConstant(
      Src0Ty, APInt::getLowBitsSet(Src0Size, Src0Size - 1));

  Register And0 = MIRBuilder.buildAnd(Src0Ty, Src0, NotSignBitMask).getReg(0);
  Register And1;
  if (Src0Ty == Src1Ty) {
    And1 = MIRBuilder.buildAnd(Src1Ty, Src1, SignBitMask).getReg(0);
  } else if (Src0Size > Src1Size) {
    auto ShiftAmt = MIRBuilder.buildConstant(Src0Ty, Src0Size - Src1Size);
    auto Zext = MIRBuilder.buildZExt(Src0Ty, Src1);
    auto Shift = MIRBuilder.buildShl(Src0Ty, Zext, ShiftAmt);
    And1 = MIRBuilder.buildAnd(Src0Ty, Shift, SignBitMask).getReg(0);
  } else {
    auto ShiftAmt = MIRBuilder.buildConstant(Src1Ty, Src1Size - Src0Size);
    auto Shift = MIRBuilder.buildLShr(Src1Ty, Src1, ShiftAmt);
    auto Trunc = MIRBuilder.buildTrunc(Src0Ty, Shift);
    And1 = MIRBuilder.buildAnd(Src0Ty, Trunc, SignBitMask).getReg(0);
  }

  // Be careful about setting nsz/nnan/ninf on every instruction, since the
  // constants are a nan and -0.0, but the final result should preserve
  // everything.
  unsigned Flags = MI.getFlags();
  MIRBuilder.buildOr(Dst, And0, And1, Flags);

  MI.eraseFromParent();
  return Legalized;
}
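
// Worked example for the mixed-width case above: G_FCOPYSIGN of f32 1.0
// with an f64 sign operand of -2.0 (bits 0xC000000000000000). The sign word
// is shifted right by 32 and truncated to 0xC0000000, masked down to
// 0x80000000, and OR'd with 0x3F800000 & 0x7FFFFFFF, producing 0xBF800000,
// i.e. -1.0f.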

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) {
  unsigned NewOp = MI.getOpcode() == TargetOpcode::G_FMINNUM ?
    TargetOpcode::G_FMINNUM_IEEE : TargetOpcode::G_FMAXNUM_IEEE;

  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  LLT Ty = MRI.getType(Dst);

  if (!MI.getFlag(MachineInstr::FmNoNans)) {
    // Insert canonicalizes if it's possible we need to quiet to get correct
    // sNaN behavior.

    // Note this must be done here, and not as an optimization combine in the
    // absence of a dedicated quiet-snan instruction, as we're using an
    // omni-purpose G_FCANONICALIZE.
    if (!isKnownNeverSNaN(Src0, MRI))
      Src0 = MIRBuilder.buildFCanonicalize(Ty, Src0, MI.getFlags()).getReg(0);

    if (!isKnownNeverSNaN(Src1, MRI))
      Src1 = MIRBuilder.buildFCanonicalize(Ty, Src1, MI.getFlags()).getReg(0);
  }

  // If there are no nans, it's safe to simply replace this with the non-IEEE
  // version.
  MIRBuilder.buildInstr(NewOp, {Dst}, {Src0, Src1}, MI.getFlags());
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult LegalizerHelper::lowerFMad(MachineInstr &MI) {
  // Expand G_FMAD a, b, c -> G_FADD (G_FMUL a, b), c
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

  auto Mul = MIRBuilder.buildFMul(Ty, MI.getOperand(1), MI.getOperand(2),
                                  Flags);
  MIRBuilder.buildFAdd(DstReg, Mul, MI.getOperand(3), Flags);
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerIntrinsicRound(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();
  Register X = MI.getOperand(1).getReg();
  const unsigned Flags = MI.getFlags();
  const LLT Ty = MRI.getType(DstReg);
  const LLT CondTy = Ty.changeElementSize(1);

  // round(x) =>
  //  t = trunc(x);
  //  d = fabs(x - t);
  //  o = copysign(1.0f, x);
  //  return t + (d >= 0.5 ? o : 0.0);

  auto T = MIRBuilder.buildIntrinsicTrunc(Ty, X, Flags);

  auto Diff = MIRBuilder.buildFSub(Ty, X, T, Flags);
  auto AbsDiff = MIRBuilder.buildFAbs(Ty, Diff, Flags);
  auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
  auto One = MIRBuilder.buildFConstant(Ty, 1.0);
  auto Half = MIRBuilder.buildFConstant(Ty, 0.5);
  auto SignOne = MIRBuilder.buildFCopysign(Ty, One, X);

  auto Cmp = MIRBuilder.buildFCmp(CmpInst::FCMP_OGE, CondTy, AbsDiff, Half,
                                  Flags);
  auto Sel = MIRBuilder.buildSelect(Ty, Cmp, SignOne, Zero, Flags);

  MIRBuilder.buildFAdd(DstReg, T, Sel, Flags);

  MI.eraseFromParent();
  return Legalized;
}
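
// Worked example for the round(x) expansion above, with x = -2.5:
// t = trunc(-2.5) = -2.0, d = fabs(-2.5 - -2.0) = 0.5, and
// o = copysign(1.0, -2.5) = -1.0. Since d >= 0.5 holds, the result is
// t + o = -3.0; ties round away from zero, as expected for
// G_INTRINSIC_ROUND.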

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFFloor(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);
  const LLT CondTy = Ty.changeElementSize(1);

  // result = trunc(src);
  // if (src < 0.0 && src != result)
  //   result += -1.0.

  auto Trunc = MIRBuilder.buildIntrinsicTrunc(Ty, SrcReg, Flags);
  auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);

  auto Lt0 = MIRBuilder.buildFCmp(CmpInst::FCMP_OLT, CondTy,
                                  SrcReg, Zero, Flags);
  auto NeTrunc = MIRBuilder.buildFCmp(CmpInst::FCMP_ONE, CondTy,
                                      SrcReg, Trunc, Flags);
  auto And = MIRBuilder.buildAnd(CondTy, Lt0, NeTrunc);
  auto AddVal = MIRBuilder.buildSITOFP(Ty, And);

  MIRBuilder.buildFAdd(DstReg, Trunc, AddVal, Flags);
  MI.eraseFromParent();
  return Legalized;
}
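
// Worked example for the floor expansion above, with src = -1.5:
// trunc(-1.5) = -1.0, and both src < 0.0 and src != trunc hold, so the s1
// condition sign-extends through G_SITOFP to -1.0 and the result is
// -1.0 + -1.0 = -2.0. For src = 1.5 the condition is false, G_SITOFP
// produces 0.0, and the result stays 1.0.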

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMergeValues(MachineInstr &MI) {
  const unsigned NumOps = MI.getNumOperands();
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(Src0Reg);
  unsigned PartSize = SrcTy.getSizeInBits();

  LLT WideTy = LLT::scalar(DstTy.getSizeInBits());
  Register ResultReg = MIRBuilder.buildZExt(WideTy, Src0Reg).getReg(0);

  for (unsigned I = 2; I != NumOps; ++I) {
    const unsigned Offset = (I - 1) * PartSize;

    Register SrcReg = MI.getOperand(I).getReg();
    auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);

    Register NextResult = I + 1 == NumOps && WideTy == DstTy ?
      DstReg : MRI.createGenericVirtualRegister(WideTy);

    auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
    auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
    MIRBuilder.buildOr(NextResult, ResultReg, Shl);
    ResultReg = NextResult;
  }

  if (DstTy.isPointer()) {
    if (MIRBuilder.getDataLayout().isNonIntegralAddressSpace(
          DstTy.getAddressSpace())) {
      LLVM_DEBUG(dbgs() << "Not casting nonintegral address space\n");
      return UnableToLegalize;
    }

    MIRBuilder.buildIntToPtr(DstReg, ResultReg);
  }

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerUnmergeValues(MachineInstr &MI) {
  const unsigned NumDst = MI.getNumOperands() - 1;
  Register SrcReg = MI.getOperand(NumDst).getReg();
  Register Dst0Reg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst0Reg);
  if (DstTy.isPointer())
    return UnableToLegalize; // TODO

  SrcReg = coerceToScalar(SrcReg);
  if (!SrcReg)
    return UnableToLegalize;

  // Expand scalarizing unmerge as bitcast to integer and shift.
  LLT IntTy = MRI.getType(SrcReg);

  MIRBuilder.buildTrunc(Dst0Reg, SrcReg);

  const unsigned DstSize = DstTy.getSizeInBits();
  unsigned Offset = DstSize;
  for (unsigned I = 1; I != NumDst; ++I, Offset += DstSize) {
    auto ShiftAmt = MIRBuilder.buildConstant(IntTy, Offset);
    auto Shift = MIRBuilder.buildLShr(IntTy, SrcReg, ShiftAmt);
    MIRBuilder.buildTrunc(MI.getOperand(I), Shift);
  }

  MI.eraseFromParent();
  return Legalized;
}
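
// For example, the two lowerings above expand
//   %d:_(s32) = G_MERGE_VALUES %lo(s16), %hi(s16)
// into %d = zext(%lo) | (zext(%hi) << 16), and the inverse
//   %lo:_(s16), %hi:_(s16) = G_UNMERGE_VALUES %d(s32)
// into %lo = trunc(%d) and %hi = trunc(%d >> 16).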

/// Lower a vector extract or insert by writing the vector to a stack temporary
/// and reloading the element or vector.
///
/// %dst = G_EXTRACT_VECTOR_ELT %vec, %idx
///  =>
///  %stack_temp = G_FRAME_INDEX
///  G_STORE %vec, %stack_temp
///  %idx = clamp(%idx, %vec.getNumElements())
///  %element_ptr = G_PTR_ADD %stack_temp, %idx
///  %dst = G_LOAD %element_ptr
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerExtractInsertVectorElt(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcVec = MI.getOperand(1).getReg();
  Register InsertVal;
  if (MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT)
    InsertVal = MI.getOperand(2).getReg();

  Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();

  LLT VecTy = MRI.getType(SrcVec);
  LLT EltTy = VecTy.getElementType();
  if (!EltTy.isByteSized()) { // Not implemented.
    LLVM_DEBUG(dbgs() << "Can't handle non-byte element vectors yet\n");
    return UnableToLegalize;
  }

  unsigned EltBytes = EltTy.getSizeInBytes();
  Align VecAlign = getStackTemporaryAlignment(VecTy);
  Align EltAlign;

  MachinePointerInfo PtrInfo;
  auto StackTemp = createStackTemporary(TypeSize::Fixed(VecTy.getSizeInBytes()),
                                        VecAlign, PtrInfo);
  MIRBuilder.buildStore(SrcVec, StackTemp, PtrInfo, VecAlign);

  // Get the pointer to the element, and be sure not to hit undefined behavior
  // if the index is out of bounds.
  Register EltPtr = getVectorElementPointer(StackTemp.getReg(0), VecTy, Idx);

  int64_t IdxVal;
  if (mi_match(Idx, MRI, m_ICst(IdxVal))) {
    int64_t Offset = IdxVal * EltBytes;
    PtrInfo = PtrInfo.getWithOffset(Offset);
    EltAlign = commonAlignment(VecAlign, Offset);
  } else {
    // We lose information with a variable offset.
    EltAlign = getStackTemporaryAlignment(EltTy);
    PtrInfo = MachinePointerInfo(MRI.getType(EltPtr).getAddressSpace());
  }

  if (InsertVal) {
    // Write the inserted element
    MIRBuilder.buildStore(InsertVal, EltPtr, PtrInfo, EltAlign);

    // Reload the whole vector.
    MIRBuilder.buildLoad(DstReg, StackTemp, PtrInfo, VecAlign);
  } else {
    MIRBuilder.buildLoad(DstReg, EltPtr, PtrInfo, EltAlign);
  }

  MI.eraseFromParent();
  return Legalized;
}
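
// Note on the alignment bookkeeping above: if, say, a <4 x s32> vector is
// spilled to a 16-byte-aligned slot, a constant index of 2 gives byte
// offset 8 and an element alignment of commonAlignment(16, 8) = 8, whereas
// a variable index has to fall back to the element type's own stack
// alignment and an offset-less MachinePointerInfo.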

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerShuffleVector(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  LLT Src0Ty = MRI.getType(Src0Reg);
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(32);

  ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();

  if (DstTy.isScalar()) {
    if (Src0Ty.isVector())
      return UnableToLegalize;

    // This is just a SELECT.
    assert(Mask.size() == 1 && "Expected a single mask element");
    Register Val;
    if (Mask[0] < 0 || Mask[0] > 1)
      Val = MIRBuilder.buildUndef(DstTy).getReg(0);
    else
      Val = Mask[0] == 0 ? Src0Reg : Src1Reg;
    MIRBuilder.buildCopy(DstReg, Val);
    MI.eraseFromParent();
    return Legalized;
  }

  Register Undef;
  SmallVector<Register, 32> BuildVec;
  LLT EltTy = DstTy.getElementType();

  for (int Idx : Mask) {
    if (Idx < 0) {
      if (!Undef.isValid())
        Undef = MIRBuilder.buildUndef(EltTy).getReg(0);
      BuildVec.push_back(Undef);
      continue;
    }

    if (Src0Ty.isScalar()) {
      BuildVec.push_back(Idx == 0 ? Src0Reg : Src1Reg);
    } else {
      int NumElts = Src0Ty.getNumElements();
      Register SrcVec = Idx < NumElts ? Src0Reg : Src1Reg;
      int ExtractIdx = Idx < NumElts ? Idx : Idx - NumElts;
      auto IdxK = MIRBuilder.buildConstant(IdxTy, ExtractIdx);
      auto Extract = MIRBuilder.buildExtractVectorElement(EltTy, SrcVec, IdxK);
      BuildVec.push_back(Extract.getReg(0));
    }
  }

  MIRBuilder.buildBuildVector(DstReg, BuildVec);
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) {
  const auto &MF = *MI.getMF();
  const auto &TFI = *MF.getSubtarget().getFrameLowering();
  if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp)
    return UnableToLegalize;

  Register Dst = MI.getOperand(0).getReg();
  Register AllocSize = MI.getOperand(1).getReg();
  Align Alignment = assumeAligned(MI.getOperand(2).getImm());

  LLT PtrTy = MRI.getType(Dst);
  LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());

  Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
  auto SPTmp = MIRBuilder.buildCopy(PtrTy, SPReg);
  SPTmp = MIRBuilder.buildCast(IntPtrTy, SPTmp);

  // Subtract the final alloc from the SP. We use G_PTRTOINT here so we don't
  // have to generate an extra instruction to negate the alloc and then use
  // G_PTR_ADD to add the negative offset.
  auto Alloc = MIRBuilder.buildSub(IntPtrTy, SPTmp, AllocSize);
  if (Alignment > Align(1)) {
    APInt AlignMask(IntPtrTy.getSizeInBits(), Alignment.value(), true);
    AlignMask.negate();
    auto AlignCst = MIRBuilder.buildConstant(IntPtrTy, AlignMask);
    Alloc = MIRBuilder.buildAnd(IntPtrTy, Alloc, AlignCst);
  }

  SPTmp = MIRBuilder.buildCast(PtrTy, Alloc);
  MIRBuilder.buildCopy(SPReg, SPTmp);
  MIRBuilder.buildCopy(Dst, SPTmp);

  MI.eraseFromParent();
  return Legalized;
}
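
// Worked example for the alignment mask above: allocating 40 bytes with
// 16-byte alignment when the incoming SP value is 0x1000 computes
// Alloc = 0x1000 - 40 = 0xFD8, then 0xFD8 & ~15 = 0xFD0, so the new SP
// (and the returned pointer) is rounded down to a 16-byte boundary.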

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerExtract(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  unsigned Offset = MI.getOperand(2).getImm();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  if (DstTy.isScalar() &&
      (SrcTy.isScalar() ||
       (SrcTy.isVector() && DstTy == SrcTy.getElementType()))) {
    LLT SrcIntTy = SrcTy;
    if (!SrcTy.isScalar()) {
      SrcIntTy = LLT::scalar(SrcTy.getSizeInBits());
      Src = MIRBuilder.buildBitcast(SrcIntTy, Src).getReg(0);
    }

    if (Offset == 0)
      MIRBuilder.buildTrunc(Dst, Src);
    else {
      auto ShiftAmt = MIRBuilder.buildConstant(SrcIntTy, Offset);
      auto Shr = MIRBuilder.buildLShr(SrcIntTy, Src, ShiftAmt);
      MIRBuilder.buildTrunc(Dst, Shr);
    }

    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}

LegalizerHelper::LegalizeResult LegalizerHelper::lowerInsert(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  Register InsertSrc = MI.getOperand(2).getReg();
  uint64_t Offset = MI.getOperand(3).getImm();

  LLT DstTy = MRI.getType(Src);
  LLT InsertTy = MRI.getType(InsertSrc);

  if (InsertTy.isVector() ||
      (DstTy.isVector() && DstTy.getElementType() != InsertTy))
    return UnableToLegalize;

  const DataLayout &DL = MIRBuilder.getDataLayout();
  if ((DstTy.isPointer() &&
       DL.isNonIntegralAddressSpace(DstTy.getAddressSpace())) ||
      (InsertTy.isPointer() &&
       DL.isNonIntegralAddressSpace(InsertTy.getAddressSpace()))) {
    LLVM_DEBUG(dbgs() << "Not casting non-integral address space integer\n");
    return UnableToLegalize;
  }

  LLT IntDstTy = DstTy;

  if (!DstTy.isScalar()) {
    IntDstTy = LLT::scalar(DstTy.getSizeInBits());
    Src = MIRBuilder.buildCast(IntDstTy, Src).getReg(0);
  }

  if (!InsertTy.isScalar()) {
    const LLT IntInsertTy = LLT::scalar(InsertTy.getSizeInBits());
    InsertSrc = MIRBuilder.buildPtrToInt(IntInsertTy, InsertSrc).getReg(0);
  }

  Register ExtInsSrc = MIRBuilder.buildZExt(IntDstTy, InsertSrc).getReg(0);
  if (Offset != 0) {
    auto ShiftAmt = MIRBuilder.buildConstant(IntDstTy, Offset);
    ExtInsSrc = MIRBuilder.buildShl(IntDstTy, ExtInsSrc, ShiftAmt).getReg(0);
  }

  APInt MaskVal = APInt::getBitsSetWithWrap(
      DstTy.getSizeInBits(), Offset + InsertTy.getSizeInBits(), Offset);

  auto Mask = MIRBuilder.buildConstant(IntDstTy, MaskVal);
  auto MaskedSrc = MIRBuilder.buildAnd(IntDstTy, Src, Mask);
  auto Or = MIRBuilder.buildOr(IntDstTy, MaskedSrc, ExtInsSrc);

  MIRBuilder.buildCast(Dst, Or);
  MI.eraseFromParent();
  return Legalized;
}
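
// Worked example for the mask above: inserting an s8 into an s32 at bit
// offset 8 calls APInt::getBitsSetWithWrap(32, 16, 8), which wraps around
// and sets bits [16, 32) and [0, 8), i.e. 0xFFFF00FF. The result is then
// (Src & 0xFFFF00FF) | (zext(InsertSrc) << 8), clearing exactly the byte
// being replaced.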

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerSADDO_SSUBO(MachineInstr &MI) {
  Register Dst0 = MI.getOperand(0).getReg();
  Register Dst1 = MI.getOperand(1).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  const bool IsAdd = MI.getOpcode() == TargetOpcode::G_SADDO;

  LLT Ty = MRI.getType(Dst0);
  LLT BoolTy = MRI.getType(Dst1);

  if (IsAdd)
    MIRBuilder.buildAdd(Dst0, LHS, RHS);
  else
    MIRBuilder.buildSub(Dst0, LHS, RHS);

  // TODO: If SADDSAT/SSUBSAT is legal, compare results to detect overflow.

  auto Zero = MIRBuilder.buildConstant(Ty, 0);

  // For an addition, the result should be less than one of the operands (LHS)
  // if and only if the other operand (RHS) is negative, otherwise there will
  // be overflow.
  // For a subtraction, the result should be less than one of the operands
  // (LHS) if and only if the other operand (RHS) is (non-zero) positive,
  // otherwise there will be overflow.
  auto ResultLowerThanLHS =
      MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, Dst0, LHS);
  auto ConditionRHS = MIRBuilder.buildICmp(
      IsAdd ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGT, BoolTy, RHS, Zero);

  MIRBuilder.buildXor(Dst1, ConditionRHS, ResultLowerThanLHS);
  MI.eraseFromParent();
  return Legalized;
}
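
// Worked example for the overflow check above: s8 G_SADDO of 100 + 100
// wraps to -56. The result is less than LHS (-56 < 100) but RHS is not
// negative, so the two compares disagree and the XOR reports overflow.
// For 100 + (-100) = 0 both compares are true, they agree, and no overflow
// is reported.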

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerAddSubSatToMinMax(MachineInstr &MI) {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  LLT Ty = MRI.getType(Res);
  bool IsSigned;
  bool IsAdd;
  unsigned BaseOp;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("unexpected addsat/subsat opcode");
  case TargetOpcode::G_UADDSAT:
    IsSigned = false;
    IsAdd = true;
    BaseOp = TargetOpcode::G_ADD;
    break;
  case TargetOpcode::G_SADDSAT:
    IsSigned = true;
    IsAdd = true;
    BaseOp = TargetOpcode::G_ADD;
    break;
  case TargetOpcode::G_USUBSAT:
    IsSigned = false;
    IsAdd = false;
    BaseOp = TargetOpcode::G_SUB;
    break;
  case TargetOpcode::G_SSUBSAT:
    IsSigned = true;
    IsAdd = false;
    BaseOp = TargetOpcode::G_SUB;
    break;
  }

  if (IsSigned) {
    // sadd.sat(a, b) ->
    //   hi = 0x7fffffff - smax(a, 0)
    //   lo = 0x80000000 - smin(a, 0)
    //   a + smin(smax(lo, b), hi)
    // ssub.sat(a, b) ->
    //   lo = smax(a, -1) - 0x7fffffff
    //   hi = smin(a, -1) - 0x80000000
    //   a - smin(smax(lo, b), hi)
    // TODO: AMDGPU can use a "median of 3" instruction here:
    //   a +/- med3(lo, b, hi)
    uint64_t NumBits = Ty.getScalarSizeInBits();
    auto MaxVal =
        MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(NumBits));
    auto MinVal =
        MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
    MachineInstrBuilder Hi, Lo;
    if (IsAdd) {
      auto Zero = MIRBuilder.buildConstant(Ty, 0);
      Hi = MIRBuilder.buildSub(Ty, MaxVal, MIRBuilder.buildSMax(Ty, LHS, Zero));
      Lo = MIRBuilder.buildSub(Ty, MinVal, MIRBuilder.buildSMin(Ty, LHS, Zero));
    } else {
      auto NegOne = MIRBuilder.buildConstant(Ty, -1);
      Lo = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMax(Ty, LHS, NegOne),
                               MaxVal);
      Hi = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMin(Ty, LHS, NegOne),
                               MinVal);
    }
    auto RHSClamped =
        MIRBuilder.buildSMin(Ty, MIRBuilder.buildSMax(Ty, Lo, RHS), Hi);
    MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, RHSClamped});
  } else {
    // uadd.sat(a, b) -> a + umin(~a, b)
    // usub.sat(a, b) -> a - umin(a, b)
    Register Not = IsAdd ? MIRBuilder.buildNot(Ty, LHS).getReg(0) : LHS;
    auto Min = MIRBuilder.buildUMin(Ty, Not, RHS);
    MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, Min});
  }

  MI.eraseFromParent();
  return Legalized;
}
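
// Worked example for the signed clamp above: s8 G_SADDSAT of 100 + 100
// computes hi = 127 - smax(100, 0) = 27, lo = -128 - smin(100, 0) = -128,
// clamps RHS to smin(smax(-128, 100), 27) = 27, and returns 100 + 27 = 127,
// the saturated result, without the intermediate add ever overflowing.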

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerAddSubSatToAddoSubo(MachineInstr &MI) {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  LLT Ty = MRI.getType(Res);
  LLT BoolTy = Ty.changeElementSize(1);
  bool IsSigned;
  bool IsAdd;
  unsigned OverflowOp;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("unexpected addsat/subsat opcode");
  case TargetOpcode::G_UADDSAT:
    IsSigned = false;
    IsAdd = true;
    OverflowOp = TargetOpcode::G_UADDO;
    break;
  case TargetOpcode::G_SADDSAT:
    IsSigned = true;
    IsAdd = true;
    OverflowOp = TargetOpcode::G_SADDO;
    break;
  case TargetOpcode::G_USUBSAT:
    IsSigned = false;
    IsAdd = false;
    OverflowOp = TargetOpcode::G_USUBO;
    break;
  case TargetOpcode::G_SSUBSAT:
    IsSigned = true;
    IsAdd = false;
    OverflowOp = TargetOpcode::G_SSUBO;
    break;
  }

  auto OverflowRes =
      MIRBuilder.buildInstr(OverflowOp, {Ty, BoolTy}, {LHS, RHS});
  Register Tmp = OverflowRes.getReg(0);
  Register Ov = OverflowRes.getReg(1);
  MachineInstrBuilder Clamp;
  if (IsSigned) {
    // sadd.sat(a, b) ->
    //   {tmp, ov} = saddo(a, b)
    //   ov ? (tmp >>s 31) + 0x80000000 : r
    // ssub.sat(a, b) ->
    //   {tmp, ov} = ssubo(a, b)
    //   ov ? (tmp >>s 31) + 0x80000000 : r
    uint64_t NumBits = Ty.getScalarSizeInBits();
    auto ShiftAmount = MIRBuilder.buildConstant(Ty, NumBits - 1);
    auto Sign = MIRBuilder.buildAShr(Ty, Tmp, ShiftAmount);
    auto MinVal =
        MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
    Clamp = MIRBuilder.buildAdd(Ty, Sign, MinVal);
  } else {
    // uadd.sat(a, b) ->
    //   {tmp, ov} = uaddo(a, b)
    //   ov ? 0xffffffff : tmp
    // usub.sat(a, b) ->
    //   {tmp, ov} = usubo(a, b)
    //   ov ? 0 : tmp
    Clamp = MIRBuilder.buildConstant(Ty, IsAdd ? -1 : 0);
  }
  MIRBuilder.buildSelect(Res, Ov, Clamp, Tmp);

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerShlSat(MachineInstr &MI) {
  assert((MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
          MI.getOpcode() == TargetOpcode::G_USHLSAT) &&
         "Expected shlsat opcode!");
  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SSHLSAT;
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  LLT Ty = MRI.getType(Res);
  LLT BoolTy = Ty.changeElementSize(1);

  unsigned BW = Ty.getScalarSizeInBits();
  auto Result = MIRBuilder.buildShl(Ty, LHS, RHS);
  auto Orig = IsSigned ? MIRBuilder.buildAShr(Ty, Result, RHS)
                       : MIRBuilder.buildLShr(Ty, Result, RHS);

  MachineInstrBuilder SatVal;
  if (IsSigned) {
    auto SatMin = MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(BW));
    auto SatMax = MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(BW));
    auto Cmp = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, LHS,
                                    MIRBuilder.buildConstant(Ty, 0));
    SatVal = MIRBuilder.buildSelect(Ty, Cmp, SatMin, SatMax);
  } else {
    SatVal = MIRBuilder.buildConstant(Ty, APInt::getMaxValue(BW));
  }
  auto Ov = MIRBuilder.buildICmp(CmpInst::ICMP_NE, BoolTy, LHS, Orig);
  MIRBuilder.buildSelect(Res, Ov, SatVal, Result);

  MI.eraseFromParent();
  return Legalized;
}
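
// Worked example for the unsigned shift-saturate case above: s8 G_USHLSAT
// of 0x40 by 2 computes Result = 0x40 << 2 = 0 (the set bit is shifted
// out), and shifting back yields 0 != 0x40, so the compare detects the
// lost bits and the select returns the saturation value 0xFF instead.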

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerBswap(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  const LLT Ty = MRI.getType(Src);
  unsigned SizeInBytes = (Ty.getScalarSizeInBits() + 7) / 8;
  unsigned BaseShiftAmt = (SizeInBytes - 1) * 8;

  // Swap most and least significant byte, set remaining bytes in Res to zero.
  auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt);
  auto LSByteShiftedLeft = MIRBuilder.buildShl(Ty, Src, ShiftAmt);
  auto MSByteShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt);
  auto Res = MIRBuilder.buildOr(Ty, MSByteShiftedRight, LSByteShiftedLeft);

  // Set i-th high/low byte in Res to i-th low/high byte from Src.
  for (unsigned i = 1; i < SizeInBytes / 2; ++i) {
    // AND with Mask leaves byte i unchanged and sets remaining bytes to 0.
    APInt APMask(SizeInBytes * 8, 0xFF << (i * 8));
    auto Mask = MIRBuilder.buildConstant(Ty, APMask);
    auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt - 16 * i);
    // Low byte shifted left to place of high byte: (Src & Mask) << ShiftAmt.
    auto LoByte = MIRBuilder.buildAnd(Ty, Src, Mask);
    auto LoShiftedLeft = MIRBuilder.buildShl(Ty, LoByte, ShiftAmt);
    Res = MIRBuilder.buildOr(Ty, Res, LoShiftedLeft);
    // High byte shifted right to place of low byte: (Src >> ShiftAmt) & Mask.
    auto SrcShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt);
    auto HiShiftedRight = MIRBuilder.buildAnd(Ty, SrcShiftedRight, Mask);
    Res = MIRBuilder.buildOr(Ty, Res, HiShiftedRight);
  }
  Res.getInstr()->getOperand(0).setReg(Dst);

  MI.eraseFromParent();
  return Legalized;
}
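
// Worked example for the bswap loop above, with s32 Src = 0x12345678:
// the initial swap produces (Src >> 24) | (Src << 24) = 0x78000012; the
// i = 1 iteration uses Mask = 0xFF00 and ShiftAmt = 8, OR-ing in
// (Src & 0xFF00) << 8 = 0x560000 and (Src >> 8) & 0xFF00 = 0x3400, for a
// final value of 0x78563412.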

//{ (Src & Mask) >> N } | { (Src << N) & Mask }
static MachineInstrBuilder SwapN(unsigned N, DstOp Dst, MachineIRBuilder &B,
                                 MachineInstrBuilder Src, APInt Mask) {
  const LLT Ty = Dst.getLLTTy(*B.getMRI());
  MachineInstrBuilder C_N = B.buildConstant(Ty, N);
  MachineInstrBuilder MaskLoNTo0 = B.buildConstant(Ty, Mask);
  auto LHS = B.buildLShr(Ty, B.buildAnd(Ty, Src, MaskLoNTo0), C_N);
  auto RHS = B.buildAnd(Ty, B.buildShl(Ty, Src, C_N), MaskLoNTo0);
  return B.buildOr(Dst, LHS, RHS);
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerBitreverse(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  const LLT Ty = MRI.getType(Src);
  unsigned Size = Ty.getSizeInBits();

  MachineInstrBuilder BSWAP =
      MIRBuilder.buildInstr(TargetOpcode::G_BSWAP, {Ty}, {Src});

  // swap high and low 4 bits in 8 bit blocks 7654|3210 -> 3210|7654
  //    [(val & 0xF0F0F0F0) >> 4] | [(val & 0x0F0F0F0F) << 4]
  // -> [(val & 0xF0F0F0F0) >> 4] | [(val << 4) & 0xF0F0F0F0]
  MachineInstrBuilder Swap4 =
      SwapN(4, Ty, MIRBuilder, BSWAP, APInt::getSplat(Size, APInt(8, 0xF0)));

  // swap high and low 2 bits in 4 bit blocks 32|10 76|54 -> 10|32 54|76
  //    [(val & 0xCCCCCCCC) >> 2] | [(val & 0x33333333) << 2]
  // -> [(val & 0xCCCCCCCC) >> 2] | [(val << 2) & 0xCCCCCCCC]
  MachineInstrBuilder Swap2 =
      SwapN(2, Ty, MIRBuilder, Swap4, APInt::getSplat(Size, APInt(8, 0xCC)));

  // swap high and low 1 bit in 2 bit blocks 1|0 3|2 5|4 7|6 -> 0|1 2|3 4|5 6|7
  //    [(val & 0xAAAAAAAA) >> 1] | [(val & 0x55555555) << 1]
  // -> [(val & 0xAAAAAAAA) >> 1] | [(val << 1) & 0xAAAAAAAA]
  SwapN(1, Dst, MIRBuilder, Swap2, APInt::getSplat(Size, APInt(8, 0xAA)));

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerReadWriteRegister(MachineInstr &MI) {
  MachineFunction &MF = MIRBuilder.getMF();

  bool IsRead = MI.getOpcode() == TargetOpcode::G_READ_REGISTER;
  int NameOpIdx = IsRead ? 1 : 0;
  int ValRegIndex = IsRead ? 0 : 1;

  Register ValReg = MI.getOperand(ValRegIndex).getReg();
  const LLT Ty = MRI.getType(ValReg);
  const MDString *RegStr = cast<MDString>(
      cast<MDNode>(MI.getOperand(NameOpIdx).getMetadata())->getOperand(0));

  Register PhysReg = TLI.getRegisterByName(RegStr->getString().data(), Ty, MF);
  if (!PhysReg.isValid())
    return UnableToLegalize;

  if (IsRead)
    MIRBuilder.buildCopy(ValReg, PhysReg);
  else
    MIRBuilder.buildCopy(PhysReg, ValReg);

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerSMULH_UMULH(MachineInstr &MI) {
  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULH;
  unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
  Register Result = MI.getOperand(0).getReg();
  LLT OrigTy = MRI.getType(Result);
  auto SizeInBits = OrigTy.getScalarSizeInBits();
  LLT WideTy = OrigTy.changeElementSize(SizeInBits * 2);

  auto LHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(1)});
  auto RHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(2)});
  auto Mul = MIRBuilder.buildMul(WideTy, LHS, RHS);
  unsigned ShiftOp = IsSigned ? TargetOpcode::G_ASHR : TargetOpcode::G_LSHR;

  auto ShiftAmt = MIRBuilder.buildConstant(WideTy, SizeInBits);
  auto Shifted = MIRBuilder.buildInstr(ShiftOp, {WideTy}, {Mul, ShiftAmt});
  MIRBuilder.buildTrunc(Result, Shifted);

  MI.eraseFromParent();
  return Legalized;
}
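
// For example, the lowering above turns
//   %hi:_(s32) = G_SMULH %a, %b
// into
//   %x:_(s64) = G_SEXT %a
//   %y:_(s64) = G_SEXT %b
//   %m:_(s64) = G_MUL %x, %y
//   %s:_(s64) = G_ASHR %m, 32
//   %hi:_(s32) = G_TRUNC %s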

LegalizerHelper::LegalizeResult LegalizerHelper::lowerSelect(MachineInstr &MI) {
  // Implement vector G_SELECT in terms of XOR, AND, OR.
  Register DstReg = MI.getOperand(0).getReg();
  Register MaskReg = MI.getOperand(1).getReg();
  Register Op1Reg = MI.getOperand(2).getReg();
  Register Op2Reg = MI.getOperand(3).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT MaskTy = MRI.getType(MaskReg);
  LLT Op1Ty = MRI.getType(Op1Reg);
  if (!DstTy.isVector())
    return UnableToLegalize;

  // Vector selects can have a scalar predicate. If so, splat it into a vector
  // and return, so that later legalization attempts can try again.
  if (MaskTy.isScalar()) {
    Register MaskElt = MaskReg;
    if (MaskTy.getSizeInBits() < DstTy.getScalarSizeInBits())
      MaskElt = MIRBuilder.buildSExt(DstTy.getElementType(), MaskElt).getReg(0);
    // Generate a vector splat idiom to be pattern matched later.
    auto ShufSplat = MIRBuilder.buildShuffleSplat(DstTy, MaskElt);
    Observer.changingInstr(MI);
    MI.getOperand(1).setReg(ShufSplat.getReg(0));
    Observer.changedInstr(MI);
    return Legalized;
  }

  if (MaskTy.getSizeInBits() != Op1Ty.getSizeInBits()) {
    return UnableToLegalize;
  }

  auto NotMask = MIRBuilder.buildNot(MaskTy, MaskReg);
  auto NewOp1 = MIRBuilder.buildAnd(MaskTy, Op1Reg, MaskReg);
  auto NewOp2 = MIRBuilder.buildAnd(MaskTy, Op2Reg, NotMask);
  MIRBuilder.buildOr(DstReg, NewOp1, NewOp2);
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult LegalizerHelper::lowerDIVREM(MachineInstr &MI) {
  // Split DIVREM into individual instructions.
  unsigned Opcode = MI.getOpcode();

  MIRBuilder.buildInstr(
      Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SDIV
                                        : TargetOpcode::G_UDIV,
      {MI.getOperand(0).getReg()}, {MI.getOperand(2), MI.getOperand(3)});
  MIRBuilder.buildInstr(
      Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SREM
                                        : TargetOpcode::G_UREM,
      {MI.getOperand(1).getReg()}, {MI.getOperand(2), MI.getOperand(3)});
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerAbsToAddXor(MachineInstr &MI) {
  // Expand %res = G_ABS %a into:
  // %v1 = G_ASHR %a, scalar_size-1
  // %v2 = G_ADD %a, %v1
  // %res = G_XOR %v2, %v1
  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
  Register OpReg = MI.getOperand(1).getReg();
  auto ShiftAmt =
      MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - 1);
  auto Shift = MIRBuilder.buildAShr(DstTy, OpReg, ShiftAmt);
  auto Add = MIRBuilder.buildAdd(DstTy, OpReg, Shift);
  MIRBuilder.buildXor(MI.getOperand(0).getReg(), Add, Shift);
  MI.eraseFromParent();
  return Legalized;
}
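
// Worked example for the G_ABS expansion above, with s32 %a = -5:
//   %v1 = -5 >> 31 (arithmetic) = -1
//   %v2 = -5 + -1 = -6
//   %res = -6 ^ -1 = 5
// For non-negative inputs %v1 is 0, and both the add and the xor are
// no-ops.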

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerAbsToMaxNeg(MachineInstr &MI) {
  // Expand %res = G_ABS %a into:
  // %v1 = G_CONSTANT 0
  // %v2 = G_SUB %v1, %a
  // %res = G_SMAX %a, %v2
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(SrcReg);
  auto Zero = MIRBuilder.buildConstant(Ty, 0).getReg(0);
  auto Sub = MIRBuilder.buildSub(Ty, Zero, SrcReg).getReg(0);
  MIRBuilder.buildSMax(MI.getOperand(0), SrcReg, Sub);
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerVectorReduction(MachineInstr &MI) {
  Register SrcReg = MI.getOperand(1).getReg();
  LLT SrcTy = MRI.getType(SrcReg);
  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());

  // The source could be a scalar if the IR type was <1 x sN>.
  if (SrcTy.isScalar()) {
    if (DstTy.getSizeInBits() > SrcTy.getSizeInBits())
      return UnableToLegalize; // FIXME: handle extension.
    // This can be just a plain copy.
    Observer.changingInstr(MI);
    MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::COPY));
    Observer.changedInstr(MI);
    return Legalized;
  }
  return UnableToLegalize;
}

static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
  // On Darwin, -Os means optimize for size without hurting performance, so
  // only really optimize for size when -Oz (MinSize) is used.
  if (MF.getTarget().getTargetTriple().isOSDarwin())
    return MF.getFunction().hasMinSize();
  return MF.getFunction().hasOptSize();
}

// Returns a list of types to use for memory op lowering in MemOps. A partial
// port of findOptimalMemOpLowering in TargetLowering.
static bool findGISelOptimalMemOpLowering(std::vector<LLT> &MemOps,
                                          unsigned Limit, const MemOp &Op,
                                          unsigned DstAS, unsigned SrcAS,
                                          const AttributeList &FuncAttributes,
                                          const TargetLowering &TLI) {
  if (Op.isMemcpyWithFixedDstAlign() && Op.getSrcAlign() < Op.getDstAlign())
    return false;

  LLT Ty = TLI.getOptimalMemOpLLT(Op, FuncAttributes);

  if (Ty == LLT()) {
    // Use the largest scalar type whose alignment constraints are satisfied.
    // We only need to check DstAlign here as SrcAlign is always greater or
    // equal to DstAlign (or zero).
    Ty = LLT::scalar(64);
    if (Op.isFixedDstAlign())
      while (Op.getDstAlign() < Ty.getSizeInBytes() &&
             !TLI.allowsMisalignedMemoryAccesses(Ty, DstAS, Op.getDstAlign()))
        Ty = LLT::scalar(Ty.getSizeInBytes());
    assert(Ty.getSizeInBits() > 0 && "Could not find valid type");
    // FIXME: check for the largest legal type we can load/store to.
  }

  unsigned NumMemOps = 0;
  uint64_t Size = Op.size();
  while (Size) {
    unsigned TySize = Ty.getSizeInBytes();
    while (TySize > Size) {
      // For now, only use non-vector loads / stores for the left-over pieces.
      LLT NewTy = Ty;
      // FIXME: check for mem op safety and legality of the types. Not all of
      // SDAGisms map cleanly to GISel concepts.
      if (NewTy.isVector())
        NewTy = NewTy.getSizeInBits() > 64 ? LLT::scalar(64) : LLT::scalar(32);
      NewTy = LLT::scalar(PowerOf2Floor(NewTy.getSizeInBits() - 1));
      unsigned NewTySize = NewTy.getSizeInBytes();
      assert(NewTySize > 0 && "Could not find appropriate type");

      // If the new LLT cannot cover all of the remaining bits, then consider
      // issuing a (or a pair of) unaligned and overlapping load / store.
      bool Fast;
      // Need to get a VT equivalent for allowsMisalignedMemoryAccesses().
      MVT VT = getMVTForLLT(Ty);
      if (NumMemOps && Op.allowOverlap() && NewTySize < Size &&
          TLI.allowsMisalignedMemoryAccesses(
              VT, DstAS, Op.isFixedDstAlign() ? Op.getDstAlign() : Align(1),
              MachineMemOperand::MONone, &Fast) &&
          Fast)
        TySize = Size;
      else {
        Ty = NewTy;
        TySize = NewTySize;
      }
    }

    if (++NumMemOps > Limit)
      return false;

    MemOps.push_back(Ty);
    Size -= TySize;
  }

  return true;
}

static Type *getTypeForLLT(LLT Ty, LLVMContext &C) {
  if (Ty.isVector())
    return FixedVectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()),
                                Ty.getNumElements());
  return IntegerType::get(C, Ty.getSizeInBits());
}

// Get a vectorized representation of the memset value operand, GISel edition.
static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) {
  MachineRegisterInfo &MRI = *MIB.getMRI();
  unsigned NumBits = Ty.getScalarSizeInBits();
  auto ValVRegAndVal = getIConstantVRegValWithLookThrough(Val, MRI);
  if (!Ty.isVector() && ValVRegAndVal) {
    APInt Scalar = ValVRegAndVal->Value.truncOrSelf(8);
    APInt SplatVal = APInt::getSplat(NumBits, Scalar);
    return MIB.buildConstant(Ty, SplatVal).getReg(0);
  }

  // Extend the byte value to the larger type, and then multiply by a magic
  // value 0x010101... in order to replicate it across every byte.
  // Unless it's zero, in which case just emit a larger G_CONSTANT 0.
  if (ValVRegAndVal && ValVRegAndVal->Value == 0) {
    return MIB.buildConstant(Ty, 0).getReg(0);
  }

  LLT ExtType = Ty.getScalarType();
  auto ZExt = MIB.buildZExtOrTrunc(ExtType, Val);
  if (NumBits > 8) {
    APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01));
    auto MagicMI = MIB.buildConstant(ExtType, Magic);
    Val = MIB.buildMul(ExtType, ZExt, MagicMI).getReg(0);
  }

  // For vector types create a G_BUILD_VECTOR.
  if (Ty.isVector())
    Val = MIB.buildSplatVector(Ty, Val).getReg(0);

  return Val;
}
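
// Worked example for the byte-splat above: a non-constant memset value of
// 0xAB stored with s32 accesses is zero-extended and multiplied by the
// magic constant 0x01010101, giving 0xAB * 0x01010101 = 0xABABABAB, the
// byte pattern replicated across the wider store type.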

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMemset(MachineInstr &MI, Register Dst, Register Val,
                             uint64_t KnownLen, Align Alignment,
                             bool IsVolatile) {
  auto &MF = *MI.getParent()->getParent();
  const auto &TLI = *MF.getSubtarget().getTargetLowering();
  auto &DL = MF.getDataLayout();
  LLVMContext &C = MF.getFunction().getContext();

  assert(KnownLen != 0 && "Have a zero length memset length!");

  bool DstAlignCanChange = false;
  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool OptSize = shouldLowerMemFuncForSize(MF);

  MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
  if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
    DstAlignCanChange = true;

  unsigned Limit = TLI.getMaxStoresPerMemset(OptSize);
  std::vector<LLT> MemOps;

  const auto &DstMMO = **MI.memoperands_begin();
  MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();

  auto ValVRegAndVal = getIConstantVRegValWithLookThrough(Val, MRI);
  bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0;

  if (!findGISelOptimalMemOpLowering(MemOps, Limit,
                                     MemOp::Set(KnownLen, DstAlignCanChange,
                                                Alignment,
                                                /*IsZeroMemset=*/IsZeroVal,
                                                /*IsVolatile=*/IsVolatile),
                                     DstPtrInfo.getAddrSpace(), ~0u,
                                     MF.getFunction().getAttributes(), TLI))
    return UnableToLegalize;

  if (DstAlignCanChange) {
    // Get an estimate of the type from the LLT.
    Type *IRTy = getTypeForLLT(MemOps[0], C);
    Align NewAlign = DL.getABITypeAlign(IRTy);
    if (NewAlign > Alignment) {
      Alignment = NewAlign;
      unsigned FI = FIDef->getOperand(1).getIndex();
      // Give the stack frame object a larger alignment if needed.
      if (MFI.getObjectAlign(FI) < Alignment)
        MFI.setObjectAlignment(FI, Alignment);
    }
  }

  MachineIRBuilder MIB(MI);
  // Find the largest store and generate the bit pattern for it.
  LLT LargestTy = MemOps[0];
  for (unsigned i = 1; i < MemOps.size(); i++)
    if (MemOps[i].getSizeInBits() > LargestTy.getSizeInBits())
      LargestTy = MemOps[i];

  // The memset stored value is always defined as an s8, so in order to make it
  // work with larger store types we need to repeat the bit pattern across the
  // wider type.
  Register MemSetValue = getMemsetValue(Val, LargestTy, MIB);

  if (!MemSetValue)
    return UnableToLegalize;

  // Generate the stores. For each store type in the list, we generate the
  // matching store of that type to the destination address.
  LLT PtrTy = MRI.getType(Dst);
  unsigned DstOff = 0;
  unsigned Size = KnownLen;
  for (unsigned I = 0; I < MemOps.size(); I++) {
    LLT Ty = MemOps[I];
    unsigned TySize = Ty.getSizeInBytes();
    if (TySize > Size) {
      // Issuing an unaligned load / store pair that overlaps with the previous
      // pair. Adjust the offset accordingly.
      assert(I == MemOps.size() - 1 && I != 0);
      DstOff -= TySize - Size;
    }

    // If this store is smaller than the largest store, see whether we can get
    // the smaller value for free with a truncate.
    Register Value = MemSetValue;
    if (Ty.getSizeInBits() < LargestTy.getSizeInBits()) {
      MVT VT = getMVTForLLT(Ty);
      MVT LargestVT = getMVTForLLT(LargestTy);
      if (!LargestTy.isVector() && !Ty.isVector() &&
          TLI.isTruncateFree(LargestVT, VT))
        Value = MIB.buildTrunc(Ty, MemSetValue).getReg(0);
      else
        Value = getMemsetValue(Val, Ty, MIB);
      if (!Value)
        return UnableToLegalize;
    }

    auto *StoreMMO = MF.getMachineMemOperand(&DstMMO, DstOff, Ty);

    Register Ptr = Dst;
    if (DstOff != 0) {
      auto Offset =
          MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff);
      Ptr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
    }

    MIB.buildStore(Value, Ptr, *StoreMMO);
    DstOff += Ty.getSizeInBytes();
    Size -= TySize;
  }

  MI.eraseFromParent();
  return Legalized;
}
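
// Illustrative example of the overlapping tail above (the actual type list
// is target-dependent): a 9-byte memset lowered as two s64 stores first
// stores bytes [0, 8), leaving Size = 1; the second store has TySize 8 > 1,
// so DstOff is adjusted to 9 - 8 = 1 and the store covers bytes [1, 9),
// overlapping the first store rather than emitting a chain of narrow ones.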

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMemcpyInline(MachineInstr &MI) {
  assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  Register Len = MI.getOperand(2).getReg();

  const auto *MMOIt = MI.memoperands_begin();
  const MachineMemOperand *MemOp = *MMOIt;
  bool IsVolatile = MemOp->isVolatile();

  // See if this is a constant length copy
  auto LenVRegAndVal = getIConstantVRegValWithLookThrough(Len, MRI);
  // FIXME: support dynamically sized G_MEMCPY_INLINE
  assert(LenVRegAndVal.hasValue() &&
         "inline memcpy with dynamic size is not yet supported");
  uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
  if (KnownLen == 0) {
    MI.eraseFromParent();
    return Legalized;
  }

  const auto &DstMMO = **MI.memoperands_begin();
  const auto &SrcMMO = **std::next(MI.memoperands_begin());
  Align DstAlign = DstMMO.getBaseAlign();
  Align SrcAlign = SrcMMO.getBaseAlign();

  return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
                           IsVolatile);
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMemcpyInline(MachineInstr &MI, Register Dst, Register Src,
                                   uint64_t KnownLen, Align DstAlign,
                                   Align SrcAlign, bool IsVolatile) {
  assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
  return lowerMemcpy(MI, Dst, Src, KnownLen,
                     std::numeric_limits<uint64_t>::max(), DstAlign, SrcAlign,
                     IsVolatile);
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src,
                             uint64_t KnownLen, uint64_t Limit, Align DstAlign,
                             Align SrcAlign, bool IsVolatile) {
  auto &MF = *MI.getParent()->getParent();
  const auto &TLI = *MF.getSubtarget().getTargetLowering();
  auto &DL = MF.getDataLayout();
  LLVMContext &C = MF.getFunction().getContext();

  assert(KnownLen != 0 && "Have a zero length memcpy length!");

  bool DstAlignCanChange = false;
  MachineFrameInfo &MFI = MF.getFrameInfo();
  Align Alignment = commonAlignment(DstAlign, SrcAlign);

  MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
  if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
    DstAlignCanChange = true;

  // FIXME: infer better src pointer alignment like SelectionDAG does here.
  // FIXME: also use the equivalent of isMemSrcFromConstant and alwaysinlining
  // if the memcpy is in a tail call position.

  std::vector<LLT> MemOps;

  const auto &DstMMO = **MI.memoperands_begin();
  const auto &SrcMMO = **std::next(MI.memoperands_begin());
  MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
  MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();

  if (!findGISelOptimalMemOpLowering(
          MemOps, Limit,
          MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
                      IsVolatile),
          DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
          MF.getFunction().getAttributes(), TLI))
    return UnableToLegalize;

  if (DstAlignCanChange) {
    // Get an estimate of the type from the LLT.
    Type *IRTy = getTypeForLLT(MemOps[0], C);
    Align NewAlign = DL.getABITypeAlign(IRTy);

    // Don't promote to an alignment that would require dynamic stack
    // realignment.
    const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
    if (!TRI->hasStackRealignment(MF))
      while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
        NewAlign = NewAlign / 2;

    if (NewAlign > Alignment) {
      Alignment = NewAlign;
      unsigned FI = FIDef->getOperand(1).getIndex();
      // Give the stack frame object a larger alignment if needed.
      if (MFI.getObjectAlign(FI) < Alignment)
        MFI.setObjectAlignment(FI, Alignment);
    }
  }

  LLVM_DEBUG(dbgs() << "Inlining memcpy: " << MI << " into loads & stores\n");

  MachineIRBuilder MIB(MI);
  // Now we need to emit a load / store pair for each of the types we've
  // collected. I.e. for each type, generate a load from the source pointer of
  // that type width, and then generate a corresponding store to the dest
  // buffer of that value loaded. This can result in a sequence of loads and
  // stores of mixed types, depending on what the target specifies as good
  // types to use.
  unsigned CurrOffset = 0;
  LLT PtrTy = MRI.getType(Src);
  unsigned Size = KnownLen;
  for (auto CopyTy : MemOps) {
    // Issuing an unaligned load / store pair that overlaps with the previous
    // pair. Adjust the offset accordingly.
    if (CopyTy.getSizeInBytes() > Size)
      CurrOffset -= CopyTy.getSizeInBytes() - Size;

    // Construct MMOs for the accesses.
    auto *LoadMMO =
        MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
    auto *StoreMMO =
        MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());

    // Create the load.
    Register LoadPtr = Src;
    Register Offset;
    if (CurrOffset != 0) {
      Offset = MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset)
                   .getReg(0);
      LoadPtr = MIB.buildPtrAdd(PtrTy, Src, Offset).getReg(0);
    }
    auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO);

    // Create the store.
    Register StorePtr =
        CurrOffset == 0 ? Dst : MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
    MIB.buildStore(LdVal, StorePtr, *StoreMMO);
    CurrOffset += CopyTy.getSizeInBytes();
    Size -= CopyTy.getSizeInBytes();
  }

  MI.eraseFromParent();
  return Legalized;
}
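
// Illustrative example of the loop above (the chosen types are
// target-dependent): a 16-byte memcpy lowered with MemOps = [s64, s64]
// emits a G_LOAD/G_STORE pair at offset 0 and another at offset 8, each
// with MMOs derived from the original memcpy's operands at the
// corresponding offset.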

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src,
                              uint64_t KnownLen, Align DstAlign, Align SrcAlign,
                              bool IsVolatile) {
  auto &MF = *MI.getParent()->getParent();
  const auto &TLI = *MF.getSubtarget().getTargetLowering();
  auto &DL = MF.getDataLayout();
  LLVMContext &C = MF.getFunction().getContext();

  assert(KnownLen != 0 && "Have a zero length memmove length!");

  bool DstAlignCanChange = false;
  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool OptSize = shouldLowerMemFuncForSize(MF);
  Align Alignment = commonAlignment(DstAlign, SrcAlign);

  MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
  if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
    DstAlignCanChange = true;

  unsigned Limit = TLI.getMaxStoresPerMemmove(OptSize);
  std::vector<LLT> MemOps;

  const auto &DstMMO = **MI.memoperands_begin();
  const auto &SrcMMO = **std::next(MI.memoperands_begin());
  MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
  MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();

  // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently due
  // to a bug in its findOptimalMemOpLowering implementation. For now do the
  // same thing here.
  if (!findGISelOptimalMemOpLowering(
          MemOps, Limit,
          MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
                      /*IsVolatile*/ true),
          DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
          MF.getFunction().getAttributes(), TLI))
    return UnableToLegalize;

  if (DstAlignCanChange) {
    // Get an estimate of the type from the LLT.
    Type *IRTy = getTypeForLLT(MemOps[0], C);
    Align NewAlign = DL.getABITypeAlign(IRTy);

    // Don't promote to an alignment that would require dynamic stack
    // realignment.
    const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
    if (!TRI->hasStackRealignment(MF))
      while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
        NewAlign = NewAlign / 2;

    if (NewAlign > Alignment) {
      Alignment = NewAlign;
      unsigned FI = FIDef->getOperand(1).getIndex();
      // Give the stack frame object a larger alignment if needed.
      if (MFI.getObjectAlign(FI) < Alignment)
        MFI.setObjectAlignment(FI, Alignment);
    }
  }

  LLVM_DEBUG(dbgs() << "Inlining memmove: " << MI << " into loads & stores\n");

  MachineIRBuilder MIB(MI);
  // Memmove requires that we perform the loads first before issuing the
  // stores. Apart from that, this loop is pretty much doing the same thing as
  // the memcpy codegen function.
  unsigned CurrOffset = 0;
  LLT PtrTy = MRI.getType(Src);
  SmallVector<Register, 16> LoadVals;
  for (auto CopyTy : MemOps) {
    // Construct MMO for the load.
    auto *LoadMMO =
        MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());

    // Create the load.
    Register LoadPtr = Src;
    if (CurrOffset != 0) {
      auto Offset =
          MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset);
      LoadPtr = MIB.buildPtrAdd(PtrTy, Src, Offset).getReg(0);
    }
    LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0));
    CurrOffset += CopyTy.getSizeInBytes();
  }

  CurrOffset = 0;
  for (unsigned I = 0; I < MemOps.size(); ++I) {
    LLT CopyTy = MemOps[I];
    // Now store the values loaded.
    auto *StoreMMO =
        MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());

    Register StorePtr = Dst;
    if (CurrOffset != 0) {
      auto Offset =
          MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset);
      StorePtr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
    }
    MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO);
    CurrOffset += CopyTy.getSizeInBytes();
  }
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
  const unsigned Opc = MI.getOpcode();
  // This combine is fairly complex so it's not written with a separate
  // matcher function.
  assert((Opc == TargetOpcode::G_MEMCPY || Opc == TargetOpcode::G_MEMMOVE ||
          Opc == TargetOpcode::G_MEMSET) &&
         "Expected memcpy like instruction");

  auto MMOIt = MI.memoperands_begin();
  const MachineMemOperand *MemOp = *MMOIt;

  Align DstAlign = MemOp->getBaseAlign();
  Align SrcAlign;
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  Register Len = MI.getOperand(2).getReg();

  if (Opc != TargetOpcode::G_MEMSET) {
    assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI");
    MemOp = *(++MMOIt);
    SrcAlign = MemOp->getBaseAlign();
  }

  // See if this is a constant length copy
  auto LenVRegAndVal = getIConstantVRegValWithLookThrough(Len, MRI);
  if (!LenVRegAndVal)
    return UnableToLegalize;
  uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();

  if (KnownLen == 0) {
    MI.eraseFromParent();
    return Legalized;
  }

  bool IsVolatile = MemOp->isVolatile();
  if (Opc == TargetOpcode::G_MEMCPY_INLINE)
    return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
                             IsVolatile);

  // Don't try to optimize volatile.
  if (IsVolatile)
    return UnableToLegalize;

  if (MaxLen && KnownLen > MaxLen)
    return UnableToLegalize;

  if (Opc == TargetOpcode::G_MEMCPY) {
    auto &MF = *MI.getParent()->getParent();
    const auto &TLI = *MF.getSubtarget().getTargetLowering();
    bool OptSize = shouldLowerMemFuncForSize(MF);
    uint64_t Limit = TLI.getMaxStoresPerMemcpy(OptSize);
    return lowerMemcpy(MI, Dst, Src, KnownLen, Limit, DstAlign, SrcAlign,
                       IsVolatile);
  }
  if (Opc == TargetOpcode::G_MEMMOVE)
    return lowerMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile);
  if (Opc == TargetOpcode::G_MEMSET)
    return lowerMemset(MI, Dst, Src, KnownLen, DstAlign, IsVolatile);
  return UnableToLegalize;
}