//===- AArch64InstructionSelector.cpp ----------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the InstructionSelector class for
/// AArch64.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AArch64GlobalISelUtils.h"
#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64RegisterBankInfo.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/Type.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <optional>

#define DEBUG_TYPE "aarch64-isel"

using namespace llvm;
using namespace MIPatternMatch;
using namespace AArch64GISelUtils;

namespace llvm {
class BlockFrequencyInfo;
class ProfileSummaryInfo;
}

namespace {

#define GET_GLOBALISEL_PREDICATE_BITSET
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATE_BITSET

class AArch64InstructionSelector : public InstructionSelector {
public:
  AArch64InstructionSelector(const AArch64TargetMachine &TM,
                             const AArch64Subtarget &STI,
                             const AArch64RegisterBankInfo &RBI);

  bool select(MachineInstr &I) override;
  static const char *getName() { return DEBUG_TYPE; }

  void setupMF(MachineFunction &MF, GISelKnownBits *KB,
               CodeGenCoverage *CoverageInfo, ProfileSummaryInfo *PSI,
               BlockFrequencyInfo *BFI) override {
    InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
    MIB.setMF(MF);

    // hasFnAttribute() is expensive to call on every BRCOND selection, so
    // cache it here for each run of the selector.
    ProduceNonFlagSettingCondBr =
        !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
    MFReturnAddr = Register();

    processPHIs(MF);
  }

private:
  /// tblgen-erated 'select' implementation, used as the initial selector for
  /// the patterns that don't require complex C++.
96 bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const; 97 98 // A lowering phase that runs before any selection attempts. 99 // Returns true if the instruction was modified. 100 bool preISelLower(MachineInstr &I); 101 102 // An early selection function that runs before the selectImpl() call. 103 bool earlySelect(MachineInstr &I); 104 105 /// Save state that is shared between select calls, call select on \p I and 106 /// then restore the saved state. This can be used to recursively call select 107 /// within a select call. 108 bool selectAndRestoreState(MachineInstr &I); 109 110 // Do some preprocessing of G_PHIs before we begin selection. 111 void processPHIs(MachineFunction &MF); 112 113 bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI); 114 115 /// Eliminate same-sized cross-bank copies into stores before selectImpl(). 116 bool contractCrossBankCopyIntoStore(MachineInstr &I, 117 MachineRegisterInfo &MRI); 118 119 bool convertPtrAddToAdd(MachineInstr &I, MachineRegisterInfo &MRI); 120 121 bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF, 122 MachineRegisterInfo &MRI) const; 123 bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF, 124 MachineRegisterInfo &MRI) const; 125 126 ///@{ 127 /// Helper functions for selectCompareBranch. 128 bool selectCompareBranchFedByFCmp(MachineInstr &I, MachineInstr &FCmp, 129 MachineIRBuilder &MIB) const; 130 bool selectCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp, 131 MachineIRBuilder &MIB) const; 132 bool tryOptCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp, 133 MachineIRBuilder &MIB) const; 134 bool tryOptAndIntoCompareBranch(MachineInstr &AndInst, bool Invert, 135 MachineBasicBlock *DstMBB, 136 MachineIRBuilder &MIB) const; 137 ///@} 138 139 bool selectCompareBranch(MachineInstr &I, MachineFunction &MF, 140 MachineRegisterInfo &MRI); 141 142 bool selectVectorAshrLshr(MachineInstr &I, MachineRegisterInfo &MRI); 143 bool selectVectorSHL(MachineInstr &I, MachineRegisterInfo &MRI); 144 145 // Helper to generate an equivalent of scalar_to_vector into a new register, 146 // returned via 'Dst'. 147 MachineInstr *emitScalarToVector(unsigned EltSize, 148 const TargetRegisterClass *DstRC, 149 Register Scalar, 150 MachineIRBuilder &MIRBuilder) const; 151 /// Helper to narrow vector that was widened by emitScalarToVector. 152 /// Copy lowest part of 128-bit or 64-bit vector to 64-bit or 32-bit 153 /// vector, correspondingly. 154 MachineInstr *emitNarrowVector(Register DstReg, Register SrcReg, 155 MachineIRBuilder &MIRBuilder, 156 MachineRegisterInfo &MRI) const; 157 158 /// Emit a lane insert into \p DstReg, or a new vector register if 159 /// std::nullopt is provided. 160 /// 161 /// The lane inserted into is defined by \p LaneIdx. The vector source 162 /// register is given by \p SrcReg. The register containing the element is 163 /// given by \p EltReg. 164 MachineInstr *emitLaneInsert(std::optional<Register> DstReg, Register SrcReg, 165 Register EltReg, unsigned LaneIdx, 166 const RegisterBank &RB, 167 MachineIRBuilder &MIRBuilder) const; 168 169 /// Emit a sequence of instructions representing a constant \p CV for a 170 /// vector register \p Dst. (E.g. a MOV, or a load from a constant pool.) 171 /// 172 /// \returns the last instruction in the sequence on success, and nullptr 173 /// otherwise. 
  MachineInstr *emitConstantVector(Register Dst, Constant *CV,
                                   MachineIRBuilder &MIRBuilder,
                                   MachineRegisterInfo &MRI);

  MachineInstr *tryAdvSIMDModImm8(Register Dst, unsigned DstSize, APInt Bits,
                                  MachineIRBuilder &MIRBuilder);

  MachineInstr *tryAdvSIMDModImm16(Register Dst, unsigned DstSize, APInt Bits,
                                   MachineIRBuilder &MIRBuilder, bool Inv);

  MachineInstr *tryAdvSIMDModImm32(Register Dst, unsigned DstSize, APInt Bits,
                                   MachineIRBuilder &MIRBuilder, bool Inv);
  MachineInstr *tryAdvSIMDModImm64(Register Dst, unsigned DstSize, APInt Bits,
                                   MachineIRBuilder &MIRBuilder);
  MachineInstr *tryAdvSIMDModImm321s(Register Dst, unsigned DstSize, APInt Bits,
                                     MachineIRBuilder &MIRBuilder, bool Inv);
  MachineInstr *tryAdvSIMDModImmFP(Register Dst, unsigned DstSize, APInt Bits,
                                   MachineIRBuilder &MIRBuilder);

  bool tryOptConstantBuildVec(MachineInstr &MI, LLT DstTy,
                              MachineRegisterInfo &MRI);
  /// \returns true if a G_BUILD_VECTOR instruction \p MI can be selected as a
  /// SUBREG_TO_REG.
  bool tryOptBuildVecToSubregToReg(MachineInstr &MI, MachineRegisterInfo &MRI);
  bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI);

  bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectSplitVectorUnmerge(MachineInstr &I, MachineRegisterInfo &MRI);

  /// Helper function to select vector load intrinsics like
  /// @llvm.aarch64.neon.ld2.*, @llvm.aarch64.neon.ld4.*, etc.
  /// \p Opc is the opcode that the selected instruction should use.
  /// \p NumVecs is the number of vector destinations for the instruction.
  /// \p I is the original G_INTRINSIC_W_SIDE_EFFECTS instruction.
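  /// E.g. (illustrative sketch; the opcode and types are assumptions, not a
  /// call site from this file): an @llvm.aarch64.neon.ld2 of two v2i64
  /// results could be handled as
  /// \code
  ///   selectVectorLoadIntrinsic(AArch64::LD2Twov2d, /*NumVecs=*/2, I);
  /// \endcode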
212 bool selectVectorLoadIntrinsic(unsigned Opc, unsigned NumVecs, 213 MachineInstr &I); 214 bool selectVectorLoadLaneIntrinsic(unsigned Opc, unsigned NumVecs, 215 MachineInstr &I); 216 void selectVectorStoreIntrinsic(MachineInstr &I, unsigned NumVecs, 217 unsigned Opc); 218 bool selectVectorStoreLaneIntrinsic(MachineInstr &I, unsigned NumVecs, 219 unsigned Opc); 220 bool selectIntrinsicWithSideEffects(MachineInstr &I, 221 MachineRegisterInfo &MRI); 222 bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI); 223 bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI); 224 bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI); 225 bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI); 226 bool selectPtrAuthGlobalValue(MachineInstr &I, 227 MachineRegisterInfo &MRI) const; 228 bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI); 229 bool selectMOPS(MachineInstr &I, MachineRegisterInfo &MRI); 230 bool selectUSMovFromExtend(MachineInstr &I, MachineRegisterInfo &MRI); 231 void SelectTable(MachineInstr &I, MachineRegisterInfo &MRI, unsigned NumVecs, 232 unsigned Opc1, unsigned Opc2, bool isExt); 233 234 bool selectIndexedExtLoad(MachineInstr &I, MachineRegisterInfo &MRI); 235 bool selectIndexedLoad(MachineInstr &I, MachineRegisterInfo &MRI); 236 bool selectIndexedStore(GIndexedStore &I, MachineRegisterInfo &MRI); 237 238 unsigned emitConstantPoolEntry(const Constant *CPVal, 239 MachineFunction &MF) const; 240 MachineInstr *emitLoadFromConstantPool(const Constant *CPVal, 241 MachineIRBuilder &MIRBuilder) const; 242 243 // Emit a vector concat operation. 244 MachineInstr *emitVectorConcat(std::optional<Register> Dst, Register Op1, 245 Register Op2, 246 MachineIRBuilder &MIRBuilder) const; 247 248 // Emit an integer compare between LHS and RHS, which checks for Predicate. 249 MachineInstr *emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS, 250 MachineOperand &Predicate, 251 MachineIRBuilder &MIRBuilder) const; 252 253 /// Emit a floating point comparison between \p LHS and \p RHS. 254 /// \p Pred if given is the intended predicate to use. 255 MachineInstr * 256 emitFPCompare(Register LHS, Register RHS, MachineIRBuilder &MIRBuilder, 257 std::optional<CmpInst::Predicate> = std::nullopt) const; 258 259 MachineInstr * 260 emitInstr(unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps, 261 std::initializer_list<llvm::SrcOp> SrcOps, 262 MachineIRBuilder &MIRBuilder, 263 const ComplexRendererFns &RenderFns = std::nullopt) const; 264 /// Helper function to emit an add or sub instruction. 265 /// 266 /// \p AddrModeAndSizeToOpcode must contain each of the opcode variants above 267 /// in a specific order. 268 /// 269 /// Below is an example of the expected input to \p AddrModeAndSizeToOpcode. 270 /// 271 /// \code 272 /// const std::array<std::array<unsigned, 2>, 4> Table { 273 /// {{AArch64::ADDXri, AArch64::ADDWri}, 274 /// {AArch64::ADDXrs, AArch64::ADDWrs}, 275 /// {AArch64::ADDXrr, AArch64::ADDWrr}, 276 /// {AArch64::SUBXri, AArch64::SUBWri}, 277 /// {AArch64::ADDXrx, AArch64::ADDWrx}}}; 278 /// \endcode 279 /// 280 /// Each row in the table corresponds to a different addressing mode. Each 281 /// column corresponds to a different register size. 
  ///
  /// \attention Rows must be structured as follows:
  ///   - Row 0: The ri opcode variants
  ///   - Row 1: The rs opcode variants
  ///   - Row 2: The rr opcode variants
  ///   - Row 3: The ri opcode variants for negative immediates
  ///   - Row 4: The rx opcode variants
  ///
  /// \attention Columns must be structured as follows:
  ///   - Column 0: The 64-bit opcode variants
  ///   - Column 1: The 32-bit opcode variants
  ///
  /// \p Dst is the destination register of the binop to emit.
  /// \p LHS is the left-hand operand of the binop to emit.
  /// \p RHS is the right-hand operand of the binop to emit.
  MachineInstr *emitAddSub(
      const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
      Register Dst, MachineOperand &LHS, MachineOperand &RHS,
      MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitADD(Register DefReg, MachineOperand &LHS,
                        MachineOperand &RHS,
                        MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitADDS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
                         MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitSUBS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
                         MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitADCS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
                         MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitSBCS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
                         MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS,
                        MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitTST(MachineOperand &LHS, MachineOperand &RHS,
                        MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitSelect(Register Dst, Register LHS, Register RHS,
                           AArch64CC::CondCode CC,
                           MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitExtractVectorElt(std::optional<Register> DstReg,
                                     const RegisterBank &DstRB, LLT ScalarTy,
                                     Register VecReg, unsigned LaneIdx,
                                     MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitCSINC(Register Dst, Register Src1, Register Src2,
                          AArch64CC::CondCode Pred,
                          MachineIRBuilder &MIRBuilder) const;
  /// Emit a CSet for a FP compare.
  ///
  /// \p Dst is expected to be a 32-bit scalar register.
  MachineInstr *emitCSetForFCmp(Register Dst, CmpInst::Predicate Pred,
                                MachineIRBuilder &MIRBuilder) const;

  /// Emit an instruction that sets NZCV to the carry-in expected by \p I.
  /// Might elide the instruction if the previous instruction already sets NZCV
  /// correctly.
  MachineInstr *emitCarryIn(MachineInstr &I, Register CarryReg);

  /// Emit the overflow op for \p Opcode.
  ///
  /// \p Opcode is expected to be an overflow op's opcode, e.g. G_UADDO,
  /// G_USUBO, etc.
  std::pair<MachineInstr *, AArch64CC::CondCode>
  emitOverflowOp(unsigned Opcode, Register Dst, MachineOperand &LHS,
                 MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const;

  bool selectOverflowOp(MachineInstr &I, MachineRegisterInfo &MRI);

  /// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
  /// In some cases this is even possible with OR operations in the expression.
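  /// E.g. (illustrative; register names are placeholders): a value defined as
  /// \code
  ///   %c0:_(s1) = G_ICMP intpred(eq), %a, %b
  ///   %c1:_(s1) = G_ICMP intpred(slt), %x, %y
  ///   %val:_(s1) = G_AND %c0, %c1
  /// \endcode
  /// can be emitted as a flag-setting compare followed by a CCMP, with
  /// \p OutCC left holding the condition of the second compare.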
349 MachineInstr *emitConjunction(Register Val, AArch64CC::CondCode &OutCC, 350 MachineIRBuilder &MIB) const; 351 MachineInstr *emitConditionalComparison(Register LHS, Register RHS, 352 CmpInst::Predicate CC, 353 AArch64CC::CondCode Predicate, 354 AArch64CC::CondCode OutCC, 355 MachineIRBuilder &MIB) const; 356 MachineInstr *emitConjunctionRec(Register Val, AArch64CC::CondCode &OutCC, 357 bool Negate, Register CCOp, 358 AArch64CC::CondCode Predicate, 359 MachineIRBuilder &MIB) const; 360 361 /// Emit a TB(N)Z instruction which tests \p Bit in \p TestReg. 362 /// \p IsNegative is true if the test should be "not zero". 363 /// This will also optimize the test bit instruction when possible. 364 MachineInstr *emitTestBit(Register TestReg, uint64_t Bit, bool IsNegative, 365 MachineBasicBlock *DstMBB, 366 MachineIRBuilder &MIB) const; 367 368 /// Emit a CB(N)Z instruction which branches to \p DestMBB. 369 MachineInstr *emitCBZ(Register CompareReg, bool IsNegative, 370 MachineBasicBlock *DestMBB, 371 MachineIRBuilder &MIB) const; 372 373 // Equivalent to the i32shift_a and friends from AArch64InstrInfo.td. 374 // We use these manually instead of using the importer since it doesn't 375 // support SDNodeXForm. 376 ComplexRendererFns selectShiftA_32(const MachineOperand &Root) const; 377 ComplexRendererFns selectShiftB_32(const MachineOperand &Root) const; 378 ComplexRendererFns selectShiftA_64(const MachineOperand &Root) const; 379 ComplexRendererFns selectShiftB_64(const MachineOperand &Root) const; 380 381 ComplexRendererFns select12BitValueWithLeftShift(uint64_t Immed) const; 382 ComplexRendererFns selectArithImmed(MachineOperand &Root) const; 383 ComplexRendererFns selectNegArithImmed(MachineOperand &Root) const; 384 385 ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root, 386 unsigned Size) const; 387 388 ComplexRendererFns selectAddrModeUnscaled8(MachineOperand &Root) const { 389 return selectAddrModeUnscaled(Root, 1); 390 } 391 ComplexRendererFns selectAddrModeUnscaled16(MachineOperand &Root) const { 392 return selectAddrModeUnscaled(Root, 2); 393 } 394 ComplexRendererFns selectAddrModeUnscaled32(MachineOperand &Root) const { 395 return selectAddrModeUnscaled(Root, 4); 396 } 397 ComplexRendererFns selectAddrModeUnscaled64(MachineOperand &Root) const { 398 return selectAddrModeUnscaled(Root, 8); 399 } 400 ComplexRendererFns selectAddrModeUnscaled128(MachineOperand &Root) const { 401 return selectAddrModeUnscaled(Root, 16); 402 } 403 404 /// Helper to try to fold in a GISEL_ADD_LOW into an immediate, to be used 405 /// from complex pattern matchers like selectAddrModeIndexed(). 406 ComplexRendererFns tryFoldAddLowIntoImm(MachineInstr &RootDef, unsigned Size, 407 MachineRegisterInfo &MRI) const; 408 409 ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root, 410 unsigned Size) const; 411 template <int Width> 412 ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root) const { 413 return selectAddrModeIndexed(Root, Width / 8); 414 } 415 416 std::optional<bool> 417 isWorthFoldingIntoAddrMode(MachineInstr &MI, 418 const MachineRegisterInfo &MRI) const; 419 420 bool isWorthFoldingIntoExtendedReg(MachineInstr &MI, 421 const MachineRegisterInfo &MRI, 422 bool IsAddrOperand) const; 423 ComplexRendererFns 424 selectAddrModeShiftedExtendXReg(MachineOperand &Root, 425 unsigned SizeInBytes) const; 426 427 /// Returns a \p ComplexRendererFns which contains a base, offset, and whether 428 /// or not a shift + extend should be folded into an addressing mode. 
Returns 429 /// None when this is not profitable or possible. 430 ComplexRendererFns 431 selectExtendedSHL(MachineOperand &Root, MachineOperand &Base, 432 MachineOperand &Offset, unsigned SizeInBytes, 433 bool WantsExt) const; 434 ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const; 435 ComplexRendererFns selectAddrModeXRO(MachineOperand &Root, 436 unsigned SizeInBytes) const; 437 template <int Width> 438 ComplexRendererFns selectAddrModeXRO(MachineOperand &Root) const { 439 return selectAddrModeXRO(Root, Width / 8); 440 } 441 442 ComplexRendererFns selectAddrModeWRO(MachineOperand &Root, 443 unsigned SizeInBytes) const; 444 template <int Width> 445 ComplexRendererFns selectAddrModeWRO(MachineOperand &Root) const { 446 return selectAddrModeWRO(Root, Width / 8); 447 } 448 449 ComplexRendererFns selectShiftedRegister(MachineOperand &Root, 450 bool AllowROR = false) const; 451 452 ComplexRendererFns selectArithShiftedRegister(MachineOperand &Root) const { 453 return selectShiftedRegister(Root); 454 } 455 456 ComplexRendererFns selectLogicalShiftedRegister(MachineOperand &Root) const { 457 return selectShiftedRegister(Root, true); 458 } 459 460 /// Given an extend instruction, determine the correct shift-extend type for 461 /// that instruction. 462 /// 463 /// If the instruction is going to be used in a load or store, pass 464 /// \p IsLoadStore = true. 465 AArch64_AM::ShiftExtendType 466 getExtendTypeForInst(MachineInstr &MI, MachineRegisterInfo &MRI, 467 bool IsLoadStore = false) const; 468 469 /// Move \p Reg to \p RC if \p Reg is not already on \p RC. 470 /// 471 /// \returns Either \p Reg if no change was necessary, or the new register 472 /// created by moving \p Reg. 473 /// 474 /// Note: This uses emitCopy right now. 475 Register moveScalarRegClass(Register Reg, const TargetRegisterClass &RC, 476 MachineIRBuilder &MIB) const; 477 478 ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const; 479 480 ComplexRendererFns selectExtractHigh(MachineOperand &Root) const; 481 482 void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI, 483 int OpIdx = -1) const; 484 void renderLogicalImm32(MachineInstrBuilder &MIB, const MachineInstr &I, 485 int OpIdx = -1) const; 486 void renderLogicalImm64(MachineInstrBuilder &MIB, const MachineInstr &I, 487 int OpIdx = -1) const; 488 void renderUbsanTrap(MachineInstrBuilder &MIB, const MachineInstr &MI, 489 int OpIdx) const; 490 void renderFPImm16(MachineInstrBuilder &MIB, const MachineInstr &MI, 491 int OpIdx = -1) const; 492 void renderFPImm32(MachineInstrBuilder &MIB, const MachineInstr &MI, 493 int OpIdx = -1) const; 494 void renderFPImm64(MachineInstrBuilder &MIB, const MachineInstr &MI, 495 int OpIdx = -1) const; 496 void renderFPImm32SIMDModImmType4(MachineInstrBuilder &MIB, 497 const MachineInstr &MI, 498 int OpIdx = -1) const; 499 500 // Materialize a GlobalValue or BlockAddress using a movz+movk sequence. 501 void materializeLargeCMVal(MachineInstr &I, const Value *V, unsigned OpFlags); 502 503 // Optimization methods. 504 bool tryOptSelect(GSelect &Sel); 505 bool tryOptSelectConjunction(GSelect &Sel, MachineInstr &CondMI); 506 MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS, 507 MachineOperand &Predicate, 508 MachineIRBuilder &MIRBuilder) const; 509 510 /// Return true if \p MI is a load or store of \p NumBytes bytes. 
  bool isLoadStoreOfNumBytes(const MachineInstr &MI, unsigned NumBytes) const;

  /// Returns true if \p MI is guaranteed to have the high-half of a 64-bit
  /// register zeroed out. In other words, the result of MI has been explicitly
  /// zero extended.
  bool isDef32(const MachineInstr &MI) const;

  const AArch64TargetMachine &TM;
  const AArch64Subtarget &STI;
  const AArch64InstrInfo &TII;
  const AArch64RegisterInfo &TRI;
  const AArch64RegisterBankInfo &RBI;

  bool ProduceNonFlagSettingCondBr = false;

  // Some cached values used during selection.
  // We use LR as a live-in register, and we keep track of it here as it can be
  // clobbered by calls.
  Register MFReturnAddr;

  MachineIRBuilder MIB;

#define GET_GLOBALISEL_PREDICATES_DECL
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_DECL

  // We declare the temporaries used by selectImpl() in the class to minimize
  // the cost of constructing placeholder values.
#define GET_GLOBALISEL_TEMPORARIES_DECL
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_DECL
};

} // end anonymous namespace

#define GET_GLOBALISEL_IMPL
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_IMPL

AArch64InstructionSelector::AArch64InstructionSelector(
    const AArch64TargetMachine &TM, const AArch64Subtarget &STI,
    const AArch64RegisterBankInfo &RBI)
    : TM(TM), STI(STI), TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()),
      RBI(RBI),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_INIT
{
}

// FIXME: This should be target-independent, inferred from the types declared
// for each class in the bank.
//
/// Given a register bank, and a type, return the smallest register class that
/// can represent that combination.
static const TargetRegisterClass *
getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB,
                         bool GetAllRegSet = false) {
  if (RB.getID() == AArch64::GPRRegBankID) {
    if (Ty.getSizeInBits() <= 32)
      return GetAllRegSet ? &AArch64::GPR32allRegClass
                          : &AArch64::GPR32RegClass;
    if (Ty.getSizeInBits() == 64)
      return GetAllRegSet ? &AArch64::GPR64allRegClass
                          : &AArch64::GPR64RegClass;
    if (Ty.getSizeInBits() == 128)
      return &AArch64::XSeqPairsClassRegClass;
    return nullptr;
  }

  if (RB.getID() == AArch64::FPRRegBankID) {
    switch (Ty.getSizeInBits()) {
    case 8:
      return &AArch64::FPR8RegClass;
    case 16:
      return &AArch64::FPR16RegClass;
    case 32:
      return &AArch64::FPR32RegClass;
    case 64:
      return &AArch64::FPR64RegClass;
    case 128:
      return &AArch64::FPR128RegClass;
    }
    return nullptr;
  }

  return nullptr;
}

/// Given a register bank, and size in bits, return the smallest register class
/// that can represent that combination.
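/// E.g. (illustrative; "GPRBank"/"FPRBank" stand for the RegisterBank objects
/// of the respective banks):
/// \code
///   getMinClassForRegBank(FPRBank, TypeSize::getFixed(64));  // FPR64RegClass
///   getMinClassForRegBank(GPRBank, TypeSize::getFixed(16));  // GPR32RegClass
/// \endcode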
static const TargetRegisterClass *
getMinClassForRegBank(const RegisterBank &RB, TypeSize SizeInBits,
                      bool GetAllRegSet = false) {
  if (SizeInBits.isScalable()) {
    assert(RB.getID() == AArch64::FPRRegBankID &&
           "Expected FPR regbank for scalable type size");
    return &AArch64::ZPRRegClass;
  }

  unsigned RegBankID = RB.getID();

  if (RegBankID == AArch64::GPRRegBankID) {
    assert(!SizeInBits.isScalable() && "Unexpected scalable register size");
    if (SizeInBits <= 32)
      return GetAllRegSet ? &AArch64::GPR32allRegClass
                          : &AArch64::GPR32RegClass;
    if (SizeInBits == 64)
      return GetAllRegSet ? &AArch64::GPR64allRegClass
                          : &AArch64::GPR64RegClass;
    if (SizeInBits == 128)
      return &AArch64::XSeqPairsClassRegClass;
  }

  if (RegBankID == AArch64::FPRRegBankID) {
    if (SizeInBits.isScalable()) {
      assert(SizeInBits == TypeSize::getScalable(128) &&
             "Unexpected scalable register size");
      return &AArch64::ZPRRegClass;
    }

    switch (SizeInBits) {
    default:
      return nullptr;
    case 8:
      return &AArch64::FPR8RegClass;
    case 16:
      return &AArch64::FPR16RegClass;
    case 32:
      return &AArch64::FPR32RegClass;
    case 64:
      return &AArch64::FPR64RegClass;
    case 128:
      return &AArch64::FPR128RegClass;
    }
  }

  return nullptr;
}

/// Returns the correct subregister to use for a given register class.
static bool getSubRegForClass(const TargetRegisterClass *RC,
                              const TargetRegisterInfo &TRI, unsigned &SubReg) {
  switch (TRI.getRegSizeInBits(*RC)) {
  case 8:
    SubReg = AArch64::bsub;
    break;
  case 16:
    SubReg = AArch64::hsub;
    break;
  case 32:
    if (RC != &AArch64::FPR32RegClass)
      SubReg = AArch64::sub_32;
    else
      SubReg = AArch64::ssub;
    break;
  case 64:
    SubReg = AArch64::dsub;
    break;
  default:
    LLVM_DEBUG(
        dbgs() << "Couldn't find appropriate subregister for register class.");
    return false;
  }

  return true;
}

/// Returns the minimum size the given register bank can hold.
static unsigned getMinSizeForRegBank(const RegisterBank &RB) {
  switch (RB.getID()) {
  case AArch64::GPRRegBankID:
    return 32;
  case AArch64::FPRRegBankID:
    return 8;
  default:
    llvm_unreachable("Tried to get minimum size for unknown register bank.");
  }
}

/// Create a REG_SEQUENCE instruction using the registers in \p Regs.
/// Helper function for functions like createDTuple and createQTuple.
///
/// \p RegClassIDs - The list of register class IDs available for some tuple of
/// a scalar class. E.g. QQRegClassID, QQQRegClassID, QQQQRegClassID. This is
/// expected to contain between 2 and 4 tuple classes.
///
/// \p SubRegs - The list of subregister classes associated with each register
/// class ID in \p RegClassIDs. E.g., QQRegClassID should use the qsub0
/// subregister class. The index of each subregister class is expected to
/// correspond with the index of each register class.
///
/// \returns Either the destination register of REG_SEQUENCE instruction that
/// was created, or the 0th element of \p Regs if \p Regs contains a single
/// element.
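/// E.g. (illustrative): createQTuple with two Q registers produces
/// \code
///   %tuple:qq = REG_SEQUENCE %q0, %subreg.qsub0, %q1, %subreg.qsub1
/// \endcode
/// where %q0 and %q1 are placeholder virtual registers.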
709 static Register createTuple(ArrayRef<Register> Regs, 710 const unsigned RegClassIDs[], 711 const unsigned SubRegs[], MachineIRBuilder &MIB) { 712 unsigned NumRegs = Regs.size(); 713 if (NumRegs == 1) 714 return Regs[0]; 715 assert(NumRegs >= 2 && NumRegs <= 4 && 716 "Only support between two and 4 registers in a tuple!"); 717 const TargetRegisterInfo *TRI = MIB.getMF().getSubtarget().getRegisterInfo(); 718 auto *DesiredClass = TRI->getRegClass(RegClassIDs[NumRegs - 2]); 719 auto RegSequence = 720 MIB.buildInstr(TargetOpcode::REG_SEQUENCE, {DesiredClass}, {}); 721 for (unsigned I = 0, E = Regs.size(); I < E; ++I) { 722 RegSequence.addUse(Regs[I]); 723 RegSequence.addImm(SubRegs[I]); 724 } 725 return RegSequence.getReg(0); 726 } 727 728 /// Create a tuple of D-registers using the registers in \p Regs. 729 static Register createDTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) { 730 static const unsigned RegClassIDs[] = { 731 AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID}; 732 static const unsigned SubRegs[] = {AArch64::dsub0, AArch64::dsub1, 733 AArch64::dsub2, AArch64::dsub3}; 734 return createTuple(Regs, RegClassIDs, SubRegs, MIB); 735 } 736 737 /// Create a tuple of Q-registers using the registers in \p Regs. 738 static Register createQTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) { 739 static const unsigned RegClassIDs[] = { 740 AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID}; 741 static const unsigned SubRegs[] = {AArch64::qsub0, AArch64::qsub1, 742 AArch64::qsub2, AArch64::qsub3}; 743 return createTuple(Regs, RegClassIDs, SubRegs, MIB); 744 } 745 746 static std::optional<uint64_t> getImmedFromMO(const MachineOperand &Root) { 747 auto &MI = *Root.getParent(); 748 auto &MBB = *MI.getParent(); 749 auto &MF = *MBB.getParent(); 750 auto &MRI = MF.getRegInfo(); 751 uint64_t Immed; 752 if (Root.isImm()) 753 Immed = Root.getImm(); 754 else if (Root.isCImm()) 755 Immed = Root.getCImm()->getZExtValue(); 756 else if (Root.isReg()) { 757 auto ValAndVReg = 758 getIConstantVRegValWithLookThrough(Root.getReg(), MRI, true); 759 if (!ValAndVReg) 760 return std::nullopt; 761 Immed = ValAndVReg->Value.getSExtValue(); 762 } else 763 return std::nullopt; 764 return Immed; 765 } 766 767 /// Check whether \p I is a currently unsupported binary operation: 768 /// - it has an unsized type 769 /// - an operand is not a vreg 770 /// - all operands are not in the same bank 771 /// These are checks that should someday live in the verifier, but right now, 772 /// these are mostly limitations of the aarch64 selector. 773 static bool unsupportedBinOp(const MachineInstr &I, 774 const AArch64RegisterBankInfo &RBI, 775 const MachineRegisterInfo &MRI, 776 const AArch64RegisterInfo &TRI) { 777 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 778 if (!Ty.isValid()) { 779 LLVM_DEBUG(dbgs() << "Generic binop register should be typed\n"); 780 return true; 781 } 782 783 const RegisterBank *PrevOpBank = nullptr; 784 for (auto &MO : I.operands()) { 785 // FIXME: Support non-register operands. 786 if (!MO.isReg()) { 787 LLVM_DEBUG(dbgs() << "Generic inst non-reg operands are unsupported\n"); 788 return true; 789 } 790 791 // FIXME: Can generic operations have physical registers operands? If 792 // so, this will need to be taught about that, and we'll need to get the 793 // bank out of the minimal class for the register. 794 // Either way, this needs to be documented (and possibly verified). 
795 if (!MO.getReg().isVirtual()) { 796 LLVM_DEBUG(dbgs() << "Generic inst has physical register operand\n"); 797 return true; 798 } 799 800 const RegisterBank *OpBank = RBI.getRegBank(MO.getReg(), MRI, TRI); 801 if (!OpBank) { 802 LLVM_DEBUG(dbgs() << "Generic register has no bank or class\n"); 803 return true; 804 } 805 806 if (PrevOpBank && OpBank != PrevOpBank) { 807 LLVM_DEBUG(dbgs() << "Generic inst operands have different banks\n"); 808 return true; 809 } 810 PrevOpBank = OpBank; 811 } 812 return false; 813 } 814 815 /// Select the AArch64 opcode for the basic binary operation \p GenericOpc 816 /// (such as G_OR or G_SDIV), appropriate for the register bank \p RegBankID 817 /// and of size \p OpSize. 818 /// \returns \p GenericOpc if the combination is unsupported. 819 static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID, 820 unsigned OpSize) { 821 switch (RegBankID) { 822 case AArch64::GPRRegBankID: 823 if (OpSize == 32) { 824 switch (GenericOpc) { 825 case TargetOpcode::G_SHL: 826 return AArch64::LSLVWr; 827 case TargetOpcode::G_LSHR: 828 return AArch64::LSRVWr; 829 case TargetOpcode::G_ASHR: 830 return AArch64::ASRVWr; 831 default: 832 return GenericOpc; 833 } 834 } else if (OpSize == 64) { 835 switch (GenericOpc) { 836 case TargetOpcode::G_PTR_ADD: 837 return AArch64::ADDXrr; 838 case TargetOpcode::G_SHL: 839 return AArch64::LSLVXr; 840 case TargetOpcode::G_LSHR: 841 return AArch64::LSRVXr; 842 case TargetOpcode::G_ASHR: 843 return AArch64::ASRVXr; 844 default: 845 return GenericOpc; 846 } 847 } 848 break; 849 case AArch64::FPRRegBankID: 850 switch (OpSize) { 851 case 32: 852 switch (GenericOpc) { 853 case TargetOpcode::G_FADD: 854 return AArch64::FADDSrr; 855 case TargetOpcode::G_FSUB: 856 return AArch64::FSUBSrr; 857 case TargetOpcode::G_FMUL: 858 return AArch64::FMULSrr; 859 case TargetOpcode::G_FDIV: 860 return AArch64::FDIVSrr; 861 default: 862 return GenericOpc; 863 } 864 case 64: 865 switch (GenericOpc) { 866 case TargetOpcode::G_FADD: 867 return AArch64::FADDDrr; 868 case TargetOpcode::G_FSUB: 869 return AArch64::FSUBDrr; 870 case TargetOpcode::G_FMUL: 871 return AArch64::FMULDrr; 872 case TargetOpcode::G_FDIV: 873 return AArch64::FDIVDrr; 874 case TargetOpcode::G_OR: 875 return AArch64::ORRv8i8; 876 default: 877 return GenericOpc; 878 } 879 } 880 break; 881 } 882 return GenericOpc; 883 } 884 885 /// Select the AArch64 opcode for the G_LOAD or G_STORE operation \p GenericOpc, 886 /// appropriate for the (value) register bank \p RegBankID and of memory access 887 /// size \p OpSize. This returns the variant with the base+unsigned-immediate 888 /// addressing mode (e.g., LDRXui). 889 /// \returns \p GenericOpc if the combination is unsupported. 890 static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID, 891 unsigned OpSize) { 892 const bool isStore = GenericOpc == TargetOpcode::G_STORE; 893 switch (RegBankID) { 894 case AArch64::GPRRegBankID: 895 switch (OpSize) { 896 case 8: 897 return isStore ? AArch64::STRBBui : AArch64::LDRBBui; 898 case 16: 899 return isStore ? AArch64::STRHHui : AArch64::LDRHHui; 900 case 32: 901 return isStore ? AArch64::STRWui : AArch64::LDRWui; 902 case 64: 903 return isStore ? AArch64::STRXui : AArch64::LDRXui; 904 } 905 break; 906 case AArch64::FPRRegBankID: 907 switch (OpSize) { 908 case 8: 909 return isStore ? AArch64::STRBui : AArch64::LDRBui; 910 case 16: 911 return isStore ? AArch64::STRHui : AArch64::LDRHui; 912 case 32: 913 return isStore ? 
                       AArch64::STRSui : AArch64::LDRSui;
    case 64:
      return isStore ? AArch64::STRDui : AArch64::LDRDui;
    case 128:
      return isStore ? AArch64::STRQui : AArch64::LDRQui;
    }
    break;
  }
  return GenericOpc;
}

/// Helper function for selectCopy. Inserts a subregister copy from \p SrcReg
/// to \p *To.
///
/// E.g. "To = COPY SrcReg:SubReg"
static bool copySubReg(MachineInstr &I, MachineRegisterInfo &MRI,
                       const RegisterBankInfo &RBI, Register SrcReg,
                       const TargetRegisterClass *To, unsigned SubReg) {
  assert(SrcReg.isValid() && "Expected a valid source register?");
  assert(To && "Destination register class cannot be null");
  assert(SubReg && "Expected a valid subregister");

  MachineIRBuilder MIB(I);
  auto SubRegCopy =
      MIB.buildInstr(TargetOpcode::COPY, {To}, {}).addReg(SrcReg, 0, SubReg);
  MachineOperand &RegOp = I.getOperand(1);
  RegOp.setReg(SubRegCopy.getReg(0));

  // It's possible that the destination register won't be constrained. Make
  // sure that happens.
  if (!I.getOperand(0).getReg().isPhysical())
    RBI.constrainGenericRegister(I.getOperand(0).getReg(), *To, MRI);

  return true;
}

/// Helper function to get the source and destination register classes for a
/// copy. Returns a std::pair containing the source register class for the
/// copy, and the destination register class for the copy. If a register class
/// cannot be determined, then it will be nullptr.
static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII,
                     MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
                     const RegisterBankInfo &RBI) {
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
  const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);

  TypeSize DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
  TypeSize SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);

  // Special casing for cross-bank copies of s1s. We can technically represent
  // a 1-bit value with any size of register. The minimum size for a GPR is 32
  // bits. So, we need to put the FPR on 32 bits as well.
  //
  // FIXME: I'm not sure if this case holds true outside of copies. If it does,
  // then we can pull it into the helpers that get the appropriate class for a
  // register bank. Or make a new helper that carries along some constraint
  // information.
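  // E.g. (illustrative): an s1 copy from a GPR-bank vreg to an FPR-bank vreg
  // is looked up below as a 32-bit class on both sides.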
  if (SrcRegBank != DstRegBank &&
      (DstSize == TypeSize::getFixed(1) && SrcSize == TypeSize::getFixed(1)))
    SrcSize = DstSize = TypeSize::getFixed(32);

  return {getMinClassForRegBank(SrcRegBank, SrcSize, true),
          getMinClassForRegBank(DstRegBank, DstSize, true)};
}

// FIXME: We need some sort of API in RBI/TRI to allow generic code to
// constrain operands of simple instructions given a TargetRegisterClass
// and LLT
static bool selectDebugInstr(MachineInstr &I, MachineRegisterInfo &MRI,
                             const RegisterBankInfo &RBI) {
  for (MachineOperand &MO : I.operands()) {
    if (!MO.isReg())
      continue;
    Register Reg = MO.getReg();
    if (!Reg)
      continue;
    if (Reg.isPhysical())
      continue;
    LLT Ty = MRI.getType(Reg);
    const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
    const TargetRegisterClass *RC =
        dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
    if (!RC) {
      const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
      RC = getRegClassForTypeOnBank(Ty, RB);
      if (!RC) {
        LLVM_DEBUG(
            dbgs() << "Warning: DBG_VALUE operand has unexpected size/bank\n");
        break;
      }
    }
    RBI.constrainGenericRegister(Reg, *RC, MRI);
  }

  return true;
}

static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
                       MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
                       const RegisterBankInfo &RBI) {
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
  const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);

  // Find the correct register classes for the source and destination
  // registers.
  const TargetRegisterClass *SrcRC;
  const TargetRegisterClass *DstRC;
  std::tie(SrcRC, DstRC) = getRegClassesForCopy(I, TII, MRI, TRI, RBI);

  if (!DstRC) {
    LLVM_DEBUG(dbgs() << "Unexpected dest size "
                      << RBI.getSizeInBits(DstReg, MRI, TRI) << '\n');
    return false;
  }

  // Is this a copy? If so, then we may need to insert a subregister copy.
  if (I.isCopy()) {
    // Yes. Check if there's anything to fix up.
    if (!SrcRC) {
      LLVM_DEBUG(dbgs() << "Couldn't determine source register class\n");
      return false;
    }

    const TypeSize SrcSize = TRI.getRegSizeInBits(*SrcRC);
    const TypeSize DstSize = TRI.getRegSizeInBits(*DstRC);
    unsigned SubReg;

    // If the source bank doesn't support a subregister copy small enough,
    // then we first need to copy to the destination bank.
    if (getMinSizeForRegBank(SrcRegBank) > DstSize) {
      const TargetRegisterClass *DstTempRC =
          getMinClassForRegBank(DstRegBank, SrcSize, /* GetAllRegSet */ true);
      getSubRegForClass(DstRC, TRI, SubReg);

      MachineIRBuilder MIB(I);
      auto Copy = MIB.buildCopy({DstTempRC}, {SrcReg});
      copySubReg(I, MRI, RBI, Copy.getReg(0), DstRC, SubReg);
    } else if (SrcSize > DstSize) {
      // If the source register is bigger than the destination we need to
      // perform a subregister copy.
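      // E.g. (illustrative): a GPR64 source feeding a GPR32 destination has
      // its source operand rewritten to a sub_32 subregister copy.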
      const TargetRegisterClass *SubRegRC =
          getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
      getSubRegForClass(SubRegRC, TRI, SubReg);
      copySubReg(I, MRI, RBI, SrcReg, DstRC, SubReg);
    } else if (DstSize > SrcSize) {
      // If the destination register is bigger than the source we need to do
      // a promotion using SUBREG_TO_REG.
      const TargetRegisterClass *PromotionRC =
          getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
      getSubRegForClass(SrcRC, TRI, SubReg);

      Register PromoteReg = MRI.createVirtualRegister(PromotionRC);
      BuildMI(*I.getParent(), I, I.getDebugLoc(),
              TII.get(AArch64::SUBREG_TO_REG), PromoteReg)
          .addImm(0)
          .addUse(SrcReg)
          .addImm(SubReg);
      MachineOperand &RegOp = I.getOperand(1);
      RegOp.setReg(PromoteReg);
    }

    // If the destination is a physical register, then there's nothing to
    // change, so we're done.
    if (DstReg.isPhysical())
      return true;
  }

  // No need to constrain SrcReg. It will get constrained when we hit another
  // of its uses or its defs. Copies do not have constraints.
  if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
    LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
                      << " operand\n");
    return false;
  }

  // If this is a GPR ZEXT that we want to just reduce down into a copy.
  // The sizes will be mismatched with the source < 32b but that's ok.
  if (I.getOpcode() == TargetOpcode::G_ZEXT) {
    I.setDesc(TII.get(AArch64::COPY));
    assert(SrcRegBank.getID() == AArch64::GPRRegBankID);
    return selectCopy(I, TII, MRI, TRI, RBI);
  }

  I.setDesc(TII.get(AArch64::COPY));
  return true;
}

static unsigned selectFPConvOpc(unsigned GenericOpc, LLT DstTy, LLT SrcTy) {
  if (!DstTy.isScalar() || !SrcTy.isScalar())
    return GenericOpc;

  const unsigned DstSize = DstTy.getSizeInBits();
  const unsigned SrcSize = SrcTy.getSizeInBits();

  switch (DstSize) {
  case 32:
    switch (SrcSize) {
    case 32:
      switch (GenericOpc) {
      case TargetOpcode::G_SITOFP:
        return AArch64::SCVTFUWSri;
      case TargetOpcode::G_UITOFP:
        return AArch64::UCVTFUWSri;
      case TargetOpcode::G_FPTOSI:
        return AArch64::FCVTZSUWSr;
      case TargetOpcode::G_FPTOUI:
        return AArch64::FCVTZUUWSr;
      default:
        return GenericOpc;
      }
    case 64:
      switch (GenericOpc) {
      case TargetOpcode::G_SITOFP:
        return AArch64::SCVTFUXSri;
      case TargetOpcode::G_UITOFP:
        return AArch64::UCVTFUXSri;
      case TargetOpcode::G_FPTOSI:
        return AArch64::FCVTZSUWDr;
      case TargetOpcode::G_FPTOUI:
        return AArch64::FCVTZUUWDr;
      default:
        return GenericOpc;
      }
    default:
      return GenericOpc;
    }
  case 64:
    switch (SrcSize) {
    case 32:
      switch (GenericOpc) {
      case TargetOpcode::G_SITOFP:
        return AArch64::SCVTFUWDri;
      case TargetOpcode::G_UITOFP:
        return AArch64::UCVTFUWDri;
      case TargetOpcode::G_FPTOSI:
        return AArch64::FCVTZSUXSr;
      case TargetOpcode::G_FPTOUI:
        return AArch64::FCVTZUUXSr;
      default:
        return GenericOpc;
      }
    case 64:
      switch (GenericOpc) {
      case TargetOpcode::G_SITOFP:
        return AArch64::SCVTFUXDri;
      case TargetOpcode::G_UITOFP:
        return AArch64::UCVTFUXDri;
      case TargetOpcode::G_FPTOSI:
        return AArch64::FCVTZSUXDr;
      case TargetOpcode::G_FPTOUI:
        return
AArch64::FCVTZUUXDr; 1168 default: 1169 return GenericOpc; 1170 } 1171 default: 1172 return GenericOpc; 1173 } 1174 default: 1175 return GenericOpc; 1176 }; 1177 return GenericOpc; 1178 } 1179 1180 MachineInstr * 1181 AArch64InstructionSelector::emitSelect(Register Dst, Register True, 1182 Register False, AArch64CC::CondCode CC, 1183 MachineIRBuilder &MIB) const { 1184 MachineRegisterInfo &MRI = *MIB.getMRI(); 1185 assert(RBI.getRegBank(False, MRI, TRI)->getID() == 1186 RBI.getRegBank(True, MRI, TRI)->getID() && 1187 "Expected both select operands to have the same regbank?"); 1188 LLT Ty = MRI.getType(True); 1189 if (Ty.isVector()) 1190 return nullptr; 1191 const unsigned Size = Ty.getSizeInBits(); 1192 assert((Size == 32 || Size == 64) && 1193 "Expected 32 bit or 64 bit select only?"); 1194 const bool Is32Bit = Size == 32; 1195 if (RBI.getRegBank(True, MRI, TRI)->getID() != AArch64::GPRRegBankID) { 1196 unsigned Opc = Is32Bit ? AArch64::FCSELSrrr : AArch64::FCSELDrrr; 1197 auto FCSel = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC); 1198 constrainSelectedInstRegOperands(*FCSel, TII, TRI, RBI); 1199 return &*FCSel; 1200 } 1201 1202 // By default, we'll try and emit a CSEL. 1203 unsigned Opc = Is32Bit ? AArch64::CSELWr : AArch64::CSELXr; 1204 bool Optimized = false; 1205 auto TryFoldBinOpIntoSelect = [&Opc, Is32Bit, &CC, &MRI, 1206 &Optimized](Register &Reg, Register &OtherReg, 1207 bool Invert) { 1208 if (Optimized) 1209 return false; 1210 1211 // Attempt to fold: 1212 // 1213 // %sub = G_SUB 0, %x 1214 // %select = G_SELECT cc, %reg, %sub 1215 // 1216 // Into: 1217 // %select = CSNEG %reg, %x, cc 1218 Register MatchReg; 1219 if (mi_match(Reg, MRI, m_Neg(m_Reg(MatchReg)))) { 1220 Opc = Is32Bit ? AArch64::CSNEGWr : AArch64::CSNEGXr; 1221 Reg = MatchReg; 1222 if (Invert) { 1223 CC = AArch64CC::getInvertedCondCode(CC); 1224 std::swap(Reg, OtherReg); 1225 } 1226 return true; 1227 } 1228 1229 // Attempt to fold: 1230 // 1231 // %xor = G_XOR %x, -1 1232 // %select = G_SELECT cc, %reg, %xor 1233 // 1234 // Into: 1235 // %select = CSINV %reg, %x, cc 1236 if (mi_match(Reg, MRI, m_Not(m_Reg(MatchReg)))) { 1237 Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr; 1238 Reg = MatchReg; 1239 if (Invert) { 1240 CC = AArch64CC::getInvertedCondCode(CC); 1241 std::swap(Reg, OtherReg); 1242 } 1243 return true; 1244 } 1245 1246 // Attempt to fold: 1247 // 1248 // %add = G_ADD %x, 1 1249 // %select = G_SELECT cc, %reg, %add 1250 // 1251 // Into: 1252 // %select = CSINC %reg, %x, cc 1253 if (mi_match(Reg, MRI, 1254 m_any_of(m_GAdd(m_Reg(MatchReg), m_SpecificICst(1)), 1255 m_GPtrAdd(m_Reg(MatchReg), m_SpecificICst(1))))) { 1256 Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr; 1257 Reg = MatchReg; 1258 if (Invert) { 1259 CC = AArch64CC::getInvertedCondCode(CC); 1260 std::swap(Reg, OtherReg); 1261 } 1262 return true; 1263 } 1264 1265 return false; 1266 }; 1267 1268 // Helper lambda which tries to use CSINC/CSINV for the instruction when its 1269 // true/false values are constants. 1270 // FIXME: All of these patterns already exist in tablegen. We should be 1271 // able to import these. 1272 auto TryOptSelectCst = [&Opc, &True, &False, &CC, Is32Bit, &MRI, 1273 &Optimized]() { 1274 if (Optimized) 1275 return false; 1276 auto TrueCst = getIConstantVRegValWithLookThrough(True, MRI); 1277 auto FalseCst = getIConstantVRegValWithLookThrough(False, MRI); 1278 if (!TrueCst && !FalseCst) 1279 return false; 1280 1281 Register ZReg = Is32Bit ? 
                               AArch64::WZR : AArch64::XZR;
    if (TrueCst && FalseCst) {
      int64_t T = TrueCst->Value.getSExtValue();
      int64_t F = FalseCst->Value.getSExtValue();

      if (T == 0 && F == 1) {
        // G_SELECT cc, 0, 1 -> CSINC zreg, zreg, cc
        Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
        True = ZReg;
        False = ZReg;
        return true;
      }

      if (T == 0 && F == -1) {
        // G_SELECT cc, 0, -1 -> CSINV zreg, zreg, cc
        Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
        True = ZReg;
        False = ZReg;
        return true;
      }
    }

    if (TrueCst) {
      int64_t T = TrueCst->Value.getSExtValue();
      if (T == 1) {
        // G_SELECT cc, 1, f -> CSINC f, zreg, inv_cc
        Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
        True = False;
        False = ZReg;
        CC = AArch64CC::getInvertedCondCode(CC);
        return true;
      }

      if (T == -1) {
        // G_SELECT cc, -1, f -> CSINV f, zreg, inv_cc
        Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
        True = False;
        False = ZReg;
        CC = AArch64CC::getInvertedCondCode(CC);
        return true;
      }
    }

    if (FalseCst) {
      int64_t F = FalseCst->Value.getSExtValue();
      if (F == 1) {
        // G_SELECT cc, t, 1 -> CSINC t, zreg, cc
        Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
        False = ZReg;
        return true;
      }

      if (F == -1) {
        // G_SELECT cc, t, -1 -> CSINV t, zreg, cc
        Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
        False = ZReg;
        return true;
      }
    }
    return false;
  };

  Optimized |= TryFoldBinOpIntoSelect(False, True, /*Invert = */ false);
  Optimized |= TryFoldBinOpIntoSelect(True, False, /*Invert = */ true);
  Optimized |= TryOptSelectCst();
  auto SelectInst = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC);
  constrainSelectedInstRegOperands(*SelectInst, TII, TRI, RBI);
  return &*SelectInst;
}

static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) {
  switch (P) {
  default:
    llvm_unreachable("Unknown condition code!");
  case CmpInst::ICMP_NE:
    return AArch64CC::NE;
  case CmpInst::ICMP_EQ:
    return AArch64CC::EQ;
  case CmpInst::ICMP_SGT:
    return AArch64CC::GT;
  case CmpInst::ICMP_SGE:
    return AArch64CC::GE;
  case CmpInst::ICMP_SLT:
    return AArch64CC::LT;
  case CmpInst::ICMP_SLE:
    return AArch64CC::LE;
  case CmpInst::ICMP_UGT:
    return AArch64CC::HI;
  case CmpInst::ICMP_UGE:
    return AArch64CC::HS;
  case CmpInst::ICMP_ULT:
    return AArch64CC::LO;
  case CmpInst::ICMP_ULE:
    return AArch64CC::LS;
  }
}

/// changeFPCCToORAArch64CC - Convert an IR fp condition code to an AArch64 CC.
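/// E.g. FCMP_ONE has no single AArch64 condition, so it maps to the pair
/// {MI, GT}: the result holds if either "less than" or "greater than" does.
/// \code
///   AArch64CC::CondCode CC1, CC2;
///   changeFPCCToORAArch64CC(CmpInst::FCMP_ONE, CC1, CC2); // CC1 == MI, CC2 == GT
/// \endcode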
static void changeFPCCToORAArch64CC(CmpInst::Predicate CC,
                                    AArch64CC::CondCode &CondCode,
                                    AArch64CC::CondCode &CondCode2) {
  CondCode2 = AArch64CC::AL;
  switch (CC) {
  default:
    llvm_unreachable("Unknown FP condition!");
  case CmpInst::FCMP_OEQ:
    CondCode = AArch64CC::EQ;
    break;
  case CmpInst::FCMP_OGT:
    CondCode = AArch64CC::GT;
    break;
  case CmpInst::FCMP_OGE:
    CondCode = AArch64CC::GE;
    break;
  case CmpInst::FCMP_OLT:
    CondCode = AArch64CC::MI;
    break;
  case CmpInst::FCMP_OLE:
    CondCode = AArch64CC::LS;
    break;
  case CmpInst::FCMP_ONE:
    CondCode = AArch64CC::MI;
    CondCode2 = AArch64CC::GT;
    break;
  case CmpInst::FCMP_ORD:
    CondCode = AArch64CC::VC;
    break;
  case CmpInst::FCMP_UNO:
    CondCode = AArch64CC::VS;
    break;
  case CmpInst::FCMP_UEQ:
    CondCode = AArch64CC::EQ;
    CondCode2 = AArch64CC::VS;
    break;
  case CmpInst::FCMP_UGT:
    CondCode = AArch64CC::HI;
    break;
  case CmpInst::FCMP_UGE:
    CondCode = AArch64CC::PL;
    break;
  case CmpInst::FCMP_ULT:
    CondCode = AArch64CC::LT;
    break;
  case CmpInst::FCMP_ULE:
    CondCode = AArch64CC::LE;
    break;
  case CmpInst::FCMP_UNE:
    CondCode = AArch64CC::NE;
    break;
  }
}

/// Convert an IR fp condition code to an AArch64 CC.
/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
/// should be AND'ed instead of OR'ed.
static void changeFPCCToANDAArch64CC(CmpInst::Predicate CC,
                                     AArch64CC::CondCode &CondCode,
                                     AArch64CC::CondCode &CondCode2) {
  CondCode2 = AArch64CC::AL;
  switch (CC) {
  default:
    changeFPCCToORAArch64CC(CC, CondCode, CondCode2);
    assert(CondCode2 == AArch64CC::AL);
    break;
  case CmpInst::FCMP_ONE:
    // (a one b)
    // == ((a olt b) || (a ogt b))
    // == ((a ord b) && (a une b))
    CondCode = AArch64CC::VC;
    CondCode2 = AArch64CC::NE;
    break;
  case CmpInst::FCMP_UEQ:
    // (a ueq b)
    // == ((a uno b) || (a oeq b))
    // == ((a ule b) && (a uge b))
    CondCode = AArch64CC::PL;
    CondCode2 = AArch64CC::LE;
    break;
  }
}

/// Return a register which can be used as a bit to test in a TB(N)Z.
static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert,
                              MachineRegisterInfo &MRI) {
  assert(Reg.isValid() && "Expected valid register!");
  bool HasZext = false;
  while (MachineInstr *MI = getDefIgnoringCopies(Reg, MRI)) {
    unsigned Opc = MI->getOpcode();

    if (!MI->getOperand(0).isReg() ||
        !MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
      break;

    // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
    //
    // (tbz (trunc x), b) -> (tbz x, b) is always safe, because the bit number
    // on the truncated x is the same as the bit number on x.
    if (Opc == TargetOpcode::G_ANYEXT || Opc == TargetOpcode::G_ZEXT ||
        Opc == TargetOpcode::G_TRUNC) {
      if (Opc == TargetOpcode::G_ZEXT)
        HasZext = true;

      Register NextReg = MI->getOperand(1).getReg();
      // Did we find something worth folding?
      if (!NextReg.isValid() || !MRI.hasOneNonDBGUse(NextReg))
        break;

      // NextReg is worth folding. Keep looking.
      Reg = NextReg;
      continue;
    }

    // Attempt to find a suitable operation with a constant on one side.
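    // E.g. (illustrative): for (tbz (and %x, 8), 3), the mask 8 has bit 3 set,
    // so the walk below allows testing bit 3 of %x directly.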
1494 std::optional<uint64_t> C; 1495 Register TestReg; 1496 switch (Opc) { 1497 default: 1498 break; 1499 case TargetOpcode::G_AND: 1500 case TargetOpcode::G_XOR: { 1501 TestReg = MI->getOperand(1).getReg(); 1502 Register ConstantReg = MI->getOperand(2).getReg(); 1503 auto VRegAndVal = getIConstantVRegValWithLookThrough(ConstantReg, MRI); 1504 if (!VRegAndVal) { 1505 // AND commutes, check the other side for a constant. 1506 // FIXME: Can we canonicalize the constant so that it's always on the 1507 // same side at some point earlier? 1508 std::swap(ConstantReg, TestReg); 1509 VRegAndVal = getIConstantVRegValWithLookThrough(ConstantReg, MRI); 1510 } 1511 if (VRegAndVal) { 1512 if (HasZext) 1513 C = VRegAndVal->Value.getZExtValue(); 1514 else 1515 C = VRegAndVal->Value.getSExtValue(); 1516 } 1517 break; 1518 } 1519 case TargetOpcode::G_ASHR: 1520 case TargetOpcode::G_LSHR: 1521 case TargetOpcode::G_SHL: { 1522 TestReg = MI->getOperand(1).getReg(); 1523 auto VRegAndVal = 1524 getIConstantVRegValWithLookThrough(MI->getOperand(2).getReg(), MRI); 1525 if (VRegAndVal) 1526 C = VRegAndVal->Value.getSExtValue(); 1527 break; 1528 } 1529 } 1530 1531 // Didn't find a constant or viable register. Bail out of the loop. 1532 if (!C || !TestReg.isValid()) 1533 break; 1534 1535 // We found a suitable instruction with a constant. Check to see if we can 1536 // walk through the instruction. 1537 Register NextReg; 1538 unsigned TestRegSize = MRI.getType(TestReg).getSizeInBits(); 1539 switch (Opc) { 1540 default: 1541 break; 1542 case TargetOpcode::G_AND: 1543 // (tbz (and x, m), b) -> (tbz x, b) when the b-th bit of m is set. 1544 if ((*C >> Bit) & 1) 1545 NextReg = TestReg; 1546 break; 1547 case TargetOpcode::G_SHL: 1548 // (tbz (shl x, c), b) -> (tbz x, b-c) when b-c is positive and fits in 1549 // the type of the register. 1550 if (*C <= Bit && (Bit - *C) < TestRegSize) { 1551 NextReg = TestReg; 1552 Bit = Bit - *C; 1553 } 1554 break; 1555 case TargetOpcode::G_ASHR: 1556 // (tbz (ashr x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits 1557 // in x 1558 NextReg = TestReg; 1559 Bit = Bit + *C; 1560 if (Bit >= TestRegSize) 1561 Bit = TestRegSize - 1; 1562 break; 1563 case TargetOpcode::G_LSHR: 1564 // (tbz (lshr x, c), b) -> (tbz x, b+c) when b + c is < # bits in x 1565 if ((Bit + *C) < TestRegSize) { 1566 NextReg = TestReg; 1567 Bit = Bit + *C; 1568 } 1569 break; 1570 case TargetOpcode::G_XOR: 1571 // We can walk through a G_XOR by inverting whether we use tbz/tbnz when 1572 // appropriate. 1573 // 1574 // e.g. If x' = xor x, c, and the b-th bit is set in c then 1575 // 1576 // tbz x', b -> tbnz x, b 1577 // 1578 // Because x' only has the b-th bit set if x does not. 1579 if ((*C >> Bit) & 1) 1580 Invert = !Invert; 1581 NextReg = TestReg; 1582 break; 1583 } 1584 1585 // Check if we found anything worth folding. 1586 if (!NextReg.isValid()) 1587 return Reg; 1588 Reg = NextReg; 1589 } 1590 1591 return Reg; 1592 } 1593 1594 MachineInstr *AArch64InstructionSelector::emitTestBit( 1595 Register TestReg, uint64_t Bit, bool IsNegative, MachineBasicBlock *DstMBB, 1596 MachineIRBuilder &MIB) const { 1597 assert(TestReg.isValid()); 1598 assert(ProduceNonFlagSettingCondBr && 1599 "Cannot emit TB(N)Z with speculation tracking!"); 1600 MachineRegisterInfo &MRI = *MIB.getMRI(); 1601 1602 // Attempt to optimize the test bit by walking over instructions. 
1603 TestReg = getTestBitReg(TestReg, Bit, IsNegative, MRI); 1604 LLT Ty = MRI.getType(TestReg); 1605 unsigned Size = Ty.getSizeInBits(); 1606 assert(!Ty.isVector() && "Expected a scalar!"); 1607 assert(Bit < 64 && "Bit is too large!"); 1608 1609 // When the test register is a 64-bit register, we have to narrow to make 1610 // TBNZW work. 1611 bool UseWReg = Bit < 32; 1612 unsigned NecessarySize = UseWReg ? 32 : 64; 1613 if (Size != NecessarySize) 1614 TestReg = moveScalarRegClass( 1615 TestReg, UseWReg ? AArch64::GPR32RegClass : AArch64::GPR64RegClass, 1616 MIB); 1617 1618 static const unsigned OpcTable[2][2] = {{AArch64::TBZX, AArch64::TBNZX}, 1619 {AArch64::TBZW, AArch64::TBNZW}}; 1620 unsigned Opc = OpcTable[UseWReg][IsNegative]; 1621 auto TestBitMI = 1622 MIB.buildInstr(Opc).addReg(TestReg).addImm(Bit).addMBB(DstMBB); 1623 constrainSelectedInstRegOperands(*TestBitMI, TII, TRI, RBI); 1624 return &*TestBitMI; 1625 } 1626 1627 bool AArch64InstructionSelector::tryOptAndIntoCompareBranch( 1628 MachineInstr &AndInst, bool Invert, MachineBasicBlock *DstMBB, 1629 MachineIRBuilder &MIB) const { 1630 assert(AndInst.getOpcode() == TargetOpcode::G_AND && "Expected G_AND only?"); 1631 // Given something like this: 1632 // 1633 // %x = ...Something... 1634 // %one = G_CONSTANT i64 1 1635 // %zero = G_CONSTANT i64 0 1636 // %and = G_AND %x, %one 1637 // %cmp = G_ICMP intpred(ne), %and, %zero 1638 // %cmp_trunc = G_TRUNC %cmp 1639 // G_BRCOND %cmp_trunc, %bb.3 1640 // 1641 // We want to try and fold the AND into the G_BRCOND and produce either a 1642 // TBNZ (when we have intpred(ne)) or a TBZ (when we have intpred(eq)). 1643 // 1644 // In this case, we'd get 1645 // 1646 // TBNZ %x %bb.3 1647 // 1648 1649 // Check if the AND has a constant on its RHS which we can use as a mask. 1650 // If it's a power of 2, then it's the same as checking a specific bit. 1651 // (e.g, ANDing with 8 == ANDing with 000...100 == testing if bit 3 is set) 1652 auto MaybeBit = getIConstantVRegValWithLookThrough( 1653 AndInst.getOperand(2).getReg(), *MIB.getMRI()); 1654 if (!MaybeBit) 1655 return false; 1656 1657 int32_t Bit = MaybeBit->Value.exactLogBase2(); 1658 if (Bit < 0) 1659 return false; 1660 1661 Register TestReg = AndInst.getOperand(1).getReg(); 1662 1663 // Emit a TB(N)Z. 
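// emitTestBit may look even further through TestReg via getTestBitReg, so
// e.g. (sketch) for %and = G_AND (G_LSHR %y, 2), %one the branch can end up
// testing bit 2 of %y directly.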
1664 emitTestBit(TestReg, Bit, Invert, DstMBB, MIB); 1665 return true; 1666 } 1667 1668 MachineInstr *AArch64InstructionSelector::emitCBZ(Register CompareReg, 1669 bool IsNegative, 1670 MachineBasicBlock *DestMBB, 1671 MachineIRBuilder &MIB) const { 1672 assert(ProduceNonFlagSettingCondBr && "CBZ does not set flags!"); 1673 MachineRegisterInfo &MRI = *MIB.getMRI(); 1674 assert(RBI.getRegBank(CompareReg, MRI, TRI)->getID() == 1675 AArch64::GPRRegBankID && 1676 "Expected GPRs only?"); 1677 auto Ty = MRI.getType(CompareReg); 1678 unsigned Width = Ty.getSizeInBits(); 1679 assert(!Ty.isVector() && "Expected scalar only?"); 1680 assert(Width <= 64 && "Expected width to be at most 64?"); 1681 static const unsigned OpcTable[2][2] = {{AArch64::CBZW, AArch64::CBZX}, 1682 {AArch64::CBNZW, AArch64::CBNZX}}; 1683 unsigned Opc = OpcTable[IsNegative][Width == 64]; 1684 auto BranchMI = MIB.buildInstr(Opc, {}, {CompareReg}).addMBB(DestMBB); 1685 constrainSelectedInstRegOperands(*BranchMI, TII, TRI, RBI); 1686 return &*BranchMI; 1687 } 1688 1689 bool AArch64InstructionSelector::selectCompareBranchFedByFCmp( 1690 MachineInstr &I, MachineInstr &FCmp, MachineIRBuilder &MIB) const { 1691 assert(FCmp.getOpcode() == TargetOpcode::G_FCMP); 1692 assert(I.getOpcode() == TargetOpcode::G_BRCOND); 1693 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't 1694 // totally clean. Some of them require two branches to implement. 1695 auto Pred = (CmpInst::Predicate)FCmp.getOperand(1).getPredicate(); 1696 emitFPCompare(FCmp.getOperand(2).getReg(), FCmp.getOperand(3).getReg(), MIB, 1697 Pred); 1698 AArch64CC::CondCode CC1, CC2; 1699 changeFCMPPredToAArch64CC(static_cast<CmpInst::Predicate>(Pred), CC1, CC2); 1700 MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); 1701 MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC1).addMBB(DestMBB); 1702 if (CC2 != AArch64CC::AL) 1703 MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC2).addMBB(DestMBB); 1704 I.eraseFromParent(); 1705 return true; 1706 } 1707 1708 bool AArch64InstructionSelector::tryOptCompareBranchFedByICmp( 1709 MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const { 1710 assert(ICmp.getOpcode() == TargetOpcode::G_ICMP); 1711 assert(I.getOpcode() == TargetOpcode::G_BRCOND); 1712 // Attempt to optimize the G_BRCOND + G_ICMP into a TB(N)Z/CB(N)Z. 1713 // 1714 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z 1715 // instructions will not be produced, as they are conditional branch 1716 // instructions that do not set flags. 1717 if (!ProduceNonFlagSettingCondBr) 1718 return false; 1719 1720 MachineRegisterInfo &MRI = *MIB.getMRI(); 1721 MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); 1722 auto Pred = 1723 static_cast<CmpInst::Predicate>(ICmp.getOperand(1).getPredicate()); 1724 Register LHS = ICmp.getOperand(2).getReg(); 1725 Register RHS = ICmp.getOperand(3).getReg(); 1726 1727 // We're allowed to emit a TB(N)Z/CB(N)Z. Try to do that. 1728 auto VRegAndVal = getIConstantVRegValWithLookThrough(RHS, MRI); 1729 MachineInstr *AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI); 1730 1731 // When we can emit a TB(N)Z, prefer that. 1732 // 1733 // Handle non-commutative condition codes first. 1734 // Note that we don't want to do this when we have a G_AND because it can 1735 // become a tst. The tst will make the test bit in the TB(N)Z redundant. 1736 if (VRegAndVal && !AndInst) { 1737 int64_t C = VRegAndVal->Value.getSExtValue(); 1738 1739 // When we have a greater-than comparison, we can just test if the msb is 1740 // zero. 
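// (x s> -1) is equivalent to (x s>= 0), i.e. the sign bit is clear, so e.g.
// (sketch) for an s32 LHS this becomes TBZ %lhs, 31.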
1741 if (C == -1 && Pred == CmpInst::ICMP_SGT) { 1742 uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1; 1743 emitTestBit(LHS, Bit, /*IsNegative = */ false, DestMBB, MIB); 1744 I.eraseFromParent(); 1745 return true; 1746 } 1747 1748 // When we have a less than comparison, we can just test if the msb is not 1749 // zero. 1750 if (C == 0 && Pred == CmpInst::ICMP_SLT) { 1751 uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1; 1752 emitTestBit(LHS, Bit, /*IsNegative = */ true, DestMBB, MIB); 1753 I.eraseFromParent(); 1754 return true; 1755 } 1756 1757 // Inversely, if we have a signed greater-than-or-equal comparison to zero, 1758 // we can test if the msb is zero. 1759 if (C == 0 && Pred == CmpInst::ICMP_SGE) { 1760 uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1; 1761 emitTestBit(LHS, Bit, /*IsNegative = */ false, DestMBB, MIB); 1762 I.eraseFromParent(); 1763 return true; 1764 } 1765 } 1766 1767 // Attempt to handle commutative condition codes. Right now, that's only 1768 // eq/ne. 1769 if (ICmpInst::isEquality(Pred)) { 1770 if (!VRegAndVal) { 1771 std::swap(RHS, LHS); 1772 VRegAndVal = getIConstantVRegValWithLookThrough(RHS, MRI); 1773 AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI); 1774 } 1775 1776 if (VRegAndVal && VRegAndVal->Value == 0) { 1777 // If there's a G_AND feeding into this branch, try to fold it away by 1778 // emitting a TB(N)Z instead. 1779 // 1780 // Note: If we have LT, then it *is* possible to fold, but it wouldn't be 1781 // beneficial. When we have an AND and LT, we need a TST/ANDS, so folding 1782 // would be redundant. 1783 if (AndInst && 1784 tryOptAndIntoCompareBranch( 1785 *AndInst, /*Invert = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB)) { 1786 I.eraseFromParent(); 1787 return true; 1788 } 1789 1790 // Otherwise, try to emit a CB(N)Z instead. 1791 auto LHSTy = MRI.getType(LHS); 1792 if (!LHSTy.isVector() && LHSTy.getSizeInBits() <= 64) { 1793 emitCBZ(LHS, /*IsNegative = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB); 1794 I.eraseFromParent(); 1795 return true; 1796 } 1797 } 1798 } 1799 1800 return false; 1801 } 1802 1803 bool AArch64InstructionSelector::selectCompareBranchFedByICmp( 1804 MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const { 1805 assert(ICmp.getOpcode() == TargetOpcode::G_ICMP); 1806 assert(I.getOpcode() == TargetOpcode::G_BRCOND); 1807 if (tryOptCompareBranchFedByICmp(I, ICmp, MIB)) 1808 return true; 1809 1810 // Couldn't optimize. Emit a compare + a Bcc. 1811 MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); 1812 auto PredOp = ICmp.getOperand(1); 1813 emitIntegerCompare(ICmp.getOperand(2), ICmp.getOperand(3), PredOp, MIB); 1814 const AArch64CC::CondCode CC = changeICMPPredToAArch64CC( 1815 static_cast<CmpInst::Predicate>(PredOp.getPredicate())); 1816 MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC).addMBB(DestMBB); 1817 I.eraseFromParent(); 1818 return true; 1819 } 1820 1821 bool AArch64InstructionSelector::selectCompareBranch( 1822 MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) { 1823 Register CondReg = I.getOperand(0).getReg(); 1824 MachineInstr *CCMI = MRI.getVRegDef(CondReg); 1825 // Try to select the G_BRCOND using whatever is feeding the condition if 1826 // possible. 
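// If the condition is not fed by a compare, the fallback below tests bit 0 of
// the boolean condition register, e.g. (sketch)
//   G_BRCOND %c(s1), %bb.3   ->   TBNZ %c, 0, %bb.3
// unless speculative load hardening forbids non-flag-setting branches, in
// which case we emit ANDS + Bcc instead.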
1827 unsigned CCMIOpc = CCMI->getOpcode(); 1828 if (CCMIOpc == TargetOpcode::G_FCMP) 1829 return selectCompareBranchFedByFCmp(I, *CCMI, MIB); 1830 if (CCMIOpc == TargetOpcode::G_ICMP) 1831 return selectCompareBranchFedByICmp(I, *CCMI, MIB); 1832 1833 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z 1834 // instructions will not be produced, as they are conditional branch 1835 // instructions that do not set flags. 1836 if (ProduceNonFlagSettingCondBr) { 1837 emitTestBit(CondReg, /*Bit = */ 0, /*IsNegative = */ true, 1838 I.getOperand(1).getMBB(), MIB); 1839 I.eraseFromParent(); 1840 return true; 1841 } 1842 1843 // Can't emit TB(N)Z/CB(N)Z. Emit a tst + bcc instead. 1844 auto TstMI = 1845 MIB.buildInstr(AArch64::ANDSWri, {LLT::scalar(32)}, {CondReg}).addImm(1); 1846 constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI); 1847 auto Bcc = MIB.buildInstr(AArch64::Bcc) 1848 .addImm(AArch64CC::NE) 1849 .addMBB(I.getOperand(1).getMBB()); 1850 I.eraseFromParent(); 1851 return constrainSelectedInstRegOperands(*Bcc, TII, TRI, RBI); 1852 } 1853 1854 /// Returns the element immediate value of a vector shift operand if found. 1855 /// This needs to detect a splat-like operation, e.g. a G_BUILD_VECTOR. 1856 static std::optional<int64_t> getVectorShiftImm(Register Reg, 1857 MachineRegisterInfo &MRI) { 1858 assert(MRI.getType(Reg).isVector() && "Expected a *vector* shift operand"); 1859 MachineInstr *OpMI = MRI.getVRegDef(Reg); 1860 return getAArch64VectorSplatScalar(*OpMI, MRI); 1861 } 1862 1863 /// Matches and returns the shift immediate value for a SHL instruction given 1864 /// a shift operand. 1865 static std::optional<int64_t> getVectorSHLImm(LLT SrcTy, Register Reg, 1866 MachineRegisterInfo &MRI) { 1867 std::optional<int64_t> ShiftImm = getVectorShiftImm(Reg, MRI); 1868 if (!ShiftImm) 1869 return std::nullopt; 1870 // Check the immediate is in range for a SHL. 1871 int64_t Imm = *ShiftImm; 1872 if (Imm < 0) 1873 return std::nullopt; 1874 switch (SrcTy.getElementType().getSizeInBits()) { 1875 default: 1876 LLVM_DEBUG(dbgs() << "Unhandled element type for vector shift"); 1877 return std::nullopt; 1878 case 8: 1879 if (Imm > 7) 1880 return std::nullopt; 1881 break; 1882 case 16: 1883 if (Imm > 15) 1884 return std::nullopt; 1885 break; 1886 case 32: 1887 if (Imm > 31) 1888 return std::nullopt; 1889 break; 1890 case 64: 1891 if (Imm > 63) 1892 return std::nullopt; 1893 break; 1894 } 1895 return Imm; 1896 } 1897 1898 bool AArch64InstructionSelector::selectVectorSHL(MachineInstr &I, 1899 MachineRegisterInfo &MRI) { 1900 assert(I.getOpcode() == TargetOpcode::G_SHL); 1901 Register DstReg = I.getOperand(0).getReg(); 1902 const LLT Ty = MRI.getType(DstReg); 1903 Register Src1Reg = I.getOperand(1).getReg(); 1904 Register Src2Reg = I.getOperand(2).getReg(); 1905 1906 if (!Ty.isVector()) 1907 return false; 1908 1909 // Check if we have a vector of constants on RHS that we can select as the 1910 // immediate form. 1911 std::optional<int64_t> ImmVal = getVectorSHLImm(Ty, Src2Reg, MRI); 1912 1913 unsigned Opc = 0; 1914 if (Ty == LLT::fixed_vector(2, 64)) { 1915 Opc = ImmVal ? AArch64::SHLv2i64_shift : AArch64::USHLv2i64; 1916 } else if (Ty == LLT::fixed_vector(4, 32)) { 1917 Opc = ImmVal ? AArch64::SHLv4i32_shift : AArch64::USHLv4i32; 1918 } else if (Ty == LLT::fixed_vector(2, 32)) { 1919 Opc = ImmVal ? AArch64::SHLv2i32_shift : AArch64::USHLv2i32; 1920 } else if (Ty == LLT::fixed_vector(4, 16)) { 1921 Opc = ImmVal ? 
AArch64::SHLv4i16_shift : AArch64::USHLv4i16; 1922 } else if (Ty == LLT::fixed_vector(8, 16)) { 1923 Opc = ImmVal ? AArch64::SHLv8i16_shift : AArch64::USHLv8i16; 1924 } else if (Ty == LLT::fixed_vector(16, 8)) { 1925 Opc = ImmVal ? AArch64::SHLv16i8_shift : AArch64::USHLv16i8; 1926 } else if (Ty == LLT::fixed_vector(8, 8)) { 1927 Opc = ImmVal ? AArch64::SHLv8i8_shift : AArch64::USHLv8i8; 1928 } else { 1929 LLVM_DEBUG(dbgs() << "Unhandled G_SHL type"); 1930 return false; 1931 } 1932 1933 auto Shl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg}); 1934 if (ImmVal) 1935 Shl.addImm(*ImmVal); 1936 else 1937 Shl.addUse(Src2Reg); 1938 constrainSelectedInstRegOperands(*Shl, TII, TRI, RBI); 1939 I.eraseFromParent(); 1940 return true; 1941 } 1942 1943 bool AArch64InstructionSelector::selectVectorAshrLshr( 1944 MachineInstr &I, MachineRegisterInfo &MRI) { 1945 assert(I.getOpcode() == TargetOpcode::G_ASHR || 1946 I.getOpcode() == TargetOpcode::G_LSHR); 1947 Register DstReg = I.getOperand(0).getReg(); 1948 const LLT Ty = MRI.getType(DstReg); 1949 Register Src1Reg = I.getOperand(1).getReg(); 1950 Register Src2Reg = I.getOperand(2).getReg(); 1951 1952 if (!Ty.isVector()) 1953 return false; 1954 1955 bool IsASHR = I.getOpcode() == TargetOpcode::G_ASHR; 1956 1957 // We expect the immediate case to be lowered in the PostLegalCombiner to 1958 // AArch64ISD::VASHR or AArch64ISD::VLSHR equivalents. 1959 1960 // There is not a shift right register instruction, but the shift left 1961 // register instruction takes a signed value, where negative numbers specify a 1962 // right shift. 1963 1964 unsigned Opc = 0; 1965 unsigned NegOpc = 0; 1966 const TargetRegisterClass *RC = 1967 getRegClassForTypeOnBank(Ty, RBI.getRegBank(AArch64::FPRRegBankID)); 1968 if (Ty == LLT::fixed_vector(2, 64)) { 1969 Opc = IsASHR ? AArch64::SSHLv2i64 : AArch64::USHLv2i64; 1970 NegOpc = AArch64::NEGv2i64; 1971 } else if (Ty == LLT::fixed_vector(4, 32)) { 1972 Opc = IsASHR ? AArch64::SSHLv4i32 : AArch64::USHLv4i32; 1973 NegOpc = AArch64::NEGv4i32; 1974 } else if (Ty == LLT::fixed_vector(2, 32)) { 1975 Opc = IsASHR ? AArch64::SSHLv2i32 : AArch64::USHLv2i32; 1976 NegOpc = AArch64::NEGv2i32; 1977 } else if (Ty == LLT::fixed_vector(4, 16)) { 1978 Opc = IsASHR ? AArch64::SSHLv4i16 : AArch64::USHLv4i16; 1979 NegOpc = AArch64::NEGv4i16; 1980 } else if (Ty == LLT::fixed_vector(8, 16)) { 1981 Opc = IsASHR ? AArch64::SSHLv8i16 : AArch64::USHLv8i16; 1982 NegOpc = AArch64::NEGv8i16; 1983 } else if (Ty == LLT::fixed_vector(16, 8)) { 1984 Opc = IsASHR ? AArch64::SSHLv16i8 : AArch64::USHLv16i8; 1985 NegOpc = AArch64::NEGv16i8; 1986 } else if (Ty == LLT::fixed_vector(8, 8)) { 1987 Opc = IsASHR ? AArch64::SSHLv8i8 : AArch64::USHLv8i8; 1988 NegOpc = AArch64::NEGv8i8; 1989 } else { 1990 LLVM_DEBUG(dbgs() << "Unhandled G_ASHR type"); 1991 return false; 1992 } 1993 1994 auto Neg = MIB.buildInstr(NegOpc, {RC}, {Src2Reg}); 1995 constrainSelectedInstRegOperands(*Neg, TII, TRI, RBI); 1996 auto SShl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg, Neg}); 1997 constrainSelectedInstRegOperands(*SShl, TII, TRI, RBI); 1998 I.eraseFromParent(); 1999 return true; 2000 } 2001 2002 bool AArch64InstructionSelector::selectVaStartAAPCS( 2003 MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const { 2004 2005 if (STI.isCallingConvWin64(MF.getFunction().getCallingConv(), 2006 MF.getFunction().isVarArg())) 2007 return false; 2008 2009 // The layout of the va_list struct is specified in the AArch64 Procedure Call 2010 // Standard, section 10.1.5. 
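// The three pointer fields are written with pointer-sized stores and the two
// offset fields with 4-byte stores, so we emit five stores totalling 32 bytes
// (20 on ILP32); the assert at the end of the function checks that running
// total.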
2011 2012 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 2013 const unsigned PtrSize = STI.isTargetILP32() ? 4 : 8; 2014 const auto *PtrRegClass = 2015 STI.isTargetILP32() ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass; 2016 2017 const MCInstrDesc &MCIDAddAddr = 2018 TII.get(STI.isTargetILP32() ? AArch64::ADDWri : AArch64::ADDXri); 2019 const MCInstrDesc &MCIDStoreAddr = 2020 TII.get(STI.isTargetILP32() ? AArch64::STRWui : AArch64::STRXui); 2021 2022 /* 2023 * typedef struct va_list { 2024 * void * stack; // next stack param 2025 * void * gr_top; // end of GP arg reg save area 2026 * void * vr_top; // end of FP/SIMD arg reg save area 2027 * int gr_offs; // offset from gr_top to next GP register arg 2028 * int vr_offs; // offset from vr_top to next FP/SIMD register arg 2029 * } va_list; 2030 */ 2031 const auto VAList = I.getOperand(0).getReg(); 2032 2033 // Our current offset in bytes from the va_list struct (VAList). 2034 unsigned OffsetBytes = 0; 2035 2036 // Helper function to store (FrameIndex + Imm) to VAList at offset OffsetBytes 2037 // and increment OffsetBytes by PtrSize. 2038 const auto PushAddress = [&](const int FrameIndex, const int64_t Imm) { 2039 const Register Top = MRI.createVirtualRegister(PtrRegClass); 2040 auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), MCIDAddAddr) 2041 .addDef(Top) 2042 .addFrameIndex(FrameIndex) 2043 .addImm(Imm) 2044 .addImm(0); 2045 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 2046 2047 const auto *MMO = *I.memoperands_begin(); 2048 MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), MCIDStoreAddr) 2049 .addUse(Top) 2050 .addUse(VAList) 2051 .addImm(OffsetBytes / PtrSize) 2052 .addMemOperand(MF.getMachineMemOperand( 2053 MMO->getPointerInfo().getWithOffset(OffsetBytes), 2054 MachineMemOperand::MOStore, PtrSize, MMO->getBaseAlign())); 2055 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 2056 2057 OffsetBytes += PtrSize; 2058 }; 2059 2060 // void* stack at offset 0 2061 PushAddress(FuncInfo->getVarArgsStackIndex(), 0); 2062 2063 // void* gr_top at offset 8 (4 on ILP32) 2064 const unsigned GPRSize = FuncInfo->getVarArgsGPRSize(); 2065 PushAddress(FuncInfo->getVarArgsGPRIndex(), GPRSize); 2066 2067 // void* vr_top at offset 16 (8 on ILP32) 2068 const unsigned FPRSize = FuncInfo->getVarArgsFPRSize(); 2069 PushAddress(FuncInfo->getVarArgsFPRIndex(), FPRSize); 2070 2071 // Helper function to store a 4-byte integer constant to VAList at offset 2072 // OffsetBytes, and increment OffsetBytes by 4. 
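// Both calls to this helper below store negative values: e.g. (sketch)
// gr_offs is initialized to -56 when 56 bytes of GP argument registers were
// saved, matching the "offset back from gr_top/vr_top" convention in the
// struct comment above.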
2073 const auto PushIntConstant = [&](const int32_t Value) { 2074 constexpr int IntSize = 4; 2075 const Register Temp = MRI.createVirtualRegister(&AArch64::GPR32RegClass); 2076 auto MIB = 2077 BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::MOVi32imm)) 2078 .addDef(Temp) 2079 .addImm(Value); 2080 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 2081 2082 const auto *MMO = *I.memoperands_begin(); 2083 MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::STRWui)) 2084 .addUse(Temp) 2085 .addUse(VAList) 2086 .addImm(OffsetBytes / IntSize) 2087 .addMemOperand(MF.getMachineMemOperand( 2088 MMO->getPointerInfo().getWithOffset(OffsetBytes), 2089 MachineMemOperand::MOStore, IntSize, MMO->getBaseAlign())); 2090 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 2091 OffsetBytes += IntSize; 2092 }; 2093 2094 // int gr_offs at offset 24 (12 on ILP32) 2095 PushIntConstant(-static_cast<int32_t>(GPRSize)); 2096 2097 // int vr_offs at offset 28 (16 on ILP32) 2098 PushIntConstant(-static_cast<int32_t>(FPRSize)); 2099 2100 assert(OffsetBytes == (STI.isTargetILP32() ? 20 : 32) && "Unexpected offset"); 2101 2102 I.eraseFromParent(); 2103 return true; 2104 } 2105 2106 bool AArch64InstructionSelector::selectVaStartDarwin( 2107 MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const { 2108 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 2109 Register ListReg = I.getOperand(0).getReg(); 2110 2111 Register ArgsAddrReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); 2112 2113 int FrameIdx = FuncInfo->getVarArgsStackIndex(); 2114 if (MF.getSubtarget<AArch64Subtarget>().isCallingConvWin64( 2115 MF.getFunction().getCallingConv(), MF.getFunction().isVarArg())) { 2116 FrameIdx = FuncInfo->getVarArgsGPRSize() > 0 2117 ? FuncInfo->getVarArgsGPRIndex() 2118 : FuncInfo->getVarArgsStackIndex(); 2119 } 2120 2121 auto MIB = 2122 BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::ADDXri)) 2123 .addDef(ArgsAddrReg) 2124 .addFrameIndex(FrameIdx) 2125 .addImm(0) 2126 .addImm(0); 2127 2128 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 2129 2130 MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::STRXui)) 2131 .addUse(ArgsAddrReg) 2132 .addUse(ListReg) 2133 .addImm(0) 2134 .addMemOperand(*I.memoperands_begin()); 2135 2136 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 2137 I.eraseFromParent(); 2138 return true; 2139 } 2140 2141 void AArch64InstructionSelector::materializeLargeCMVal( 2142 MachineInstr &I, const Value *V, unsigned OpFlags) { 2143 MachineBasicBlock &MBB = *I.getParent(); 2144 MachineFunction &MF = *MBB.getParent(); 2145 MachineRegisterInfo &MRI = MF.getRegInfo(); 2146 2147 auto MovZ = MIB.buildInstr(AArch64::MOVZXi, {&AArch64::GPR64RegClass}, {}); 2148 MovZ->addOperand(MF, I.getOperand(1)); 2149 MovZ->getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_G0 | 2150 AArch64II::MO_NC); 2151 MovZ->addOperand(MF, MachineOperand::CreateImm(0)); 2152 constrainSelectedInstRegOperands(*MovZ, TII, TRI, RBI); 2153 2154 auto BuildMovK = [&](Register SrcReg, unsigned char Flags, unsigned Offset, 2155 Register ForceDstReg) { 2156 Register DstReg = ForceDstReg 2157 ? 
ForceDstReg 2158 : MRI.createVirtualRegister(&AArch64::GPR64RegClass); 2159 auto MovI = MIB.buildInstr(AArch64::MOVKXi).addDef(DstReg).addUse(SrcReg); 2160 if (auto *GV = dyn_cast<GlobalValue>(V)) { 2161 MovI->addOperand(MF, MachineOperand::CreateGA( 2162 GV, MovZ->getOperand(1).getOffset(), Flags)); 2163 } else { 2164 MovI->addOperand( 2165 MF, MachineOperand::CreateBA(cast<BlockAddress>(V), 2166 MovZ->getOperand(1).getOffset(), Flags)); 2167 } 2168 MovI->addOperand(MF, MachineOperand::CreateImm(Offset)); 2169 constrainSelectedInstRegOperands(*MovI, TII, TRI, RBI); 2170 return DstReg; 2171 }; 2172 Register DstReg = BuildMovK(MovZ.getReg(0), 2173 AArch64II::MO_G1 | AArch64II::MO_NC, 16, 0); 2174 DstReg = BuildMovK(DstReg, AArch64II::MO_G2 | AArch64II::MO_NC, 32, 0); 2175 BuildMovK(DstReg, AArch64II::MO_G3, 48, I.getOperand(0).getReg()); 2176 } 2177 2178 bool AArch64InstructionSelector::preISelLower(MachineInstr &I) { 2179 MachineBasicBlock &MBB = *I.getParent(); 2180 MachineFunction &MF = *MBB.getParent(); 2181 MachineRegisterInfo &MRI = MF.getRegInfo(); 2182 2183 switch (I.getOpcode()) { 2184 case TargetOpcode::G_STORE: { 2185 bool Changed = contractCrossBankCopyIntoStore(I, MRI); 2186 MachineOperand &SrcOp = I.getOperand(0); 2187 if (MRI.getType(SrcOp.getReg()).isPointer()) { 2188 // Allow matching with imported patterns for stores of pointers. Unlike 2189 // G_LOAD/G_PTR_ADD, we may not have selected all users. So, emit a copy 2190 // and constrain. 2191 auto Copy = MIB.buildCopy(LLT::scalar(64), SrcOp); 2192 Register NewSrc = Copy.getReg(0); 2193 SrcOp.setReg(NewSrc); 2194 RBI.constrainGenericRegister(NewSrc, AArch64::GPR64RegClass, MRI); 2195 Changed = true; 2196 } 2197 return Changed; 2198 } 2199 case TargetOpcode::G_PTR_ADD: 2200 return convertPtrAddToAdd(I, MRI); 2201 case TargetOpcode::G_LOAD: { 2202 // For scalar loads of pointers, we try to convert the dest type from p0 2203 // to s64 so that our imported patterns can match. Like with the G_PTR_ADD 2204 // conversion, this should be ok because all users should have been 2205 // selected already, so the type doesn't matter for them. 2206 Register DstReg = I.getOperand(0).getReg(); 2207 const LLT DstTy = MRI.getType(DstReg); 2208 if (!DstTy.isPointer()) 2209 return false; 2210 MRI.setType(DstReg, LLT::scalar(64)); 2211 return true; 2212 } 2213 case AArch64::G_DUP: { 2214 // Convert the type from p0 to s64 to help selection. 2215 LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 2216 if (!DstTy.isPointerVector()) 2217 return false; 2218 auto NewSrc = MIB.buildCopy(LLT::scalar(64), I.getOperand(1).getReg()); 2219 MRI.setType(I.getOperand(0).getReg(), 2220 DstTy.changeElementType(LLT::scalar(64))); 2221 MRI.setRegClass(NewSrc.getReg(0), &AArch64::GPR64RegClass); 2222 I.getOperand(1).setReg(NewSrc.getReg(0)); 2223 return true; 2224 } 2225 case AArch64::G_INSERT_VECTOR_ELT: { 2226 // Convert the type from p0 to s64 to help selection. 
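// e.g. (sketch) an insert of a p0 element into a <2 x p0> vector is retyped
// as an insert of an s64 into <2 x s64>, with a copy of the scalar onto the
// GPR64 class, so the imported patterns can match it.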
2227 LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 2228 LLT SrcVecTy = MRI.getType(I.getOperand(1).getReg()); 2229 if (!SrcVecTy.isPointerVector()) 2230 return false; 2231 auto NewSrc = MIB.buildCopy(LLT::scalar(64), I.getOperand(2).getReg()); 2232 MRI.setType(I.getOperand(1).getReg(), 2233 DstTy.changeElementType(LLT::scalar(64))); 2234 MRI.setType(I.getOperand(0).getReg(), 2235 DstTy.changeElementType(LLT::scalar(64))); 2236 MRI.setRegClass(NewSrc.getReg(0), &AArch64::GPR64RegClass); 2237 I.getOperand(2).setReg(NewSrc.getReg(0)); 2238 return true; 2239 } 2240 case TargetOpcode::G_UITOFP: 2241 case TargetOpcode::G_SITOFP: { 2242 // If both source and destination regbanks are FPR, then convert the opcode 2243 // to G_SITOF so that the importer can select it to an fpr variant. 2244 // Otherwise, it ends up matching an fpr/gpr variant and adding a cross-bank 2245 // copy. 2246 Register SrcReg = I.getOperand(1).getReg(); 2247 LLT SrcTy = MRI.getType(SrcReg); 2248 LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 2249 if (SrcTy.isVector() || SrcTy.getSizeInBits() != DstTy.getSizeInBits()) 2250 return false; 2251 2252 if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::FPRRegBankID) { 2253 if (I.getOpcode() == TargetOpcode::G_SITOFP) 2254 I.setDesc(TII.get(AArch64::G_SITOF)); 2255 else 2256 I.setDesc(TII.get(AArch64::G_UITOF)); 2257 return true; 2258 } 2259 return false; 2260 } 2261 default: 2262 return false; 2263 } 2264 } 2265 2266 /// This lowering tries to look for G_PTR_ADD instructions and then converts 2267 /// them to a standard G_ADD with a COPY on the source. 2268 /// 2269 /// The motivation behind this is to expose the add semantics to the imported 2270 /// tablegen patterns. We shouldn't need to check for uses being loads/stores, 2271 /// because the selector works bottom up, uses before defs. By the time we 2272 /// end up trying to select a G_PTR_ADD, we should have already attempted to 2273 /// fold this into addressing modes and were therefore unsuccessful. 2274 bool AArch64InstructionSelector::convertPtrAddToAdd( 2275 MachineInstr &I, MachineRegisterInfo &MRI) { 2276 assert(I.getOpcode() == TargetOpcode::G_PTR_ADD && "Expected G_PTR_ADD"); 2277 Register DstReg = I.getOperand(0).getReg(); 2278 Register AddOp1Reg = I.getOperand(1).getReg(); 2279 const LLT PtrTy = MRI.getType(DstReg); 2280 if (PtrTy.getAddressSpace() != 0) 2281 return false; 2282 2283 const LLT CastPtrTy = 2284 PtrTy.isVector() ? LLT::fixed_vector(2, 64) : LLT::scalar(64); 2285 auto PtrToInt = MIB.buildPtrToInt(CastPtrTy, AddOp1Reg); 2286 // Set regbanks on the registers. 2287 if (PtrTy.isVector()) 2288 MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::FPRRegBankID)); 2289 else 2290 MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID)); 2291 2292 // Now turn the %dst(p0) = G_PTR_ADD %base, off into: 2293 // %dst(intty) = G_ADD %intbase, off 2294 I.setDesc(TII.get(TargetOpcode::G_ADD)); 2295 MRI.setType(DstReg, CastPtrTy); 2296 I.getOperand(1).setReg(PtrToInt.getReg(0)); 2297 if (!select(*PtrToInt)) { 2298 LLVM_DEBUG(dbgs() << "Failed to select G_PTRTOINT in convertPtrAddToAdd"); 2299 return false; 2300 } 2301 2302 // Also take the opportunity here to try to do some optimization. 2303 // Try to convert this into a G_SUB if the offset is a 0-x negate idiom. 
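// e.g. (sketch) %dst = G_PTR_ADD %p, (G_SUB 0, %x) becomes
// %dst = G_SUB %intp, %x after the conversion above.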
2304 Register NegatedReg; 2305 if (!mi_match(I.getOperand(2).getReg(), MRI, m_Neg(m_Reg(NegatedReg)))) 2306 return true; 2307 I.getOperand(2).setReg(NegatedReg); 2308 I.setDesc(TII.get(TargetOpcode::G_SUB)); 2309 return true; 2310 } 2311 2312 bool AArch64InstructionSelector::earlySelectSHL(MachineInstr &I, 2313 MachineRegisterInfo &MRI) { 2314 // We try to match the immediate variant of LSL, which is actually an alias 2315 // for a special case of UBFM. Otherwise, we fall back to the imported 2316 // selector which will match the register variant. 2317 assert(I.getOpcode() == TargetOpcode::G_SHL && "unexpected op"); 2318 const auto &MO = I.getOperand(2); 2319 auto VRegAndVal = getIConstantVRegVal(MO.getReg(), MRI); 2320 if (!VRegAndVal) 2321 return false; 2322 2323 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 2324 if (DstTy.isVector()) 2325 return false; 2326 bool Is64Bit = DstTy.getSizeInBits() == 64; 2327 auto Imm1Fn = Is64Bit ? selectShiftA_64(MO) : selectShiftA_32(MO); 2328 auto Imm2Fn = Is64Bit ? selectShiftB_64(MO) : selectShiftB_32(MO); 2329 2330 if (!Imm1Fn || !Imm2Fn) 2331 return false; 2332 2333 auto NewI = 2334 MIB.buildInstr(Is64Bit ? AArch64::UBFMXri : AArch64::UBFMWri, 2335 {I.getOperand(0).getReg()}, {I.getOperand(1).getReg()}); 2336 2337 for (auto &RenderFn : *Imm1Fn) 2338 RenderFn(NewI); 2339 for (auto &RenderFn : *Imm2Fn) 2340 RenderFn(NewI); 2341 2342 I.eraseFromParent(); 2343 return constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI); 2344 } 2345 2346 bool AArch64InstructionSelector::contractCrossBankCopyIntoStore( 2347 MachineInstr &I, MachineRegisterInfo &MRI) { 2348 assert(I.getOpcode() == TargetOpcode::G_STORE && "Expected G_STORE"); 2349 // If we're storing a scalar, it doesn't matter what register bank that 2350 // scalar is on. All that matters is the size. 2351 // 2352 // So, if we see something like this (with a 32-bit scalar as an example): 2353 // 2354 // %x:gpr(s32) = ... something ... 2355 // %y:fpr(s32) = COPY %x:gpr(s32) 2356 // G_STORE %y:fpr(s32) 2357 // 2358 // We can fix this up into something like this: 2359 // 2360 // G_STORE %x:gpr(s32) 2361 // 2362 // And then continue the selection process normally. 2363 Register DefDstReg = getSrcRegIgnoringCopies(I.getOperand(0).getReg(), MRI); 2364 if (!DefDstReg.isValid()) 2365 return false; 2366 LLT DefDstTy = MRI.getType(DefDstReg); 2367 Register StoreSrcReg = I.getOperand(0).getReg(); 2368 LLT StoreSrcTy = MRI.getType(StoreSrcReg); 2369 2370 // If we get something strange like a physical register, then we shouldn't 2371 // go any further. 2372 if (!DefDstTy.isValid()) 2373 return false; 2374 2375 // Are the source and dst types the same size? 2376 if (DefDstTy.getSizeInBits() != StoreSrcTy.getSizeInBits()) 2377 return false; 2378 2379 if (RBI.getRegBank(StoreSrcReg, MRI, TRI) == 2380 RBI.getRegBank(DefDstReg, MRI, TRI)) 2381 return false; 2382 2383 // We have a cross-bank copy, which is entering a store. Let's fold it. 
2384 I.getOperand(0).setReg(DefDstReg); 2385 return true; 2386 } 2387 2388 bool AArch64InstructionSelector::earlySelect(MachineInstr &I) { 2389 assert(I.getParent() && "Instruction should be in a basic block!"); 2390 assert(I.getParent()->getParent() && "Instruction should be in a function!"); 2391 2392 MachineBasicBlock &MBB = *I.getParent(); 2393 MachineFunction &MF = *MBB.getParent(); 2394 MachineRegisterInfo &MRI = MF.getRegInfo(); 2395 2396 switch (I.getOpcode()) { 2397 case AArch64::G_DUP: { 2398 // Before selecting a DUP instruction, check if it is better selected as a 2399 // MOV or load from a constant pool. 2400 Register Src = I.getOperand(1).getReg(); 2401 auto ValAndVReg = getAnyConstantVRegValWithLookThrough(Src, MRI); 2402 if (!ValAndVReg) 2403 return false; 2404 LLVMContext &Ctx = MF.getFunction().getContext(); 2405 Register Dst = I.getOperand(0).getReg(); 2406 auto *CV = ConstantDataVector::getSplat( 2407 MRI.getType(Dst).getNumElements(), 2408 ConstantInt::get( 2409 Type::getIntNTy(Ctx, MRI.getType(Dst).getScalarSizeInBits()), 2410 ValAndVReg->Value.trunc(MRI.getType(Dst).getScalarSizeInBits()))); 2411 if (!emitConstantVector(Dst, CV, MIB, MRI)) 2412 return false; 2413 I.eraseFromParent(); 2414 return true; 2415 } 2416 case TargetOpcode::G_SEXT: 2417 // Check for i64 sext(i32 vector_extract) prior to tablegen to select SMOV 2418 // over a normal extend. 2419 if (selectUSMovFromExtend(I, MRI)) 2420 return true; 2421 return false; 2422 case TargetOpcode::G_BR: 2423 return false; 2424 case TargetOpcode::G_SHL: 2425 return earlySelectSHL(I, MRI); 2426 case TargetOpcode::G_CONSTANT: { 2427 bool IsZero = false; 2428 if (I.getOperand(1).isCImm()) 2429 IsZero = I.getOperand(1).getCImm()->isZero(); 2430 else if (I.getOperand(1).isImm()) 2431 IsZero = I.getOperand(1).getImm() == 0; 2432 2433 if (!IsZero) 2434 return false; 2435 2436 Register DefReg = I.getOperand(0).getReg(); 2437 LLT Ty = MRI.getType(DefReg); 2438 if (Ty.getSizeInBits() == 64) { 2439 I.getOperand(1).ChangeToRegister(AArch64::XZR, false); 2440 RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI); 2441 } else if (Ty.getSizeInBits() == 32) { 2442 I.getOperand(1).ChangeToRegister(AArch64::WZR, false); 2443 RBI.constrainGenericRegister(DefReg, AArch64::GPR32RegClass, MRI); 2444 } else 2445 return false; 2446 2447 I.setDesc(TII.get(TargetOpcode::COPY)); 2448 return true; 2449 } 2450 2451 case TargetOpcode::G_ADD: { 2452 // Check if this is being fed by a G_ICMP on either side. 2453 // 2454 // (cmp pred, x, y) + z 2455 // 2456 // In the above case, when the cmp is true, we increment z by 1. So, we can 2457 // fold the add into the cset for the cmp by using cinc. 2458 // 2459 // FIXME: This would probably be a lot nicer in PostLegalizerLowering. 2460 Register AddDst = I.getOperand(0).getReg(); 2461 Register AddLHS = I.getOperand(1).getReg(); 2462 Register AddRHS = I.getOperand(2).getReg(); 2463 // Only handle scalars. 2464 LLT Ty = MRI.getType(AddLHS); 2465 if (Ty.isVector()) 2466 return false; 2467 // Since G_ICMP is modeled as ADDS/SUBS/ANDS, we can handle 32 bits or 64 2468 // bits. 2469 unsigned Size = Ty.getSizeInBits(); 2470 if (Size != 32 && Size != 64) 2471 return false; 2472 auto MatchCmp = [&](Register Reg) -> MachineInstr * { 2473 if (!MRI.hasOneNonDBGUse(Reg)) 2474 return nullptr; 2475 // If the LHS of the add is 32 bits, then we want to fold a 32-bit 2476 // compare. 
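// For the 64-bit case we instead expect a pattern like (sketch):
//   %c:(s32) = G_ICMP pred, %x:(s64), %y:(s64)
//   %z:(s64) = G_ZEXT %c
//   %r:(s64) = G_ADD %w, %z
// which the mi_match below peels back to the G_ICMP.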
2477 if (Size == 32) 2478 return getOpcodeDef(TargetOpcode::G_ICMP, Reg, MRI); 2479 // We model scalar compares using 32-bit destinations right now. 2480 // If it's a 64-bit compare, it'll have 64-bit sources. 2481 Register ZExt; 2482 if (!mi_match(Reg, MRI, 2483 m_OneNonDBGUse(m_GZExt(m_OneNonDBGUse(m_Reg(ZExt)))))) 2484 return nullptr; 2485 auto *Cmp = getOpcodeDef(TargetOpcode::G_ICMP, ZExt, MRI); 2486 if (!Cmp || 2487 MRI.getType(Cmp->getOperand(2).getReg()).getSizeInBits() != 64) 2488 return nullptr; 2489 return Cmp; 2490 }; 2491 // Try to match 2492 // z + (cmp pred, x, y) 2493 MachineInstr *Cmp = MatchCmp(AddRHS); 2494 if (!Cmp) { 2495 // (cmp pred, x, y) + z 2496 std::swap(AddLHS, AddRHS); 2497 Cmp = MatchCmp(AddRHS); 2498 if (!Cmp) 2499 return false; 2500 } 2501 auto &PredOp = Cmp->getOperand(1); 2502 auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate()); 2503 const AArch64CC::CondCode InvCC = 2504 changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred)); 2505 MIB.setInstrAndDebugLoc(I); 2506 emitIntegerCompare(/*LHS=*/Cmp->getOperand(2), 2507 /*RHS=*/Cmp->getOperand(3), PredOp, MIB); 2508 emitCSINC(/*Dst=*/AddDst, /*Src =*/AddLHS, /*Src2=*/AddLHS, InvCC, MIB); 2509 I.eraseFromParent(); 2510 return true; 2511 } 2512 case TargetOpcode::G_OR: { 2513 // Look for operations that take the lower `Width=Size-ShiftImm` bits of 2514 // `ShiftSrc` and insert them into the upper `Width` bits of `MaskSrc` via 2515 // shifting and masking that we can replace with a BFI (encoded as a BFM). 2516 Register Dst = I.getOperand(0).getReg(); 2517 LLT Ty = MRI.getType(Dst); 2518 2519 if (!Ty.isScalar()) 2520 return false; 2521 2522 unsigned Size = Ty.getSizeInBits(); 2523 if (Size != 32 && Size != 64) 2524 return false; 2525 2526 Register ShiftSrc; 2527 int64_t ShiftImm; 2528 Register MaskSrc; 2529 int64_t MaskImm; 2530 if (!mi_match( 2531 Dst, MRI, 2532 m_GOr(m_OneNonDBGUse(m_GShl(m_Reg(ShiftSrc), m_ICst(ShiftImm))), 2533 m_OneNonDBGUse(m_GAnd(m_Reg(MaskSrc), m_ICst(MaskImm)))))) 2534 return false; 2535 2536 if (ShiftImm > Size || ((1ULL << ShiftImm) - 1ULL) != uint64_t(MaskImm)) 2537 return false; 2538 2539 int64_t Immr = Size - ShiftImm; 2540 int64_t Imms = Size - ShiftImm - 1; 2541 unsigned Opc = Size == 32 ? AArch64::BFMWri : AArch64::BFMXri; 2542 emitInstr(Opc, {Dst}, {MaskSrc, ShiftSrc, Immr, Imms}, MIB); 2543 I.eraseFromParent(); 2544 return true; 2545 } 2546 case TargetOpcode::G_FENCE: { 2547 if (I.getOperand(1).getImm() == 0) 2548 BuildMI(MBB, I, MIMetadata(I), TII.get(TargetOpcode::MEMBARRIER)); 2549 else 2550 BuildMI(MBB, I, MIMetadata(I), TII.get(AArch64::DMB)) 2551 .addImm(I.getOperand(0).getImm() == 4 ? 0x9 : 0xb); 2552 I.eraseFromParent(); 2553 return true; 2554 } 2555 default: 2556 return false; 2557 } 2558 } 2559 2560 bool AArch64InstructionSelector::select(MachineInstr &I) { 2561 assert(I.getParent() && "Instruction should be in a basic block!"); 2562 assert(I.getParent()->getParent() && "Instruction should be in a function!"); 2563 2564 MachineBasicBlock &MBB = *I.getParent(); 2565 MachineFunction &MF = *MBB.getParent(); 2566 MachineRegisterInfo &MRI = MF.getRegInfo(); 2567 2568 const AArch64Subtarget *Subtarget = &MF.getSubtarget<AArch64Subtarget>(); 2569 if (Subtarget->requiresStrictAlign()) { 2570 // We don't support this feature yet. 
2571 LLVM_DEBUG(dbgs() << "AArch64 GISel does not support strict-align yet\n"); 2572 return false; 2573 } 2574 2575 MIB.setInstrAndDebugLoc(I); 2576 2577 unsigned Opcode = I.getOpcode(); 2578 // G_PHI requires same handling as PHI 2579 if (!I.isPreISelOpcode() || Opcode == TargetOpcode::G_PHI) { 2580 // Certain non-generic instructions also need some special handling. 2581 2582 if (Opcode == TargetOpcode::LOAD_STACK_GUARD) 2583 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2584 2585 if (Opcode == TargetOpcode::PHI || Opcode == TargetOpcode::G_PHI) { 2586 const Register DefReg = I.getOperand(0).getReg(); 2587 const LLT DefTy = MRI.getType(DefReg); 2588 2589 const RegClassOrRegBank &RegClassOrBank = 2590 MRI.getRegClassOrRegBank(DefReg); 2591 2592 const TargetRegisterClass *DefRC = 2593 dyn_cast<const TargetRegisterClass *>(RegClassOrBank); 2594 if (!DefRC) { 2595 if (!DefTy.isValid()) { 2596 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n"); 2597 return false; 2598 } 2599 const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank); 2600 DefRC = getRegClassForTypeOnBank(DefTy, RB); 2601 if (!DefRC) { 2602 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n"); 2603 return false; 2604 } 2605 } 2606 2607 I.setDesc(TII.get(TargetOpcode::PHI)); 2608 2609 return RBI.constrainGenericRegister(DefReg, *DefRC, MRI); 2610 } 2611 2612 if (I.isCopy()) 2613 return selectCopy(I, TII, MRI, TRI, RBI); 2614 2615 if (I.isDebugInstr()) 2616 return selectDebugInstr(I, MRI, RBI); 2617 2618 return true; 2619 } 2620 2621 2622 if (I.getNumOperands() != I.getNumExplicitOperands()) { 2623 LLVM_DEBUG( 2624 dbgs() << "Generic instruction has unexpected implicit operands\n"); 2625 return false; 2626 } 2627 2628 // Try to do some lowering before we start instruction selecting. These 2629 // lowerings are purely transformations on the input G_MIR and so selection 2630 // must continue after any modification of the instruction. 2631 if (preISelLower(I)) { 2632 Opcode = I.getOpcode(); // The opcode may have been modified, refresh it. 2633 } 2634 2635 // There may be patterns where the importer can't deal with them optimally, 2636 // but does select it to a suboptimal sequence so our custom C++ selection 2637 // code later never has a chance to work on it. Therefore, we have an early 2638 // selection attempt here to give priority to certain selection routines 2639 // over the imported ones. 2640 if (earlySelect(I)) 2641 return true; 2642 2643 if (selectImpl(I, *CoverageInfo)) 2644 return true; 2645 2646 LLT Ty = 2647 I.getOperand(0).isReg() ? 
MRI.getType(I.getOperand(0).getReg()) : LLT{};
2648
2649 switch (Opcode) {
2650 case TargetOpcode::G_SBFX:
2651 case TargetOpcode::G_UBFX: {
2652 static const unsigned OpcTable[2][2] = {
2653 {AArch64::UBFMWri, AArch64::UBFMXri},
2654 {AArch64::SBFMWri, AArch64::SBFMXri}};
2655 bool IsSigned = Opcode == TargetOpcode::G_SBFX;
2656 unsigned Size = Ty.getSizeInBits();
2657 unsigned Opc = OpcTable[IsSigned][Size == 64];
2658 auto Cst1 =
2659 getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), MRI);
2660 assert(Cst1 && "Should have gotten a constant for src 1?");
2661 auto Cst2 =
2662 getIConstantVRegValWithLookThrough(I.getOperand(3).getReg(), MRI);
2663 assert(Cst2 && "Should have gotten a constant for src 2?");
2664 auto LSB = Cst1->Value.getZExtValue();
2665 auto Width = Cst2->Value.getZExtValue();
2666 auto BitfieldInst =
2667 MIB.buildInstr(Opc, {I.getOperand(0)}, {I.getOperand(1)})
2668 .addImm(LSB)
2669 .addImm(LSB + Width - 1);
2670 I.eraseFromParent();
2671 return constrainSelectedInstRegOperands(*BitfieldInst, TII, TRI, RBI);
2672 }
2673 case TargetOpcode::G_BRCOND:
2674 return selectCompareBranch(I, MF, MRI);
2675
2676 case TargetOpcode::G_BRINDIRECT: {
2677 const Function &Fn = MF.getFunction();
2678 if (std::optional<uint16_t> BADisc =
2679 STI.getPtrAuthBlockAddressDiscriminatorIfEnabled(Fn)) {
2680 auto MI = MIB.buildInstr(AArch64::BRA, {}, {I.getOperand(0).getReg()});
2681 MI.addImm(AArch64PACKey::IA);
2682 MI.addImm(*BADisc);
2683 MI.addReg(/*AddrDisc=*/AArch64::XZR);
2684 I.eraseFromParent();
2685 return constrainSelectedInstRegOperands(*MI, TII, TRI, RBI);
2686 }
2687 I.setDesc(TII.get(AArch64::BR));
2688 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2689 }
2690
2691 case TargetOpcode::G_BRJT:
2692 return selectBrJT(I, MRI);
2693
2694 case AArch64::G_ADD_LOW: {
2695 // This op may have been separated from its ADRP companion by the localizer
2696 // or some other code motion pass. Given that many CPUs will try to
2697 // macro fuse these operations anyway, select this into a MOVaddr pseudo
2698 // which will later be expanded into an ADRP+ADD pair after scheduling.
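// i.e. (sketch)
//   %page = ADRP @sym
//   %addr = G_ADD_LOW %page, @sym
// is selected into %addr = MOVaddr @sym, @sym, which expands back into an
// adjacent ADRP + ADDXri pair after scheduling.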
2699 MachineInstr *BaseMI = MRI.getVRegDef(I.getOperand(1).getReg()); 2700 if (BaseMI->getOpcode() != AArch64::ADRP) { 2701 I.setDesc(TII.get(AArch64::ADDXri)); 2702 I.addOperand(MachineOperand::CreateImm(0)); 2703 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2704 } 2705 assert(TM.getCodeModel() == CodeModel::Small && 2706 "Expected small code model"); 2707 auto Op1 = BaseMI->getOperand(1); 2708 auto Op2 = I.getOperand(2); 2709 auto MovAddr = MIB.buildInstr(AArch64::MOVaddr, {I.getOperand(0)}, {}) 2710 .addGlobalAddress(Op1.getGlobal(), Op1.getOffset(), 2711 Op1.getTargetFlags()) 2712 .addGlobalAddress(Op2.getGlobal(), Op2.getOffset(), 2713 Op2.getTargetFlags()); 2714 I.eraseFromParent(); 2715 return constrainSelectedInstRegOperands(*MovAddr, TII, TRI, RBI); 2716 } 2717 2718 case TargetOpcode::G_FCONSTANT: 2719 case TargetOpcode::G_CONSTANT: { 2720 const bool isFP = Opcode == TargetOpcode::G_FCONSTANT; 2721 2722 const LLT s8 = LLT::scalar(8); 2723 const LLT s16 = LLT::scalar(16); 2724 const LLT s32 = LLT::scalar(32); 2725 const LLT s64 = LLT::scalar(64); 2726 const LLT s128 = LLT::scalar(128); 2727 const LLT p0 = LLT::pointer(0, 64); 2728 2729 const Register DefReg = I.getOperand(0).getReg(); 2730 const LLT DefTy = MRI.getType(DefReg); 2731 const unsigned DefSize = DefTy.getSizeInBits(); 2732 const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); 2733 2734 // FIXME: Redundant check, but even less readable when factored out. 2735 if (isFP) { 2736 if (Ty != s16 && Ty != s32 && Ty != s64 && Ty != s128) { 2737 LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty 2738 << " constant, expected: " << s16 << " or " << s32 2739 << " or " << s64 << " or " << s128 << '\n'); 2740 return false; 2741 } 2742 2743 if (RB.getID() != AArch64::FPRRegBankID) { 2744 LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty 2745 << " constant on bank: " << RB 2746 << ", expected: FPR\n"); 2747 return false; 2748 } 2749 2750 // The case when we have 0.0 is covered by tablegen. Reject it here so we 2751 // can be sure tablegen works correctly and isn't rescued by this code. 2752 // 0.0 is not covered by tablegen for FP128. So we will handle this 2753 // scenario in the code here. 2754 if (DefSize != 128 && I.getOperand(1).getFPImm()->isExactlyValue(0.0)) 2755 return false; 2756 } else { 2757 // s32 and s64 are covered by tablegen. 2758 if (Ty != p0 && Ty != s8 && Ty != s16) { 2759 LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty 2760 << " constant, expected: " << s32 << ", " << s64 2761 << ", or " << p0 << '\n'); 2762 return false; 2763 } 2764 2765 if (RB.getID() != AArch64::GPRRegBankID) { 2766 LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty 2767 << " constant on bank: " << RB 2768 << ", expected: GPR\n"); 2769 return false; 2770 } 2771 } 2772 2773 if (isFP) { 2774 const TargetRegisterClass &FPRRC = *getRegClassForTypeOnBank(DefTy, RB); 2775 // For 16, 64, and 128b values, emit a constant pool load. 2776 switch (DefSize) { 2777 default: 2778 llvm_unreachable("Unexpected destination size for G_FCONSTANT?"); 2779 case 32: 2780 case 64: { 2781 bool OptForSize = shouldOptForSize(&MF); 2782 const auto &TLI = MF.getSubtarget().getTargetLowering(); 2783 // If TLI says that this fpimm is illegal, then we'll expand to a 2784 // constant pool load. 
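// (A legal fpimm, e.g. one encodable as an FMOV immediate, instead breaks
// out of the switch to the GPR materialization + copy path below.)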
2785 if (TLI->isFPImmLegal(I.getOperand(1).getFPImm()->getValueAPF(), 2786 EVT::getFloatingPointVT(DefSize), OptForSize)) 2787 break; 2788 [[fallthrough]]; 2789 } 2790 case 16: 2791 case 128: { 2792 auto *FPImm = I.getOperand(1).getFPImm(); 2793 auto *LoadMI = emitLoadFromConstantPool(FPImm, MIB); 2794 if (!LoadMI) { 2795 LLVM_DEBUG(dbgs() << "Failed to load double constant pool entry\n"); 2796 return false; 2797 } 2798 MIB.buildCopy({DefReg}, {LoadMI->getOperand(0).getReg()}); 2799 I.eraseFromParent(); 2800 return RBI.constrainGenericRegister(DefReg, FPRRC, MRI); 2801 } 2802 } 2803 2804 assert((DefSize == 32 || DefSize == 64) && "Unexpected const def size"); 2805 // Either emit a FMOV, or emit a copy to emit a normal mov. 2806 const Register DefGPRReg = MRI.createVirtualRegister( 2807 DefSize == 32 ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass); 2808 MachineOperand &RegOp = I.getOperand(0); 2809 RegOp.setReg(DefGPRReg); 2810 MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator())); 2811 MIB.buildCopy({DefReg}, {DefGPRReg}); 2812 2813 if (!RBI.constrainGenericRegister(DefReg, FPRRC, MRI)) { 2814 LLVM_DEBUG(dbgs() << "Failed to constrain G_FCONSTANT def operand\n"); 2815 return false; 2816 } 2817 2818 MachineOperand &ImmOp = I.getOperand(1); 2819 // FIXME: Is going through int64_t always correct? 2820 ImmOp.ChangeToImmediate( 2821 ImmOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue()); 2822 } else if (I.getOperand(1).isCImm()) { 2823 uint64_t Val = I.getOperand(1).getCImm()->getZExtValue(); 2824 I.getOperand(1).ChangeToImmediate(Val); 2825 } else if (I.getOperand(1).isImm()) { 2826 uint64_t Val = I.getOperand(1).getImm(); 2827 I.getOperand(1).ChangeToImmediate(Val); 2828 } 2829 2830 const unsigned MovOpc = 2831 DefSize == 64 ? AArch64::MOVi64imm : AArch64::MOVi32imm; 2832 I.setDesc(TII.get(MovOpc)); 2833 constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2834 return true; 2835 } 2836 case TargetOpcode::G_EXTRACT: { 2837 Register DstReg = I.getOperand(0).getReg(); 2838 Register SrcReg = I.getOperand(1).getReg(); 2839 LLT SrcTy = MRI.getType(SrcReg); 2840 LLT DstTy = MRI.getType(DstReg); 2841 (void)DstTy; 2842 unsigned SrcSize = SrcTy.getSizeInBits(); 2843 2844 if (SrcTy.getSizeInBits() > 64) { 2845 // This should be an extract of an s128, which is like a vector extract. 2846 if (SrcTy.getSizeInBits() != 128) 2847 return false; 2848 // Only support extracting 64 bits from an s128 at the moment. 2849 if (DstTy.getSizeInBits() != 64) 2850 return false; 2851 2852 unsigned Offset = I.getOperand(2).getImm(); 2853 if (Offset % 64 != 0) 2854 return false; 2855 2856 // Check we have the right regbank always. 2857 const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI); 2858 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); 2859 assert(SrcRB.getID() == DstRB.getID() && "Wrong extract regbank!"); 2860 2861 if (SrcRB.getID() == AArch64::GPRRegBankID) { 2862 auto NewI = 2863 MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}) 2864 .addUse(SrcReg, 0, 2865 Offset == 0 ? AArch64::sube64 : AArch64::subo64); 2866 constrainOperandRegClass(MF, TRI, MRI, TII, RBI, *NewI, 2867 AArch64::GPR64RegClass, NewI->getOperand(0)); 2868 I.eraseFromParent(); 2869 return true; 2870 } 2871 2872 // Emit the same code as a vector extract. 2873 // Offset must be a multiple of 64. 
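// e.g. (sketch) extracting bits [127:64] of an s128 on the FPR bank is
// emitted as a lane-1 extract of a 64-bit element.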
2874 unsigned LaneIdx = Offset / 64; 2875 MachineInstr *Extract = emitExtractVectorElt( 2876 DstReg, DstRB, LLT::scalar(64), SrcReg, LaneIdx, MIB); 2877 if (!Extract) 2878 return false; 2879 I.eraseFromParent(); 2880 return true; 2881 } 2882 2883 I.setDesc(TII.get(SrcSize == 64 ? AArch64::UBFMXri : AArch64::UBFMWri)); 2884 MachineInstrBuilder(MF, I).addImm(I.getOperand(2).getImm() + 2885 Ty.getSizeInBits() - 1); 2886 2887 if (SrcSize < 64) { 2888 assert(SrcSize == 32 && DstTy.getSizeInBits() == 16 && 2889 "unexpected G_EXTRACT types"); 2890 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2891 } 2892 2893 DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); 2894 MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator())); 2895 MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {}) 2896 .addReg(DstReg, 0, AArch64::sub_32); 2897 RBI.constrainGenericRegister(I.getOperand(0).getReg(), 2898 AArch64::GPR32RegClass, MRI); 2899 I.getOperand(0).setReg(DstReg); 2900 2901 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2902 } 2903 2904 case TargetOpcode::G_INSERT: { 2905 LLT SrcTy = MRI.getType(I.getOperand(2).getReg()); 2906 LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 2907 unsigned DstSize = DstTy.getSizeInBits(); 2908 // Larger inserts are vectors, same-size ones should be something else by 2909 // now (split up or turned into COPYs). 2910 if (Ty.getSizeInBits() > 64 || SrcTy.getSizeInBits() > 32) 2911 return false; 2912 2913 I.setDesc(TII.get(DstSize == 64 ? AArch64::BFMXri : AArch64::BFMWri)); 2914 unsigned LSB = I.getOperand(3).getImm(); 2915 unsigned Width = MRI.getType(I.getOperand(2).getReg()).getSizeInBits(); 2916 I.getOperand(3).setImm((DstSize - LSB) % DstSize); 2917 MachineInstrBuilder(MF, I).addImm(Width - 1); 2918 2919 if (DstSize < 64) { 2920 assert(DstSize == 32 && SrcTy.getSizeInBits() == 16 && 2921 "unexpected G_INSERT types"); 2922 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2923 } 2924 2925 Register SrcReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); 2926 BuildMI(MBB, I.getIterator(), I.getDebugLoc(), 2927 TII.get(AArch64::SUBREG_TO_REG)) 2928 .addDef(SrcReg) 2929 .addImm(0) 2930 .addUse(I.getOperand(2).getReg()) 2931 .addImm(AArch64::sub_32); 2932 RBI.constrainGenericRegister(I.getOperand(2).getReg(), 2933 AArch64::GPR32RegClass, MRI); 2934 I.getOperand(2).setReg(SrcReg); 2935 2936 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2937 } 2938 case TargetOpcode::G_FRAME_INDEX: { 2939 // allocas and G_FRAME_INDEX are only supported in addrspace(0). 2940 if (Ty != LLT::pointer(0, 64)) { 2941 LLVM_DEBUG(dbgs() << "G_FRAME_INDEX pointer has type: " << Ty 2942 << ", expected: " << LLT::pointer(0, 64) << '\n'); 2943 return false; 2944 } 2945 I.setDesc(TII.get(AArch64::ADDXri)); 2946 2947 // MOs for a #0 shifted immediate. 2948 I.addOperand(MachineOperand::CreateImm(0)); 2949 I.addOperand(MachineOperand::CreateImm(0)); 2950 2951 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2952 } 2953 2954 case TargetOpcode::G_GLOBAL_VALUE: { 2955 const GlobalValue *GV = nullptr; 2956 unsigned OpFlags; 2957 if (I.getOperand(1).isSymbol()) { 2958 OpFlags = I.getOperand(1).getTargetFlags(); 2959 // Currently only used by "RtLibUseGOT". 
2960 assert(OpFlags == AArch64II::MO_GOT); 2961 } else { 2962 GV = I.getOperand(1).getGlobal(); 2963 if (GV->isThreadLocal()) 2964 return selectTLSGlobalValue(I, MRI); 2965 OpFlags = STI.ClassifyGlobalReference(GV, TM); 2966 } 2967 2968 if (OpFlags & AArch64II::MO_GOT) { 2969 I.setDesc(TII.get(MF.getInfo<AArch64FunctionInfo>()->hasELFSignedGOT() 2970 ? AArch64::LOADgotAUTH 2971 : AArch64::LOADgot)); 2972 I.getOperand(1).setTargetFlags(OpFlags); 2973 } else if (TM.getCodeModel() == CodeModel::Large && 2974 !TM.isPositionIndependent()) { 2975 // Materialize the global using movz/movk instructions. 2976 materializeLargeCMVal(I, GV, OpFlags); 2977 I.eraseFromParent(); 2978 return true; 2979 } else if (TM.getCodeModel() == CodeModel::Tiny) { 2980 I.setDesc(TII.get(AArch64::ADR)); 2981 I.getOperand(1).setTargetFlags(OpFlags); 2982 } else { 2983 I.setDesc(TII.get(AArch64::MOVaddr)); 2984 I.getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_PAGE); 2985 MachineInstrBuilder MIB(MF, I); 2986 MIB.addGlobalAddress(GV, I.getOperand(1).getOffset(), 2987 OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 2988 } 2989 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2990 } 2991 2992 case TargetOpcode::G_PTRAUTH_GLOBAL_VALUE: 2993 return selectPtrAuthGlobalValue(I, MRI); 2994 2995 case TargetOpcode::G_ZEXTLOAD: 2996 case TargetOpcode::G_LOAD: 2997 case TargetOpcode::G_STORE: { 2998 GLoadStore &LdSt = cast<GLoadStore>(I); 2999 bool IsZExtLoad = I.getOpcode() == TargetOpcode::G_ZEXTLOAD; 3000 LLT PtrTy = MRI.getType(LdSt.getPointerReg()); 3001 3002 if (PtrTy != LLT::pointer(0, 64)) { 3003 LLVM_DEBUG(dbgs() << "Load/Store pointer has type: " << PtrTy 3004 << ", expected: " << LLT::pointer(0, 64) << '\n'); 3005 return false; 3006 } 3007 3008 uint64_t MemSizeInBytes = LdSt.getMemSize().getValue(); 3009 unsigned MemSizeInBits = LdSt.getMemSizeInBits().getValue(); 3010 AtomicOrdering Order = LdSt.getMMO().getSuccessOrdering(); 3011 3012 // Need special instructions for atomics that affect ordering. 3013 if (Order != AtomicOrdering::NotAtomic && 3014 Order != AtomicOrdering::Unordered && 3015 Order != AtomicOrdering::Monotonic) { 3016 assert(!isa<GZExtLoad>(LdSt)); 3017 assert(MemSizeInBytes <= 8 && 3018 "128-bit atomics should already be custom-legalized"); 3019 3020 if (isa<GLoad>(LdSt)) { 3021 static constexpr unsigned LDAPROpcodes[] = { 3022 AArch64::LDAPRB, AArch64::LDAPRH, AArch64::LDAPRW, AArch64::LDAPRX}; 3023 static constexpr unsigned LDAROpcodes[] = { 3024 AArch64::LDARB, AArch64::LDARH, AArch64::LDARW, AArch64::LDARX}; 3025 ArrayRef<unsigned> Opcodes = 3026 STI.hasRCPC() && Order != AtomicOrdering::SequentiallyConsistent 3027 ? LDAPROpcodes 3028 : LDAROpcodes; 3029 I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)])); 3030 } else { 3031 static constexpr unsigned Opcodes[] = {AArch64::STLRB, AArch64::STLRH, 3032 AArch64::STLRW, AArch64::STLRX}; 3033 Register ValReg = LdSt.getReg(0); 3034 if (MRI.getType(ValReg).getSizeInBits() == 64 && MemSizeInBits != 64) { 3035 // Emit a subreg copy of 32 bits. 
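// e.g. (sketch) a release or seq_cst store of the low 32 bits of a 64-bit
// value register %v becomes:
//   %w:gpr32 = COPY %v.sub_32
//   STLRW %w, %ptr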
3036 Register NewVal = MRI.createVirtualRegister(&AArch64::GPR32RegClass); 3037 MIB.buildInstr(TargetOpcode::COPY, {NewVal}, {}) 3038 .addReg(I.getOperand(0).getReg(), 0, AArch64::sub_32); 3039 I.getOperand(0).setReg(NewVal); 3040 } 3041 I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)])); 3042 } 3043 constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3044 return true; 3045 } 3046 3047 #ifndef NDEBUG 3048 const Register PtrReg = LdSt.getPointerReg(); 3049 const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI); 3050 // Check that the pointer register is valid. 3051 assert(PtrRB.getID() == AArch64::GPRRegBankID && 3052 "Load/Store pointer operand isn't a GPR"); 3053 assert(MRI.getType(PtrReg).isPointer() && 3054 "Load/Store pointer operand isn't a pointer"); 3055 #endif 3056 3057 const Register ValReg = LdSt.getReg(0); 3058 const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI); 3059 LLT ValTy = MRI.getType(ValReg); 3060 3061 // The code below doesn't support truncating stores, so we need to split it 3062 // again. 3063 if (isa<GStore>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) { 3064 unsigned SubReg; 3065 LLT MemTy = LdSt.getMMO().getMemoryType(); 3066 auto *RC = getRegClassForTypeOnBank(MemTy, RB); 3067 if (!getSubRegForClass(RC, TRI, SubReg)) 3068 return false; 3069 3070 // Generate a subreg copy. 3071 auto Copy = MIB.buildInstr(TargetOpcode::COPY, {MemTy}, {}) 3072 .addReg(ValReg, 0, SubReg) 3073 .getReg(0); 3074 RBI.constrainGenericRegister(Copy, *RC, MRI); 3075 LdSt.getOperand(0).setReg(Copy); 3076 } else if (isa<GLoad>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) { 3077 // If this is an any-extending load from the FPR bank, split it into a regular 3078 // load + extend. 3079 if (RB.getID() == AArch64::FPRRegBankID) { 3080 unsigned SubReg; 3081 LLT MemTy = LdSt.getMMO().getMemoryType(); 3082 auto *RC = getRegClassForTypeOnBank(MemTy, RB); 3083 if (!getSubRegForClass(RC, TRI, SubReg)) 3084 return false; 3085 Register OldDst = LdSt.getReg(0); 3086 Register NewDst = 3087 MRI.createGenericVirtualRegister(LdSt.getMMO().getMemoryType()); 3088 LdSt.getOperand(0).setReg(NewDst); 3089 MRI.setRegBank(NewDst, RB); 3090 // Generate a SUBREG_TO_REG to extend it. 3091 MIB.setInsertPt(MIB.getMBB(), std::next(LdSt.getIterator())); 3092 MIB.buildInstr(AArch64::SUBREG_TO_REG, {OldDst}, {}) 3093 .addImm(0) 3094 .addUse(NewDst) 3095 .addImm(SubReg); 3096 auto SubRegRC = getRegClassForTypeOnBank(MRI.getType(OldDst), RB); 3097 RBI.constrainGenericRegister(OldDst, *SubRegRC, MRI); 3098 MIB.setInstr(LdSt); 3099 ValTy = MemTy; // This is no longer an extending load. 3100 } 3101 } 3102 3103 // Helper lambda for partially selecting I. Either returns the original 3104 // instruction with an updated opcode, or a new instruction. 3105 auto SelectLoadStoreAddressingMode = [&]() -> MachineInstr * { 3106 bool IsStore = isa<GStore>(I); 3107 const unsigned NewOpc = 3108 selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits); 3109 if (NewOpc == I.getOpcode()) 3110 return nullptr; 3111 // Check if we can fold anything into the addressing mode. 3112 auto AddrModeFns = 3113 selectAddrModeIndexed(I.getOperand(1), MemSizeInBytes); 3114 if (!AddrModeFns) { 3115 // Can't fold anything. Use the original instruction. 3116 I.setDesc(TII.get(NewOpc)); 3117 I.addOperand(MachineOperand::CreateImm(0)); 3118 return &I; 3119 } 3120 3121 // Folded something. Create a new instruction and return it. 
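// e.g. (sketch) a 64-bit G_LOAD from (G_PTR_ADD %base, 16) can fold the
// offset and become LDRXui %base, 2 (a scaled unsigned-offset load).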
3122 auto NewInst = MIB.buildInstr(NewOpc, {}, {}, I.getFlags()); 3123 Register CurValReg = I.getOperand(0).getReg(); 3124 IsStore ? NewInst.addUse(CurValReg) : NewInst.addDef(CurValReg); 3125 NewInst.cloneMemRefs(I); 3126 for (auto &Fn : *AddrModeFns) 3127 Fn(NewInst); 3128 I.eraseFromParent(); 3129 return &*NewInst; 3130 }; 3131 3132 MachineInstr *LoadStore = SelectLoadStoreAddressingMode(); 3133 if (!LoadStore) 3134 return false; 3135 3136 // If we're storing a 0, use WZR/XZR. 3137 if (Opcode == TargetOpcode::G_STORE) { 3138 auto CVal = getIConstantVRegValWithLookThrough( 3139 LoadStore->getOperand(0).getReg(), MRI); 3140 if (CVal && CVal->Value == 0) { 3141 switch (LoadStore->getOpcode()) { 3142 case AArch64::STRWui: 3143 case AArch64::STRHHui: 3144 case AArch64::STRBBui: 3145 LoadStore->getOperand(0).setReg(AArch64::WZR); 3146 break; 3147 case AArch64::STRXui: 3148 LoadStore->getOperand(0).setReg(AArch64::XZR); 3149 break; 3150 } 3151 } 3152 } 3153 3154 if (IsZExtLoad || (Opcode == TargetOpcode::G_LOAD && 3155 ValTy == LLT::scalar(64) && MemSizeInBits == 32)) { 3156 // The any/zextload from a smaller type to i32 should be handled by the 3157 // importer. 3158 if (MRI.getType(LoadStore->getOperand(0).getReg()).getSizeInBits() != 64) 3159 return false; 3160 // If we have an extending load then change the load's type to be a 3161 // narrower reg and zero_extend with SUBREG_TO_REG. 3162 Register LdReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); 3163 Register DstReg = LoadStore->getOperand(0).getReg(); 3164 LoadStore->getOperand(0).setReg(LdReg); 3165 3166 MIB.setInsertPt(MIB.getMBB(), std::next(LoadStore->getIterator())); 3167 MIB.buildInstr(AArch64::SUBREG_TO_REG, {DstReg}, {}) 3168 .addImm(0) 3169 .addUse(LdReg) 3170 .addImm(AArch64::sub_32); 3171 constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI); 3172 return RBI.constrainGenericRegister(DstReg, AArch64::GPR64allRegClass, 3173 MRI); 3174 } 3175 return constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI); 3176 } 3177 3178 case TargetOpcode::G_INDEXED_ZEXTLOAD: 3179 case TargetOpcode::G_INDEXED_SEXTLOAD: 3180 return selectIndexedExtLoad(I, MRI); 3181 case TargetOpcode::G_INDEXED_LOAD: 3182 return selectIndexedLoad(I, MRI); 3183 case TargetOpcode::G_INDEXED_STORE: 3184 return selectIndexedStore(cast<GIndexedStore>(I), MRI); 3185 3186 case TargetOpcode::G_LSHR: 3187 case TargetOpcode::G_ASHR: 3188 if (MRI.getType(I.getOperand(0).getReg()).isVector()) 3189 return selectVectorAshrLshr(I, MRI); 3190 [[fallthrough]]; 3191 case TargetOpcode::G_SHL: 3192 if (Opcode == TargetOpcode::G_SHL && 3193 MRI.getType(I.getOperand(0).getReg()).isVector()) 3194 return selectVectorSHL(I, MRI); 3195 3196 // These shifts were legalized to have 64 bit shift amounts because we 3197 // want to take advantage of the selection patterns that assume the 3198 // immediates are s64s, however, selectBinaryOp will assume both operands 3199 // will have the same bit size. 
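// E.g. %dst(s32) = G_SHL %src(s32), %amt(s64): the block below copies the
// sub_32 of the 64-bit shift amount into a fresh 32-bit vreg so that
// selectBinaryOp sees two 32-bit operands.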
3200 { 3201 Register SrcReg = I.getOperand(1).getReg(); 3202 Register ShiftReg = I.getOperand(2).getReg(); 3203 const LLT ShiftTy = MRI.getType(ShiftReg); 3204 const LLT SrcTy = MRI.getType(SrcReg); 3205 if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 32 && 3206 ShiftTy.getSizeInBits() == 64) { 3207 assert(!ShiftTy.isVector() && "unexpected vector shift ty"); 3208 // Insert a subregister copy to implement a 64->32 trunc 3209 auto Trunc = MIB.buildInstr(TargetOpcode::COPY, {SrcTy}, {}) 3210 .addReg(ShiftReg, 0, AArch64::sub_32); 3211 MRI.setRegBank(Trunc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID)); 3212 I.getOperand(2).setReg(Trunc.getReg(0)); 3213 } 3214 } 3215 [[fallthrough]]; 3216 case TargetOpcode::G_OR: { 3217 // Reject the various things we don't support yet. 3218 if (unsupportedBinOp(I, RBI, MRI, TRI)) 3219 return false; 3220 3221 const unsigned OpSize = Ty.getSizeInBits(); 3222 3223 const Register DefReg = I.getOperand(0).getReg(); 3224 const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); 3225 3226 const unsigned NewOpc = selectBinaryOp(I.getOpcode(), RB.getID(), OpSize); 3227 if (NewOpc == I.getOpcode()) 3228 return false; 3229 3230 I.setDesc(TII.get(NewOpc)); 3231 // FIXME: Should the type be always reset in setDesc? 3232 3233 // Now that we selected an opcode, we need to constrain the register 3234 // operands to use appropriate classes. 3235 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3236 } 3237 3238 case TargetOpcode::G_PTR_ADD: { 3239 emitADD(I.getOperand(0).getReg(), I.getOperand(1), I.getOperand(2), MIB); 3240 I.eraseFromParent(); 3241 return true; 3242 } 3243 3244 case TargetOpcode::G_SADDE: 3245 case TargetOpcode::G_UADDE: 3246 case TargetOpcode::G_SSUBE: 3247 case TargetOpcode::G_USUBE: 3248 case TargetOpcode::G_SADDO: 3249 case TargetOpcode::G_UADDO: 3250 case TargetOpcode::G_SSUBO: 3251 case TargetOpcode::G_USUBO: 3252 return selectOverflowOp(I, MRI); 3253 3254 case TargetOpcode::G_PTRMASK: { 3255 Register MaskReg = I.getOperand(2).getReg(); 3256 std::optional<int64_t> MaskVal = getIConstantVRegSExtVal(MaskReg, MRI); 3257 // TODO: Implement arbitrary cases 3258 if (!MaskVal || !isShiftedMask_64(*MaskVal)) 3259 return false; 3260 3261 uint64_t Mask = *MaskVal; 3262 I.setDesc(TII.get(AArch64::ANDXri)); 3263 I.getOperand(2).ChangeToImmediate( 3264 AArch64_AM::encodeLogicalImmediate(Mask, 64)); 3265 3266 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3267 } 3268 case TargetOpcode::G_PTRTOINT: 3269 case TargetOpcode::G_TRUNC: { 3270 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 3271 const LLT SrcTy = MRI.getType(I.getOperand(1).getReg()); 3272 3273 const Register DstReg = I.getOperand(0).getReg(); 3274 const Register SrcReg = I.getOperand(1).getReg(); 3275 3276 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); 3277 const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI); 3278 3279 if (DstRB.getID() != SrcRB.getID()) { 3280 LLVM_DEBUG( 3281 dbgs() << "G_TRUNC/G_PTRTOINT input/output on different banks\n"); 3282 return false; 3283 } 3284 3285 if (DstRB.getID() == AArch64::GPRRegBankID) { 3286 const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB); 3287 if (!DstRC) 3288 return false; 3289 3290 const TargetRegisterClass *SrcRC = getRegClassForTypeOnBank(SrcTy, SrcRB); 3291 if (!SrcRC) 3292 return false; 3293 3294 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) || 3295 !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) { 3296 LLVM_DEBUG(dbgs() << "Failed to constrain 
G_TRUNC/G_PTRTOINT\n"); 3297 return false; 3298 } 3299 3300 if (DstRC == SrcRC) { 3301 // Nothing to be done 3302 } else if (Opcode == TargetOpcode::G_TRUNC && DstTy == LLT::scalar(32) && 3303 SrcTy == LLT::scalar(64)) { 3304 llvm_unreachable("TableGen can import this case"); 3305 return false; 3306 } else if (DstRC == &AArch64::GPR32RegClass && 3307 SrcRC == &AArch64::GPR64RegClass) { 3308 I.getOperand(1).setSubReg(AArch64::sub_32); 3309 } else { 3310 LLVM_DEBUG( 3311 dbgs() << "Unhandled mismatched classes in G_TRUNC/G_PTRTOINT\n"); 3312 return false; 3313 } 3314 3315 I.setDesc(TII.get(TargetOpcode::COPY)); 3316 return true; 3317 } else if (DstRB.getID() == AArch64::FPRRegBankID) { 3318 if (DstTy == LLT::fixed_vector(4, 16) && 3319 SrcTy == LLT::fixed_vector(4, 32)) { 3320 I.setDesc(TII.get(AArch64::XTNv4i16)); 3321 constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3322 return true; 3323 } 3324 3325 if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) { 3326 MachineInstr *Extract = emitExtractVectorElt( 3327 DstReg, DstRB, LLT::scalar(DstTy.getSizeInBits()), SrcReg, 0, MIB); 3328 if (!Extract) 3329 return false; 3330 I.eraseFromParent(); 3331 return true; 3332 } 3333 3334 // We might have a vector G_PTRTOINT, in which case just emit a COPY. 3335 if (Opcode == TargetOpcode::G_PTRTOINT) { 3336 assert(DstTy.isVector() && "Expected an FPR ptrtoint to be a vector"); 3337 I.setDesc(TII.get(TargetOpcode::COPY)); 3338 return selectCopy(I, TII, MRI, TRI, RBI); 3339 } 3340 } 3341 3342 return false; 3343 } 3344 3345 case TargetOpcode::G_ANYEXT: { 3346 if (selectUSMovFromExtend(I, MRI)) 3347 return true; 3348 3349 const Register DstReg = I.getOperand(0).getReg(); 3350 const Register SrcReg = I.getOperand(1).getReg(); 3351 3352 const RegisterBank &RBDst = *RBI.getRegBank(DstReg, MRI, TRI); 3353 if (RBDst.getID() != AArch64::GPRRegBankID) { 3354 LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBDst 3355 << ", expected: GPR\n"); 3356 return false; 3357 } 3358 3359 const RegisterBank &RBSrc = *RBI.getRegBank(SrcReg, MRI, TRI); 3360 if (RBSrc.getID() != AArch64::GPRRegBankID) { 3361 LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBSrc 3362 << ", expected: GPR\n"); 3363 return false; 3364 } 3365 3366 const unsigned DstSize = MRI.getType(DstReg).getSizeInBits(); 3367 3368 if (DstSize == 0) { 3369 LLVM_DEBUG(dbgs() << "G_ANYEXT operand has no size, not a gvreg?\n"); 3370 return false; 3371 } 3372 3373 if (DstSize != 64 && DstSize > 32) { 3374 LLVM_DEBUG(dbgs() << "G_ANYEXT to size: " << DstSize 3375 << ", expected: 32 or 64\n"); 3376 return false; 3377 } 3378 // At this point G_ANYEXT is just like a plain COPY, but we need 3379 // to explicitly form the 64-bit value if any. 
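// E.g. an s32 -> s64 G_ANYEXT wraps the source in SUBREG_TO_REG so the COPY
// selected below produces a full X register; the upper 32 bits are left
// undefined, which is acceptable for an any-extend.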
3380 if (DstSize > 32) { 3381 Register ExtSrc = MRI.createVirtualRegister(&AArch64::GPR64allRegClass); 3382 BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG)) 3383 .addDef(ExtSrc) 3384 .addImm(0) 3385 .addUse(SrcReg) 3386 .addImm(AArch64::sub_32); 3387 I.getOperand(1).setReg(ExtSrc); 3388 } 3389 return selectCopy(I, TII, MRI, TRI, RBI); 3390 } 3391 3392 case TargetOpcode::G_ZEXT: 3393 case TargetOpcode::G_SEXT_INREG: 3394 case TargetOpcode::G_SEXT: { 3395 if (selectUSMovFromExtend(I, MRI)) 3396 return true; 3397 3398 unsigned Opcode = I.getOpcode(); 3399 const bool IsSigned = Opcode != TargetOpcode::G_ZEXT; 3400 const Register DefReg = I.getOperand(0).getReg(); 3401 Register SrcReg = I.getOperand(1).getReg(); 3402 const LLT DstTy = MRI.getType(DefReg); 3403 const LLT SrcTy = MRI.getType(SrcReg); 3404 unsigned DstSize = DstTy.getSizeInBits(); 3405 unsigned SrcSize = SrcTy.getSizeInBits(); 3406 3407 // SEXT_INREG has the same src reg size as dst, the size of the value to be 3408 // extended is encoded in the imm. 3409 if (Opcode == TargetOpcode::G_SEXT_INREG) 3410 SrcSize = I.getOperand(2).getImm(); 3411 3412 if (DstTy.isVector()) 3413 return false; // Should be handled by imported patterns. 3414 3415 assert((*RBI.getRegBank(DefReg, MRI, TRI)).getID() == 3416 AArch64::GPRRegBankID && 3417 "Unexpected ext regbank"); 3418 3419 MachineInstr *ExtI; 3420 3421 // First check if we're extending the result of a load which has a dest type 3422 // smaller than 32 bits, then this zext is redundant. GPR32 is the smallest 3423 // GPR register on AArch64 and all loads which are smaller automatically 3424 // zero-extend the upper bits. E.g. 3425 // %v(s8) = G_LOAD %p, :: (load 1) 3426 // %v2(s32) = G_ZEXT %v(s8) 3427 if (!IsSigned) { 3428 auto *LoadMI = getOpcodeDef(TargetOpcode::G_LOAD, SrcReg, MRI); 3429 bool IsGPR = 3430 RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID; 3431 if (LoadMI && IsGPR) { 3432 const MachineMemOperand *MemOp = *LoadMI->memoperands_begin(); 3433 unsigned BytesLoaded = MemOp->getSize().getValue(); 3434 if (BytesLoaded < 4 && SrcTy.getSizeInBytes() == BytesLoaded) 3435 return selectCopy(I, TII, MRI, TRI, RBI); 3436 } 3437 3438 // For the 32-bit -> 64-bit case, we can emit a mov (ORRWrs) 3439 // + SUBREG_TO_REG. 3440 if (IsGPR && SrcSize == 32 && DstSize == 64) { 3441 Register SubregToRegSrc = 3442 MRI.createVirtualRegister(&AArch64::GPR32RegClass); 3443 const Register ZReg = AArch64::WZR; 3444 MIB.buildInstr(AArch64::ORRWrs, {SubregToRegSrc}, {ZReg, SrcReg}) 3445 .addImm(0); 3446 3447 MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {}) 3448 .addImm(0) 3449 .addUse(SubregToRegSrc) 3450 .addImm(AArch64::sub_32); 3451 3452 if (!RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, 3453 MRI)) { 3454 LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT destination\n"); 3455 return false; 3456 } 3457 3458 if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass, 3459 MRI)) { 3460 LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT source\n"); 3461 return false; 3462 } 3463 3464 I.eraseFromParent(); 3465 return true; 3466 } 3467 } 3468 3469 if (DstSize == 64) { 3470 if (Opcode != TargetOpcode::G_SEXT_INREG) { 3471 // FIXME: Can we avoid manually doing this? 
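// The 32-bit source is constrained to GPR32 and widened to an X register
// with SUBREG_TO_REG, and the extend itself is then a single bitfield move,
// e.g. an s32 -> s64 G_SEXT becomes SBFMXri %dst, %widened, 0, 31.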
3472 if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass, 3473 MRI)) { 3474 LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode) 3475 << " operand\n"); 3476 return false; 3477 } 3478 SrcReg = MIB.buildInstr(AArch64::SUBREG_TO_REG, 3479 {&AArch64::GPR64RegClass}, {}) 3480 .addImm(0) 3481 .addUse(SrcReg) 3482 .addImm(AArch64::sub_32) 3483 .getReg(0); 3484 } 3485 3486 ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMXri : AArch64::UBFMXri, 3487 {DefReg}, {SrcReg}) 3488 .addImm(0) 3489 .addImm(SrcSize - 1); 3490 } else if (DstSize <= 32) { 3491 ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMWri : AArch64::UBFMWri, 3492 {DefReg}, {SrcReg}) 3493 .addImm(0) 3494 .addImm(SrcSize - 1); 3495 } else { 3496 return false; 3497 } 3498 3499 constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 3500 I.eraseFromParent(); 3501 return true; 3502 } 3503 3504 case TargetOpcode::G_SITOFP: 3505 case TargetOpcode::G_UITOFP: 3506 case TargetOpcode::G_FPTOSI: 3507 case TargetOpcode::G_FPTOUI: { 3508 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()), 3509 SrcTy = MRI.getType(I.getOperand(1).getReg()); 3510 const unsigned NewOpc = selectFPConvOpc(Opcode, DstTy, SrcTy); 3511 if (NewOpc == Opcode) 3512 return false; 3513 3514 I.setDesc(TII.get(NewOpc)); 3515 constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3516 I.setFlags(MachineInstr::NoFPExcept); 3517 3518 return true; 3519 } 3520 3521 case TargetOpcode::G_FREEZE: 3522 return selectCopy(I, TII, MRI, TRI, RBI); 3523 3524 case TargetOpcode::G_INTTOPTR: 3525 // The importer is currently unable to import pointer types since they 3526 // didn't exist in SelectionDAG. 3527 return selectCopy(I, TII, MRI, TRI, RBI); 3528 3529 case TargetOpcode::G_BITCAST: 3530 // Imported SelectionDAG rules can handle every bitcast except those that 3531 // bitcast from a type to the same type. Ideally, these shouldn't occur 3532 // but we might not run an optimizer that deletes them. The other exception 3533 // is bitcasts involving pointer types, as SelectionDAG has no knowledge 3534 // of them. 3535 return selectCopy(I, TII, MRI, TRI, RBI); 3536 3537 case TargetOpcode::G_SELECT: { 3538 auto &Sel = cast<GSelect>(I); 3539 const Register CondReg = Sel.getCondReg(); 3540 const Register TReg = Sel.getTrueReg(); 3541 const Register FReg = Sel.getFalseReg(); 3542 3543 if (tryOptSelect(Sel)) 3544 return true; 3545 3546 // Make sure to use an unused vreg instead of wzr, so that the peephole 3547 // optimizations will be able to optimize these. 
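// Test bit 0 of the condition with an ANDS against the logical immediate
// 0x1 so NZCV is set, then let emitSelect below pick the conditional-select
// instruction predicated on NE.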
3548 Register DeadVReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); 3549 auto TstMI = MIB.buildInstr(AArch64::ANDSWri, {DeadVReg}, {CondReg}) 3550 .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); 3551 constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI); 3552 if (!emitSelect(Sel.getReg(0), TReg, FReg, AArch64CC::NE, MIB)) 3553 return false; 3554 Sel.eraseFromParent(); 3555 return true; 3556 } 3557 case TargetOpcode::G_ICMP: { 3558 if (Ty.isVector()) 3559 return false; 3560 3561 if (Ty != LLT::scalar(32)) { 3562 LLVM_DEBUG(dbgs() << "G_ICMP result has type: " << Ty 3563 << ", expected: " << LLT::scalar(32) << '\n'); 3564 return false; 3565 } 3566 3567 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate()); 3568 const AArch64CC::CondCode InvCC = 3569 changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred)); 3570 emitIntegerCompare(I.getOperand(2), I.getOperand(3), I.getOperand(1), MIB); 3571 emitCSINC(/*Dst=*/I.getOperand(0).getReg(), /*Src1=*/AArch64::WZR, 3572 /*Src2=*/AArch64::WZR, InvCC, MIB); 3573 I.eraseFromParent(); 3574 return true; 3575 } 3576 3577 case TargetOpcode::G_FCMP: { 3578 CmpInst::Predicate Pred = 3579 static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate()); 3580 if (!emitFPCompare(I.getOperand(2).getReg(), I.getOperand(3).getReg(), MIB, 3581 Pred) || 3582 !emitCSetForFCmp(I.getOperand(0).getReg(), Pred, MIB)) 3583 return false; 3584 I.eraseFromParent(); 3585 return true; 3586 } 3587 case TargetOpcode::G_VASTART: 3588 return STI.isTargetDarwin() ? selectVaStartDarwin(I, MF, MRI) 3589 : selectVaStartAAPCS(I, MF, MRI); 3590 case TargetOpcode::G_INTRINSIC: 3591 return selectIntrinsic(I, MRI); 3592 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: 3593 return selectIntrinsicWithSideEffects(I, MRI); 3594 case TargetOpcode::G_IMPLICIT_DEF: { 3595 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF)); 3596 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 3597 const Register DstReg = I.getOperand(0).getReg(); 3598 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); 3599 const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB); 3600 RBI.constrainGenericRegister(DstReg, *DstRC, MRI); 3601 return true; 3602 } 3603 case TargetOpcode::G_BLOCK_ADDR: { 3604 Function *BAFn = I.getOperand(1).getBlockAddress()->getFunction(); 3605 if (std::optional<uint16_t> BADisc = 3606 STI.getPtrAuthBlockAddressDiscriminatorIfEnabled(*BAFn)) { 3607 MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X16}, {}); 3608 MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X17}, {}); 3609 MIB.buildInstr(AArch64::MOVaddrPAC) 3610 .addBlockAddress(I.getOperand(1).getBlockAddress()) 3611 .addImm(AArch64PACKey::IA) 3612 .addReg(/*AddrDisc=*/AArch64::XZR) 3613 .addImm(*BADisc) 3614 .constrainAllUses(TII, TRI, RBI); 3615 MIB.buildCopy(I.getOperand(0).getReg(), Register(AArch64::X16)); 3616 RBI.constrainGenericRegister(I.getOperand(0).getReg(), 3617 AArch64::GPR64RegClass, MRI); 3618 I.eraseFromParent(); 3619 return true; 3620 } 3621 if (TM.getCodeModel() == CodeModel::Large && !TM.isPositionIndependent()) { 3622 materializeLargeCMVal(I, I.getOperand(1).getBlockAddress(), 0); 3623 I.eraseFromParent(); 3624 return true; 3625 } else { 3626 I.setDesc(TII.get(AArch64::MOVaddrBA)); 3627 auto MovMI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::MOVaddrBA), 3628 I.getOperand(0).getReg()) 3629 .addBlockAddress(I.getOperand(1).getBlockAddress(), 3630 /* Offset */ 0, AArch64II::MO_PAGE) 3631 .addBlockAddress( 3632 
I.getOperand(1).getBlockAddress(), /* Offset */ 0, 3633 AArch64II::MO_NC | AArch64II::MO_PAGEOFF); 3634 I.eraseFromParent(); 3635 return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI); 3636 } 3637 } 3638 case AArch64::G_DUP: { 3639 // When the scalar of G_DUP is an s8/s16 gpr, they can't be selected by 3640 // imported patterns. Do it manually here. Avoiding generating s16 gpr is 3641 // difficult because at RBS we may end up pessimizing the fpr case if we 3642 // decided to add an anyextend to fix this. Manual selection is the most 3643 // robust solution for now. 3644 if (RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() != 3645 AArch64::GPRRegBankID) 3646 return false; // We expect the fpr regbank case to be imported. 3647 LLT VecTy = MRI.getType(I.getOperand(0).getReg()); 3648 if (VecTy == LLT::fixed_vector(8, 8)) 3649 I.setDesc(TII.get(AArch64::DUPv8i8gpr)); 3650 else if (VecTy == LLT::fixed_vector(16, 8)) 3651 I.setDesc(TII.get(AArch64::DUPv16i8gpr)); 3652 else if (VecTy == LLT::fixed_vector(4, 16)) 3653 I.setDesc(TII.get(AArch64::DUPv4i16gpr)); 3654 else if (VecTy == LLT::fixed_vector(8, 16)) 3655 I.setDesc(TII.get(AArch64::DUPv8i16gpr)); 3656 else 3657 return false; 3658 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 3659 } 3660 case TargetOpcode::G_BUILD_VECTOR: 3661 return selectBuildVector(I, MRI); 3662 case TargetOpcode::G_MERGE_VALUES: 3663 return selectMergeValues(I, MRI); 3664 case TargetOpcode::G_UNMERGE_VALUES: 3665 return selectUnmergeValues(I, MRI); 3666 case TargetOpcode::G_SHUFFLE_VECTOR: 3667 return selectShuffleVector(I, MRI); 3668 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 3669 return selectExtractElt(I, MRI); 3670 case TargetOpcode::G_CONCAT_VECTORS: 3671 return selectConcatVectors(I, MRI); 3672 case TargetOpcode::G_JUMP_TABLE: 3673 return selectJumpTable(I, MRI); 3674 case TargetOpcode::G_MEMCPY: 3675 case TargetOpcode::G_MEMCPY_INLINE: 3676 case TargetOpcode::G_MEMMOVE: 3677 case TargetOpcode::G_MEMSET: 3678 assert(STI.hasMOPS() && "Shouldn't get here without +mops feature"); 3679 return selectMOPS(I, MRI); 3680 } 3681 3682 return false; 3683 } 3684 3685 bool AArch64InstructionSelector::selectAndRestoreState(MachineInstr &I) { 3686 MachineIRBuilderState OldMIBState = MIB.getState(); 3687 bool Success = select(I); 3688 MIB.setState(OldMIBState); 3689 return Success; 3690 } 3691 3692 bool AArch64InstructionSelector::selectMOPS(MachineInstr &GI, 3693 MachineRegisterInfo &MRI) { 3694 unsigned Mopcode; 3695 switch (GI.getOpcode()) { 3696 case TargetOpcode::G_MEMCPY: 3697 case TargetOpcode::G_MEMCPY_INLINE: 3698 Mopcode = AArch64::MOPSMemoryCopyPseudo; 3699 break; 3700 case TargetOpcode::G_MEMMOVE: 3701 Mopcode = AArch64::MOPSMemoryMovePseudo; 3702 break; 3703 case TargetOpcode::G_MEMSET: 3704 // For tagged memset see llvm.aarch64.mops.memset.tag 3705 Mopcode = AArch64::MOPSMemorySetPseudo; 3706 break; 3707 } 3708 3709 auto &DstPtr = GI.getOperand(0); 3710 auto &SrcOrVal = GI.getOperand(1); 3711 auto &Size = GI.getOperand(2); 3712 3713 // Create copies of the registers that can be clobbered. 3714 const Register DstPtrCopy = MRI.cloneVirtualRegister(DstPtr.getReg()); 3715 const Register SrcValCopy = MRI.cloneVirtualRegister(SrcOrVal.getReg()); 3716 const Register SizeCopy = MRI.cloneVirtualRegister(Size.getReg()); 3717 3718 const bool IsSet = Mopcode == AArch64::MOPSMemorySetPseudo; 3719 const auto &SrcValRegClass = 3720 IsSet ? 
AArch64::GPR64RegClass : AArch64::GPR64commonRegClass; 3721 3722 // Constrain to specific registers 3723 RBI.constrainGenericRegister(DstPtrCopy, AArch64::GPR64commonRegClass, MRI); 3724 RBI.constrainGenericRegister(SrcValCopy, SrcValRegClass, MRI); 3725 RBI.constrainGenericRegister(SizeCopy, AArch64::GPR64RegClass, MRI); 3726 3727 MIB.buildCopy(DstPtrCopy, DstPtr); 3728 MIB.buildCopy(SrcValCopy, SrcOrVal); 3729 MIB.buildCopy(SizeCopy, Size); 3730 3731 // New instruction uses the copied registers because it must update them. 3732 // The defs are not used since they don't exist in G_MEM*. They are still 3733 // tied. 3734 // Note: order of operands is different from G_MEMSET, G_MEMCPY, G_MEMMOVE 3735 Register DefDstPtr = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass); 3736 Register DefSize = MRI.createVirtualRegister(&AArch64::GPR64RegClass); 3737 if (IsSet) { 3738 MIB.buildInstr(Mopcode, {DefDstPtr, DefSize}, 3739 {DstPtrCopy, SizeCopy, SrcValCopy}); 3740 } else { 3741 Register DefSrcPtr = MRI.createVirtualRegister(&SrcValRegClass); 3742 MIB.buildInstr(Mopcode, {DefDstPtr, DefSrcPtr, DefSize}, 3743 {DstPtrCopy, SrcValCopy, SizeCopy}); 3744 } 3745 3746 GI.eraseFromParent(); 3747 return true; 3748 } 3749 3750 bool AArch64InstructionSelector::selectBrJT(MachineInstr &I, 3751 MachineRegisterInfo &MRI) { 3752 assert(I.getOpcode() == TargetOpcode::G_BRJT && "Expected G_BRJT"); 3753 Register JTAddr = I.getOperand(0).getReg(); 3754 unsigned JTI = I.getOperand(1).getIndex(); 3755 Register Index = I.getOperand(2).getReg(); 3756 3757 MF->getInfo<AArch64FunctionInfo>()->setJumpTableEntryInfo(JTI, 4, nullptr); 3758 3759 // With aarch64-jump-table-hardening, we only expand the jump table dispatch 3760 // sequence later, to guarantee the integrity of the intermediate values. 3761 if (MF->getFunction().hasFnAttribute("aarch64-jump-table-hardening")) { 3762 CodeModel::Model CM = TM.getCodeModel(); 3763 if (STI.isTargetMachO()) { 3764 if (CM != CodeModel::Small && CM != CodeModel::Large) 3765 report_fatal_error("Unsupported code-model for hardened jump-table"); 3766 } else { 3767 // Note that COFF support would likely also need JUMP_TABLE_DEBUG_INFO. 3768 assert(STI.isTargetELF() && 3769 "jump table hardening only supported on MachO/ELF"); 3770 if (CM != CodeModel::Small) 3771 report_fatal_error("Unsupported code-model for hardened jump-table"); 3772 } 3773 3774 MIB.buildCopy({AArch64::X16}, I.getOperand(2).getReg()); 3775 MIB.buildInstr(AArch64::BR_JumpTable) 3776 .addJumpTableIndex(I.getOperand(1).getIndex()); 3777 I.eraseFromParent(); 3778 return true; 3779 } 3780 3781 Register TargetReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); 3782 Register ScratchReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass); 3783 3784 auto JumpTableInst = MIB.buildInstr(AArch64::JumpTableDest32, 3785 {TargetReg, ScratchReg}, {JTAddr, Index}) 3786 .addJumpTableIndex(JTI); 3787 // Save the jump table info. 3788 MIB.buildInstr(TargetOpcode::JUMP_TABLE_DEBUG_INFO, {}, 3789 {static_cast<int64_t>(JTI)}); 3790 // Build the indirect branch. 
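// BR jumps to the destination that the JumpTableDest32 pseudo above
// computed into TargetReg.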
3791 MIB.buildInstr(AArch64::BR, {}, {TargetReg}); 3792 I.eraseFromParent(); 3793 return constrainSelectedInstRegOperands(*JumpTableInst, TII, TRI, RBI); 3794 } 3795 3796 bool AArch64InstructionSelector::selectJumpTable(MachineInstr &I, 3797 MachineRegisterInfo &MRI) { 3798 assert(I.getOpcode() == TargetOpcode::G_JUMP_TABLE && "Expected jump table"); 3799 assert(I.getOperand(1).isJTI() && "Jump table op should have a JTI!"); 3800 3801 Register DstReg = I.getOperand(0).getReg(); 3802 unsigned JTI = I.getOperand(1).getIndex(); 3803 // We generate a MOVaddrJT which will get expanded to an ADRP + ADD later. 3804 auto MovMI = 3805 MIB.buildInstr(AArch64::MOVaddrJT, {DstReg}, {}) 3806 .addJumpTableIndex(JTI, AArch64II::MO_PAGE) 3807 .addJumpTableIndex(JTI, AArch64II::MO_NC | AArch64II::MO_PAGEOFF); 3808 I.eraseFromParent(); 3809 return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI); 3810 } 3811 3812 bool AArch64InstructionSelector::selectTLSGlobalValue( 3813 MachineInstr &I, MachineRegisterInfo &MRI) { 3814 if (!STI.isTargetMachO()) 3815 return false; 3816 MachineFunction &MF = *I.getParent()->getParent(); 3817 MF.getFrameInfo().setAdjustsStack(true); 3818 3819 const auto &GlobalOp = I.getOperand(1); 3820 assert(GlobalOp.getOffset() == 0 && 3821 "Shouldn't have an offset on TLS globals!"); 3822 const GlobalValue &GV = *GlobalOp.getGlobal(); 3823 3824 auto LoadGOT = 3825 MIB.buildInstr(AArch64::LOADgot, {&AArch64::GPR64commonRegClass}, {}) 3826 .addGlobalAddress(&GV, 0, AArch64II::MO_TLS); 3827 3828 auto Load = MIB.buildInstr(AArch64::LDRXui, {&AArch64::GPR64commonRegClass}, 3829 {LoadGOT.getReg(0)}) 3830 .addImm(0); 3831 3832 MIB.buildCopy(Register(AArch64::X0), LoadGOT.getReg(0)); 3833 // TLS calls preserve all registers except those that absolutely must be 3834 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be 3835 // silly). 3836 unsigned Opcode = getBLRCallOpcode(MF); 3837 3838 // With ptrauth-calls, the tlv access thunk pointer is authenticated (IA, 0). 
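// In that case the call below uses BLRAAZ, i.e. branch-with-link that
// authenticates the target with key IA and a zero discriminator, instead of
// a plain BLR.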
3839 if (MF.getFunction().hasFnAttribute("ptrauth-calls")) { 3840 assert(Opcode == AArch64::BLR); 3841 Opcode = AArch64::BLRAAZ; 3842 } 3843 3844 MIB.buildInstr(Opcode, {}, {Load}) 3845 .addUse(AArch64::X0, RegState::Implicit) 3846 .addDef(AArch64::X0, RegState::Implicit) 3847 .addRegMask(TRI.getTLSCallPreservedMask()); 3848 3849 MIB.buildCopy(I.getOperand(0).getReg(), Register(AArch64::X0)); 3850 RBI.constrainGenericRegister(I.getOperand(0).getReg(), AArch64::GPR64RegClass, 3851 MRI); 3852 I.eraseFromParent(); 3853 return true; 3854 } 3855 3856 MachineInstr *AArch64InstructionSelector::emitScalarToVector( 3857 unsigned EltSize, const TargetRegisterClass *DstRC, Register Scalar, 3858 MachineIRBuilder &MIRBuilder) const { 3859 auto Undef = MIRBuilder.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstRC}, {}); 3860 3861 auto BuildFn = [&](unsigned SubregIndex) { 3862 auto Ins = 3863 MIRBuilder 3864 .buildInstr(TargetOpcode::INSERT_SUBREG, {DstRC}, {Undef, Scalar}) 3865 .addImm(SubregIndex); 3866 constrainSelectedInstRegOperands(*Undef, TII, TRI, RBI); 3867 constrainSelectedInstRegOperands(*Ins, TII, TRI, RBI); 3868 return &*Ins; 3869 }; 3870 3871 switch (EltSize) { 3872 case 8: 3873 return BuildFn(AArch64::bsub); 3874 case 16: 3875 return BuildFn(AArch64::hsub); 3876 case 32: 3877 return BuildFn(AArch64::ssub); 3878 case 64: 3879 return BuildFn(AArch64::dsub); 3880 default: 3881 return nullptr; 3882 } 3883 } 3884 3885 MachineInstr * 3886 AArch64InstructionSelector::emitNarrowVector(Register DstReg, Register SrcReg, 3887 MachineIRBuilder &MIB, 3888 MachineRegisterInfo &MRI) const { 3889 LLT DstTy = MRI.getType(DstReg); 3890 const TargetRegisterClass *RC = 3891 getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(SrcReg, MRI, TRI)); 3892 if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) { 3893 LLVM_DEBUG(dbgs() << "Unsupported register class!\n"); 3894 return nullptr; 3895 } 3896 unsigned SubReg = 0; 3897 if (!getSubRegForClass(RC, TRI, SubReg)) 3898 return nullptr; 3899 if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) { 3900 LLVM_DEBUG(dbgs() << "Unsupported destination size! (" 3901 << DstTy.getSizeInBits() << "\n"); 3902 return nullptr; 3903 } 3904 auto Copy = MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}) 3905 .addReg(SrcReg, 0, SubReg); 3906 RBI.constrainGenericRegister(DstReg, *RC, MRI); 3907 return Copy; 3908 } 3909 3910 bool AArch64InstructionSelector::selectMergeValues( 3911 MachineInstr &I, MachineRegisterInfo &MRI) { 3912 assert(I.getOpcode() == TargetOpcode::G_MERGE_VALUES && "unexpected opcode"); 3913 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 3914 const LLT SrcTy = MRI.getType(I.getOperand(1).getReg()); 3915 assert(!DstTy.isVector() && !SrcTy.isVector() && "invalid merge operation"); 3916 const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI); 3917 3918 if (I.getNumOperands() != 3) 3919 return false; 3920 3921 // Merging 2 s64s into an s128. 
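// E.g. %r(s128) = G_MERGE_VALUES %lo(s64), %hi(s64) is built from an
// IMPLICIT_DEF by inserting %lo into lane 0 and %hi into lane 1 of the
// 128-bit register via emitLaneInsert.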
3922 if (DstTy == LLT::scalar(128)) { 3923 if (SrcTy.getSizeInBits() != 64) 3924 return false; 3925 Register DstReg = I.getOperand(0).getReg(); 3926 Register Src1Reg = I.getOperand(1).getReg(); 3927 Register Src2Reg = I.getOperand(2).getReg(); 3928 auto Tmp = MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstTy}, {}); 3929 MachineInstr *InsMI = emitLaneInsert(std::nullopt, Tmp.getReg(0), Src1Reg, 3930 /* LaneIdx */ 0, RB, MIB); 3931 if (!InsMI) 3932 return false; 3933 MachineInstr *Ins2MI = emitLaneInsert(DstReg, InsMI->getOperand(0).getReg(), 3934 Src2Reg, /* LaneIdx */ 1, RB, MIB); 3935 if (!Ins2MI) 3936 return false; 3937 constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI); 3938 constrainSelectedInstRegOperands(*Ins2MI, TII, TRI, RBI); 3939 I.eraseFromParent(); 3940 return true; 3941 } 3942 3943 if (RB.getID() != AArch64::GPRRegBankID) 3944 return false; 3945 3946 if (DstTy.getSizeInBits() != 64 || SrcTy.getSizeInBits() != 32) 3947 return false; 3948 3949 auto *DstRC = &AArch64::GPR64RegClass; 3950 Register SubToRegDef = MRI.createVirtualRegister(DstRC); 3951 MachineInstr &SubRegMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(), 3952 TII.get(TargetOpcode::SUBREG_TO_REG)) 3953 .addDef(SubToRegDef) 3954 .addImm(0) 3955 .addUse(I.getOperand(1).getReg()) 3956 .addImm(AArch64::sub_32); 3957 Register SubToRegDef2 = MRI.createVirtualRegister(DstRC); 3958 // Need to anyext the second scalar before we can use bfm 3959 MachineInstr &SubRegMI2 = *BuildMI(*I.getParent(), I, I.getDebugLoc(), 3960 TII.get(TargetOpcode::SUBREG_TO_REG)) 3961 .addDef(SubToRegDef2) 3962 .addImm(0) 3963 .addUse(I.getOperand(2).getReg()) 3964 .addImm(AArch64::sub_32); 3965 MachineInstr &BFM = 3966 *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::BFMXri)) 3967 .addDef(I.getOperand(0).getReg()) 3968 .addUse(SubToRegDef) 3969 .addUse(SubToRegDef2) 3970 .addImm(32) 3971 .addImm(31); 3972 constrainSelectedInstRegOperands(SubRegMI, TII, TRI, RBI); 3973 constrainSelectedInstRegOperands(SubRegMI2, TII, TRI, RBI); 3974 constrainSelectedInstRegOperands(BFM, TII, TRI, RBI); 3975 I.eraseFromParent(); 3976 return true; 3977 } 3978 3979 static bool getLaneCopyOpcode(unsigned &CopyOpc, unsigned &ExtractSubReg, 3980 const unsigned EltSize) { 3981 // Choose a lane copy opcode and subregister based off of the size of the 3982 // vector's elements. 3983 switch (EltSize) { 3984 case 8: 3985 CopyOpc = AArch64::DUPi8; 3986 ExtractSubReg = AArch64::bsub; 3987 break; 3988 case 16: 3989 CopyOpc = AArch64::DUPi16; 3990 ExtractSubReg = AArch64::hsub; 3991 break; 3992 case 32: 3993 CopyOpc = AArch64::DUPi32; 3994 ExtractSubReg = AArch64::ssub; 3995 break; 3996 case 64: 3997 CopyOpc = AArch64::DUPi64; 3998 ExtractSubReg = AArch64::dsub; 3999 break; 4000 default: 4001 // Unknown size, bail out. 
4002 LLVM_DEBUG(dbgs() << "Elt size '" << EltSize << "' unsupported.\n"); 4003 return false; 4004 } 4005 return true; 4006 } 4007 4008 MachineInstr *AArch64InstructionSelector::emitExtractVectorElt( 4009 std::optional<Register> DstReg, const RegisterBank &DstRB, LLT ScalarTy, 4010 Register VecReg, unsigned LaneIdx, MachineIRBuilder &MIRBuilder) const { 4011 MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); 4012 unsigned CopyOpc = 0; 4013 unsigned ExtractSubReg = 0; 4014 if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, ScalarTy.getSizeInBits())) { 4015 LLVM_DEBUG( 4016 dbgs() << "Couldn't determine lane copy opcode for instruction.\n"); 4017 return nullptr; 4018 } 4019 4020 const TargetRegisterClass *DstRC = 4021 getRegClassForTypeOnBank(ScalarTy, DstRB, true); 4022 if (!DstRC) { 4023 LLVM_DEBUG(dbgs() << "Could not determine destination register class.\n"); 4024 return nullptr; 4025 } 4026 4027 const RegisterBank &VecRB = *RBI.getRegBank(VecReg, MRI, TRI); 4028 const LLT &VecTy = MRI.getType(VecReg); 4029 const TargetRegisterClass *VecRC = 4030 getRegClassForTypeOnBank(VecTy, VecRB, true); 4031 if (!VecRC) { 4032 LLVM_DEBUG(dbgs() << "Could not determine source register class.\n"); 4033 return nullptr; 4034 } 4035 4036 // The register that we're going to copy into. 4037 Register InsertReg = VecReg; 4038 if (!DstReg) 4039 DstReg = MRI.createVirtualRegister(DstRC); 4040 // If the lane index is 0, we just use a subregister COPY. 4041 if (LaneIdx == 0) { 4042 auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {*DstReg}, {}) 4043 .addReg(VecReg, 0, ExtractSubReg); 4044 RBI.constrainGenericRegister(*DstReg, *DstRC, MRI); 4045 return &*Copy; 4046 } 4047 4048 // Lane copies require 128-bit wide registers. If we're dealing with an 4049 // unpacked vector, then we need to move up to that width. Insert an implicit 4050 // def and a subregister insert to get us there. 4051 if (VecTy.getSizeInBits() != 128) { 4052 MachineInstr *ScalarToVector = emitScalarToVector( 4053 VecTy.getSizeInBits(), &AArch64::FPR128RegClass, VecReg, MIRBuilder); 4054 if (!ScalarToVector) 4055 return nullptr; 4056 InsertReg = ScalarToVector->getOperand(0).getReg(); 4057 } 4058 4059 MachineInstr *LaneCopyMI = 4060 MIRBuilder.buildInstr(CopyOpc, {*DstReg}, {InsertReg}).addImm(LaneIdx); 4061 constrainSelectedInstRegOperands(*LaneCopyMI, TII, TRI, RBI); 4062 4063 // Make sure that we actually constrain the initial copy. 4064 RBI.constrainGenericRegister(*DstReg, *DstRC, MRI); 4065 return LaneCopyMI; 4066 } 4067 4068 bool AArch64InstructionSelector::selectExtractElt( 4069 MachineInstr &I, MachineRegisterInfo &MRI) { 4070 assert(I.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT && 4071 "unexpected opcode!"); 4072 Register DstReg = I.getOperand(0).getReg(); 4073 const LLT NarrowTy = MRI.getType(DstReg); 4074 const Register SrcReg = I.getOperand(1).getReg(); 4075 const LLT WideTy = MRI.getType(SrcReg); 4076 (void)WideTy; 4077 assert(WideTy.getSizeInBits() >= NarrowTy.getSizeInBits() && 4078 "source register size too small!"); 4079 assert(!NarrowTy.isVector() && "cannot extract vector into vector!"); 4080 4081 // Need the lane index to determine the correct copy opcode. 4082 MachineOperand &LaneIdxOp = I.getOperand(2); 4083 assert(LaneIdxOp.isReg() && "Lane index operand was not a register?"); 4084 4085 if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) { 4086 LLVM_DEBUG(dbgs() << "Cannot extract into GPR.\n"); 4087 return false; 4088 } 4089 4090 // Find the index to extract from. 
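// Only constant lane indices are supported here; if the index isn't a known
// constant (after looking through copies), bail out.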
4091 auto VRegAndVal = getIConstantVRegValWithLookThrough(LaneIdxOp.getReg(), MRI); 4092 if (!VRegAndVal) 4093 return false; 4094 unsigned LaneIdx = VRegAndVal->Value.getSExtValue(); 4095 4096 4097 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); 4098 MachineInstr *Extract = emitExtractVectorElt(DstReg, DstRB, NarrowTy, SrcReg, 4099 LaneIdx, MIB); 4100 if (!Extract) 4101 return false; 4102 4103 I.eraseFromParent(); 4104 return true; 4105 } 4106 4107 bool AArch64InstructionSelector::selectSplitVectorUnmerge( 4108 MachineInstr &I, MachineRegisterInfo &MRI) { 4109 unsigned NumElts = I.getNumOperands() - 1; 4110 Register SrcReg = I.getOperand(NumElts).getReg(); 4111 const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg()); 4112 const LLT SrcTy = MRI.getType(SrcReg); 4113 4114 assert(NarrowTy.isVector() && "Expected an unmerge into vectors"); 4115 if (SrcTy.getSizeInBits() > 128) { 4116 LLVM_DEBUG(dbgs() << "Unexpected vector type for vec split unmerge"); 4117 return false; 4118 } 4119 4120 // We implement a split vector operation by treating the sub-vectors as 4121 // scalars and extracting them. 4122 const RegisterBank &DstRB = 4123 *RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI); 4124 for (unsigned OpIdx = 0; OpIdx < NumElts; ++OpIdx) { 4125 Register Dst = I.getOperand(OpIdx).getReg(); 4126 MachineInstr *Extract = 4127 emitExtractVectorElt(Dst, DstRB, NarrowTy, SrcReg, OpIdx, MIB); 4128 if (!Extract) 4129 return false; 4130 } 4131 I.eraseFromParent(); 4132 return true; 4133 } 4134 4135 bool AArch64InstructionSelector::selectUnmergeValues(MachineInstr &I, 4136 MachineRegisterInfo &MRI) { 4137 assert(I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES && 4138 "unexpected opcode"); 4139 4140 // TODO: Handle unmerging into GPRs and from scalars to scalars. 4141 if (RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI)->getID() != 4142 AArch64::FPRRegBankID || 4143 RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() != 4144 AArch64::FPRRegBankID) { 4145 LLVM_DEBUG(dbgs() << "Unmerging vector-to-gpr and scalar-to-scalar " 4146 "currently unsupported.\n"); 4147 return false; 4148 } 4149 4150 // The last operand is the vector source register, and every other operand is 4151 // a register to unpack into. 4152 unsigned NumElts = I.getNumOperands() - 1; 4153 Register SrcReg = I.getOperand(NumElts).getReg(); 4154 const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg()); 4155 const LLT WideTy = MRI.getType(SrcReg); 4156 (void)WideTy; 4157 assert((WideTy.isVector() || WideTy.getSizeInBits() == 128) && 4158 "can only unmerge from vector or s128 types!"); 4159 assert(WideTy.getSizeInBits() > NarrowTy.getSizeInBits() && 4160 "source register size too small!"); 4161 4162 if (!NarrowTy.isScalar()) 4163 return selectSplitVectorUnmerge(I, MRI); 4164 4165 // Choose a lane copy opcode and subregister based off of the size of the 4166 // vector's elements. 4167 unsigned CopyOpc = 0; 4168 unsigned ExtractSubReg = 0; 4169 if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, NarrowTy.getSizeInBits())) 4170 return false; 4171 4172 // Set up for the lane copies. 4173 MachineBasicBlock &MBB = *I.getParent(); 4174 4175 // Stores the registers we'll be copying from. 4176 SmallVector<Register, 4> InsertRegs; 4177 4178 // We'll use the first register twice, so we only need NumElts-1 registers. 4179 unsigned NumInsertRegs = NumElts - 1; 4180 4181 // If our elements fit into exactly 128 bits, then we can copy from the source 4182 // directly. 
Otherwise, we need to do a bit of setup with some subregister 4183 // inserts. 4184 if (NarrowTy.getSizeInBits() * NumElts == 128) { 4185 InsertRegs = SmallVector<Register, 4>(NumInsertRegs, SrcReg); 4186 } else { 4187 // No. We have to perform subregister inserts. For each insert, create an 4188 // implicit def and a subregister insert, and save the register we create. 4189 const TargetRegisterClass *RC = getRegClassForTypeOnBank( 4190 LLT::fixed_vector(NumElts, WideTy.getScalarSizeInBits()), 4191 *RBI.getRegBank(SrcReg, MRI, TRI)); 4192 unsigned SubReg = 0; 4193 bool Found = getSubRegForClass(RC, TRI, SubReg); 4194 (void)Found; 4195 assert(Found && "expected to find last operand's subeg idx"); 4196 for (unsigned Idx = 0; Idx < NumInsertRegs; ++Idx) { 4197 Register ImpDefReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass); 4198 MachineInstr &ImpDefMI = 4199 *BuildMI(MBB, I, I.getDebugLoc(), TII.get(TargetOpcode::IMPLICIT_DEF), 4200 ImpDefReg); 4201 4202 // Now, create the subregister insert from SrcReg. 4203 Register InsertReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass); 4204 MachineInstr &InsMI = 4205 *BuildMI(MBB, I, I.getDebugLoc(), 4206 TII.get(TargetOpcode::INSERT_SUBREG), InsertReg) 4207 .addUse(ImpDefReg) 4208 .addUse(SrcReg) 4209 .addImm(SubReg); 4210 4211 constrainSelectedInstRegOperands(ImpDefMI, TII, TRI, RBI); 4212 constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI); 4213 4214 // Save the register so that we can copy from it after. 4215 InsertRegs.push_back(InsertReg); 4216 } 4217 } 4218 4219 // Now that we've created any necessary subregister inserts, we can 4220 // create the copies. 4221 // 4222 // Perform the first copy separately as a subregister copy. 4223 Register CopyTo = I.getOperand(0).getReg(); 4224 auto FirstCopy = MIB.buildInstr(TargetOpcode::COPY, {CopyTo}, {}) 4225 .addReg(InsertRegs[0], 0, ExtractSubReg); 4226 constrainSelectedInstRegOperands(*FirstCopy, TII, TRI, RBI); 4227 4228 // Now, perform the remaining copies as vector lane copies. 4229 unsigned LaneIdx = 1; 4230 for (Register InsReg : InsertRegs) { 4231 Register CopyTo = I.getOperand(LaneIdx).getReg(); 4232 MachineInstr &CopyInst = 4233 *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CopyOpc), CopyTo) 4234 .addUse(InsReg) 4235 .addImm(LaneIdx); 4236 constrainSelectedInstRegOperands(CopyInst, TII, TRI, RBI); 4237 ++LaneIdx; 4238 } 4239 4240 // Separately constrain the first copy's destination. Because of the 4241 // limitation in constrainOperandRegClass, we can't guarantee that this will 4242 // actually be constrained. So, do it ourselves using the second operand. 
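// Operand 1 was already constrained by its lane copy above, so its register
// class is known and can be reused for the first copy's destination.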
4243 const TargetRegisterClass *RC = 4244 MRI.getRegClassOrNull(I.getOperand(1).getReg()); 4245 if (!RC) { 4246 LLVM_DEBUG(dbgs() << "Couldn't constrain copy destination.\n"); 4247 return false; 4248 } 4249 4250 RBI.constrainGenericRegister(CopyTo, *RC, MRI); 4251 I.eraseFromParent(); 4252 return true; 4253 } 4254 4255 bool AArch64InstructionSelector::selectConcatVectors( 4256 MachineInstr &I, MachineRegisterInfo &MRI) { 4257 assert(I.getOpcode() == TargetOpcode::G_CONCAT_VECTORS && 4258 "Unexpected opcode"); 4259 Register Dst = I.getOperand(0).getReg(); 4260 Register Op1 = I.getOperand(1).getReg(); 4261 Register Op2 = I.getOperand(2).getReg(); 4262 MachineInstr *ConcatMI = emitVectorConcat(Dst, Op1, Op2, MIB); 4263 if (!ConcatMI) 4264 return false; 4265 I.eraseFromParent(); 4266 return true; 4267 } 4268 4269 unsigned 4270 AArch64InstructionSelector::emitConstantPoolEntry(const Constant *CPVal, 4271 MachineFunction &MF) const { 4272 Type *CPTy = CPVal->getType(); 4273 Align Alignment = MF.getDataLayout().getPrefTypeAlign(CPTy); 4274 4275 MachineConstantPool *MCP = MF.getConstantPool(); 4276 return MCP->getConstantPoolIndex(CPVal, Alignment); 4277 } 4278 4279 MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool( 4280 const Constant *CPVal, MachineIRBuilder &MIRBuilder) const { 4281 const TargetRegisterClass *RC; 4282 unsigned Opc; 4283 bool IsTiny = TM.getCodeModel() == CodeModel::Tiny; 4284 unsigned Size = MIRBuilder.getDataLayout().getTypeStoreSize(CPVal->getType()); 4285 switch (Size) { 4286 case 16: 4287 RC = &AArch64::FPR128RegClass; 4288 Opc = IsTiny ? AArch64::LDRQl : AArch64::LDRQui; 4289 break; 4290 case 8: 4291 RC = &AArch64::FPR64RegClass; 4292 Opc = IsTiny ? AArch64::LDRDl : AArch64::LDRDui; 4293 break; 4294 case 4: 4295 RC = &AArch64::FPR32RegClass; 4296 Opc = IsTiny ? AArch64::LDRSl : AArch64::LDRSui; 4297 break; 4298 case 2: 4299 RC = &AArch64::FPR16RegClass; 4300 Opc = AArch64::LDRHui; 4301 break; 4302 default: 4303 LLVM_DEBUG(dbgs() << "Could not load from constant pool of type " 4304 << *CPVal->getType()); 4305 return nullptr; 4306 } 4307 4308 MachineInstr *LoadMI = nullptr; 4309 auto &MF = MIRBuilder.getMF(); 4310 unsigned CPIdx = emitConstantPoolEntry(CPVal, MF); 4311 if (IsTiny && (Size == 16 || Size == 8 || Size == 4)) { 4312 // Use load(literal) for tiny code model. 4313 LoadMI = &*MIRBuilder.buildInstr(Opc, {RC}, {}).addConstantPoolIndex(CPIdx); 4314 } else { 4315 auto Adrp = 4316 MIRBuilder.buildInstr(AArch64::ADRP, {&AArch64::GPR64RegClass}, {}) 4317 .addConstantPoolIndex(CPIdx, 0, AArch64II::MO_PAGE); 4318 4319 LoadMI = &*MIRBuilder.buildInstr(Opc, {RC}, {Adrp}) 4320 .addConstantPoolIndex( 4321 CPIdx, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 4322 4323 constrainSelectedInstRegOperands(*Adrp, TII, TRI, RBI); 4324 } 4325 4326 MachinePointerInfo PtrInfo = MachinePointerInfo::getConstantPool(MF); 4327 LoadMI->addMemOperand(MF, MF.getMachineMemOperand(PtrInfo, 4328 MachineMemOperand::MOLoad, 4329 Size, Align(Size))); 4330 constrainSelectedInstRegOperands(*LoadMI, TII, TRI, RBI); 4331 return LoadMI; 4332 } 4333 4334 /// Return an <Opcode, SubregIndex> pair to do an vector elt insert of a given 4335 /// size and RB. 
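/// E.g. a 32-bit insert from a GPR yields {INSvi32gpr, ssub}, while the same
/// insert from an FPR lane yields {INSvi32lane, ssub}.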
4336 static std::pair<unsigned, unsigned> 4337 getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) { 4338 unsigned Opc, SubregIdx; 4339 if (RB.getID() == AArch64::GPRRegBankID) { 4340 if (EltSize == 8) { 4341 Opc = AArch64::INSvi8gpr; 4342 SubregIdx = AArch64::bsub; 4343 } else if (EltSize == 16) { 4344 Opc = AArch64::INSvi16gpr; 4345 SubregIdx = AArch64::ssub; 4346 } else if (EltSize == 32) { 4347 Opc = AArch64::INSvi32gpr; 4348 SubregIdx = AArch64::ssub; 4349 } else if (EltSize == 64) { 4350 Opc = AArch64::INSvi64gpr; 4351 SubregIdx = AArch64::dsub; 4352 } else { 4353 llvm_unreachable("invalid elt size!"); 4354 } 4355 } else { 4356 if (EltSize == 8) { 4357 Opc = AArch64::INSvi8lane; 4358 SubregIdx = AArch64::bsub; 4359 } else if (EltSize == 16) { 4360 Opc = AArch64::INSvi16lane; 4361 SubregIdx = AArch64::hsub; 4362 } else if (EltSize == 32) { 4363 Opc = AArch64::INSvi32lane; 4364 SubregIdx = AArch64::ssub; 4365 } else if (EltSize == 64) { 4366 Opc = AArch64::INSvi64lane; 4367 SubregIdx = AArch64::dsub; 4368 } else { 4369 llvm_unreachable("invalid elt size!"); 4370 } 4371 } 4372 return std::make_pair(Opc, SubregIdx); 4373 } 4374 4375 MachineInstr *AArch64InstructionSelector::emitInstr( 4376 unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps, 4377 std::initializer_list<llvm::SrcOp> SrcOps, MachineIRBuilder &MIRBuilder, 4378 const ComplexRendererFns &RenderFns) const { 4379 assert(Opcode && "Expected an opcode?"); 4380 assert(!isPreISelGenericOpcode(Opcode) && 4381 "Function should only be used to produce selected instructions!"); 4382 auto MI = MIRBuilder.buildInstr(Opcode, DstOps, SrcOps); 4383 if (RenderFns) 4384 for (auto &Fn : *RenderFns) 4385 Fn(MI); 4386 constrainSelectedInstRegOperands(*MI, TII, TRI, RBI); 4387 return &*MI; 4388 } 4389 4390 MachineInstr *AArch64InstructionSelector::emitAddSub( 4391 const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode, 4392 Register Dst, MachineOperand &LHS, MachineOperand &RHS, 4393 MachineIRBuilder &MIRBuilder) const { 4394 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); 4395 assert(LHS.isReg() && RHS.isReg() && "Expected register operands?"); 4396 auto Ty = MRI.getType(LHS.getReg()); 4397 assert(!Ty.isVector() && "Expected a scalar or pointer?"); 4398 unsigned Size = Ty.getSizeInBits(); 4399 assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit type only"); 4400 bool Is32Bit = Size == 32; 4401 4402 // INSTRri form with positive arithmetic immediate. 4403 if (auto Fns = selectArithImmed(RHS)) 4404 return emitInstr(AddrModeAndSizeToOpcode[0][Is32Bit], {Dst}, {LHS}, 4405 MIRBuilder, Fns); 4406 4407 // INSTRri form with negative arithmetic immediate. 4408 if (auto Fns = selectNegArithImmed(RHS)) 4409 return emitInstr(AddrModeAndSizeToOpcode[3][Is32Bit], {Dst}, {LHS}, 4410 MIRBuilder, Fns); 4411 4412 // INSTRrx form. 4413 if (auto Fns = selectArithExtendedRegister(RHS)) 4414 return emitInstr(AddrModeAndSizeToOpcode[4][Is32Bit], {Dst}, {LHS}, 4415 MIRBuilder, Fns); 4416 4417 // INSTRrs form. 
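// (register-plus-shift operand, e.g. ADD Xd, Xn, Xm, lsl #3, used when the
// RHS is itself a shift by a constant that can be folded in).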
4418 if (auto Fns = selectShiftedRegister(RHS)) 4419 return emitInstr(AddrModeAndSizeToOpcode[1][Is32Bit], {Dst}, {LHS}, 4420 MIRBuilder, Fns); 4421 return emitInstr(AddrModeAndSizeToOpcode[2][Is32Bit], {Dst}, {LHS, RHS}, 4422 MIRBuilder); 4423 } 4424 4425 MachineInstr * 4426 AArch64InstructionSelector::emitADD(Register DefReg, MachineOperand &LHS, 4427 MachineOperand &RHS, 4428 MachineIRBuilder &MIRBuilder) const { 4429 const std::array<std::array<unsigned, 2>, 5> OpcTable{ 4430 {{AArch64::ADDXri, AArch64::ADDWri}, 4431 {AArch64::ADDXrs, AArch64::ADDWrs}, 4432 {AArch64::ADDXrr, AArch64::ADDWrr}, 4433 {AArch64::SUBXri, AArch64::SUBWri}, 4434 {AArch64::ADDXrx, AArch64::ADDWrx}}}; 4435 return emitAddSub(OpcTable, DefReg, LHS, RHS, MIRBuilder); 4436 } 4437 4438 MachineInstr * 4439 AArch64InstructionSelector::emitADDS(Register Dst, MachineOperand &LHS, 4440 MachineOperand &RHS, 4441 MachineIRBuilder &MIRBuilder) const { 4442 const std::array<std::array<unsigned, 2>, 5> OpcTable{ 4443 {{AArch64::ADDSXri, AArch64::ADDSWri}, 4444 {AArch64::ADDSXrs, AArch64::ADDSWrs}, 4445 {AArch64::ADDSXrr, AArch64::ADDSWrr}, 4446 {AArch64::SUBSXri, AArch64::SUBSWri}, 4447 {AArch64::ADDSXrx, AArch64::ADDSWrx}}}; 4448 return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder); 4449 } 4450 4451 MachineInstr * 4452 AArch64InstructionSelector::emitSUBS(Register Dst, MachineOperand &LHS, 4453 MachineOperand &RHS, 4454 MachineIRBuilder &MIRBuilder) const { 4455 const std::array<std::array<unsigned, 2>, 5> OpcTable{ 4456 {{AArch64::SUBSXri, AArch64::SUBSWri}, 4457 {AArch64::SUBSXrs, AArch64::SUBSWrs}, 4458 {AArch64::SUBSXrr, AArch64::SUBSWrr}, 4459 {AArch64::ADDSXri, AArch64::ADDSWri}, 4460 {AArch64::SUBSXrx, AArch64::SUBSWrx}}}; 4461 return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder); 4462 } 4463 4464 MachineInstr * 4465 AArch64InstructionSelector::emitADCS(Register Dst, MachineOperand &LHS, 4466 MachineOperand &RHS, 4467 MachineIRBuilder &MIRBuilder) const { 4468 assert(LHS.isReg() && RHS.isReg() && "Expected register operands?"); 4469 MachineRegisterInfo *MRI = MIRBuilder.getMRI(); 4470 bool Is32Bit = (MRI->getType(LHS.getReg()).getSizeInBits() == 32); 4471 static const unsigned OpcTable[2] = {AArch64::ADCSXr, AArch64::ADCSWr}; 4472 return emitInstr(OpcTable[Is32Bit], {Dst}, {LHS, RHS}, MIRBuilder); 4473 } 4474 4475 MachineInstr * 4476 AArch64InstructionSelector::emitSBCS(Register Dst, MachineOperand &LHS, 4477 MachineOperand &RHS, 4478 MachineIRBuilder &MIRBuilder) const { 4479 assert(LHS.isReg() && RHS.isReg() && "Expected register operands?"); 4480 MachineRegisterInfo *MRI = MIRBuilder.getMRI(); 4481 bool Is32Bit = (MRI->getType(LHS.getReg()).getSizeInBits() == 32); 4482 static const unsigned OpcTable[2] = {AArch64::SBCSXr, AArch64::SBCSWr}; 4483 return emitInstr(OpcTable[Is32Bit], {Dst}, {LHS, RHS}, MIRBuilder); 4484 } 4485 4486 MachineInstr * 4487 AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS, 4488 MachineIRBuilder &MIRBuilder) const { 4489 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); 4490 bool Is32Bit = (MRI.getType(LHS.getReg()).getSizeInBits() == 32); 4491 auto RC = Is32Bit ? 
&AArch64::GPR32RegClass : &AArch64::GPR64RegClass; 4492 return emitADDS(MRI.createVirtualRegister(RC), LHS, RHS, MIRBuilder); 4493 } 4494 4495 MachineInstr * 4496 AArch64InstructionSelector::emitTST(MachineOperand &LHS, MachineOperand &RHS, 4497 MachineIRBuilder &MIRBuilder) const { 4498 assert(LHS.isReg() && RHS.isReg() && "Expected register operands?"); 4499 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); 4500 LLT Ty = MRI.getType(LHS.getReg()); 4501 unsigned RegSize = Ty.getSizeInBits(); 4502 bool Is32Bit = (RegSize == 32); 4503 const unsigned OpcTable[3][2] = {{AArch64::ANDSXri, AArch64::ANDSWri}, 4504 {AArch64::ANDSXrs, AArch64::ANDSWrs}, 4505 {AArch64::ANDSXrr, AArch64::ANDSWrr}}; 4506 // ANDS needs a logical immediate for its immediate form. Check if we can 4507 // fold one in. 4508 if (auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS.getReg(), MRI)) { 4509 int64_t Imm = ValAndVReg->Value.getSExtValue(); 4510 4511 if (AArch64_AM::isLogicalImmediate(Imm, RegSize)) { 4512 auto TstMI = MIRBuilder.buildInstr(OpcTable[0][Is32Bit], {Ty}, {LHS}); 4513 TstMI.addImm(AArch64_AM::encodeLogicalImmediate(Imm, RegSize)); 4514 constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI); 4515 return &*TstMI; 4516 } 4517 } 4518 4519 if (auto Fns = selectLogicalShiftedRegister(RHS)) 4520 return emitInstr(OpcTable[1][Is32Bit], {Ty}, {LHS}, MIRBuilder, Fns); 4521 return emitInstr(OpcTable[2][Is32Bit], {Ty}, {LHS, RHS}, MIRBuilder); 4522 } 4523 4524 MachineInstr *AArch64InstructionSelector::emitIntegerCompare( 4525 MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate, 4526 MachineIRBuilder &MIRBuilder) const { 4527 assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!"); 4528 assert(Predicate.isPredicate() && "Expected predicate?"); 4529 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); 4530 LLT CmpTy = MRI.getType(LHS.getReg()); 4531 assert(!CmpTy.isVector() && "Expected scalar or pointer"); 4532 unsigned Size = CmpTy.getSizeInBits(); 4533 (void)Size; 4534 assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit LHS/RHS?"); 4535 // Fold the compare into a cmn or tst if possible. 
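// E.g. a compare against a negated value can become a CMN, and an equality
// compare of a G_AND against zero can become a TST, avoiding the SUBS
// emitted below.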
4536 if (auto FoldCmp = tryFoldIntegerCompare(LHS, RHS, Predicate, MIRBuilder)) 4537 return FoldCmp; 4538 auto Dst = MRI.cloneVirtualRegister(LHS.getReg()); 4539 return emitSUBS(Dst, LHS, RHS, MIRBuilder); 4540 } 4541 4542 MachineInstr *AArch64InstructionSelector::emitCSetForFCmp( 4543 Register Dst, CmpInst::Predicate Pred, MachineIRBuilder &MIRBuilder) const { 4544 MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); 4545 #ifndef NDEBUG 4546 LLT Ty = MRI.getType(Dst); 4547 assert(!Ty.isVector() && Ty.getSizeInBits() == 32 && 4548 "Expected a 32-bit scalar register?"); 4549 #endif 4550 const Register ZReg = AArch64::WZR; 4551 AArch64CC::CondCode CC1, CC2; 4552 changeFCMPPredToAArch64CC(Pred, CC1, CC2); 4553 auto InvCC1 = AArch64CC::getInvertedCondCode(CC1); 4554 if (CC2 == AArch64CC::AL) 4555 return emitCSINC(/*Dst=*/Dst, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC1, 4556 MIRBuilder); 4557 const TargetRegisterClass *RC = &AArch64::GPR32RegClass; 4558 Register Def1Reg = MRI.createVirtualRegister(RC); 4559 Register Def2Reg = MRI.createVirtualRegister(RC); 4560 auto InvCC2 = AArch64CC::getInvertedCondCode(CC2); 4561 emitCSINC(/*Dst=*/Def1Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC1, MIRBuilder); 4562 emitCSINC(/*Dst=*/Def2Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC2, MIRBuilder); 4563 auto OrMI = MIRBuilder.buildInstr(AArch64::ORRWrr, {Dst}, {Def1Reg, Def2Reg}); 4564 constrainSelectedInstRegOperands(*OrMI, TII, TRI, RBI); 4565 return &*OrMI; 4566 } 4567 4568 MachineInstr *AArch64InstructionSelector::emitFPCompare( 4569 Register LHS, Register RHS, MachineIRBuilder &MIRBuilder, 4570 std::optional<CmpInst::Predicate> Pred) const { 4571 MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); 4572 LLT Ty = MRI.getType(LHS); 4573 if (Ty.isVector()) 4574 return nullptr; 4575 unsigned OpSize = Ty.getSizeInBits(); 4576 assert(OpSize == 16 || OpSize == 32 || OpSize == 64); 4577 4578 // If this is a compare against +0.0, then we don't have 4579 // to explicitly materialize a constant. 4580 const ConstantFP *FPImm = getConstantFPVRegVal(RHS, MRI); 4581 bool ShouldUseImm = FPImm && (FPImm->isZero() && !FPImm->isNegative()); 4582 4583 auto IsEqualityPred = [](CmpInst::Predicate P) { 4584 return P == CmpInst::FCMP_OEQ || P == CmpInst::FCMP_ONE || 4585 P == CmpInst::FCMP_UEQ || P == CmpInst::FCMP_UNE; 4586 }; 4587 if (!ShouldUseImm && Pred && IsEqualityPred(*Pred)) { 4588 // Try commutating the operands. 4589 const ConstantFP *LHSImm = getConstantFPVRegVal(LHS, MRI); 4590 if (LHSImm && (LHSImm->isZero() && !LHSImm->isNegative())) { 4591 ShouldUseImm = true; 4592 std::swap(LHS, RHS); 4593 } 4594 } 4595 unsigned CmpOpcTbl[2][3] = { 4596 {AArch64::FCMPHrr, AArch64::FCMPSrr, AArch64::FCMPDrr}, 4597 {AArch64::FCMPHri, AArch64::FCMPSri, AArch64::FCMPDri}}; 4598 unsigned CmpOpc = 4599 CmpOpcTbl[ShouldUseImm][OpSize == 16 ? 0 : (OpSize == 32 ? 1 : 2)]; 4600 4601 // Partially build the compare. Decide if we need to add a use for the 4602 // third operand based off whether or not we're comparing against 0.0. 4603 auto CmpMI = MIRBuilder.buildInstr(CmpOpc).addUse(LHS); 4604 CmpMI.setMIFlags(MachineInstr::NoFPExcept); 4605 if (!ShouldUseImm) 4606 CmpMI.addUse(RHS); 4607 constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI); 4608 return &*CmpMI; 4609 } 4610 4611 MachineInstr *AArch64InstructionSelector::emitVectorConcat( 4612 std::optional<Register> Dst, Register Op1, Register Op2, 4613 MachineIRBuilder &MIRBuilder) const { 4614 // We implement a vector concat by: 4615 // 1. 
Use scalar_to_vector to insert the lower vector into the larger dest 4616 // 2. Insert the upper vector into the destination's upper element 4617 // TODO: some of this code is common with G_BUILD_VECTOR handling. 4618 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); 4619 4620 const LLT Op1Ty = MRI.getType(Op1); 4621 const LLT Op2Ty = MRI.getType(Op2); 4622 4623 if (Op1Ty != Op2Ty) { 4624 LLVM_DEBUG(dbgs() << "Could not do vector concat of differing vector tys"); 4625 return nullptr; 4626 } 4627 assert(Op1Ty.isVector() && "Expected a vector for vector concat"); 4628 4629 if (Op1Ty.getSizeInBits() >= 128) { 4630 LLVM_DEBUG(dbgs() << "Vector concat not supported for full size vectors"); 4631 return nullptr; 4632 } 4633 4634 // At the moment we just support 64 bit vector concats. 4635 if (Op1Ty.getSizeInBits() != 64) { 4636 LLVM_DEBUG(dbgs() << "Vector concat supported for 64b vectors"); 4637 return nullptr; 4638 } 4639 4640 const LLT ScalarTy = LLT::scalar(Op1Ty.getSizeInBits()); 4641 const RegisterBank &FPRBank = *RBI.getRegBank(Op1, MRI, TRI); 4642 const TargetRegisterClass *DstRC = 4643 getRegClassForTypeOnBank(Op1Ty.multiplyElements(2), FPRBank); 4644 4645 MachineInstr *WidenedOp1 = 4646 emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op1, MIRBuilder); 4647 MachineInstr *WidenedOp2 = 4648 emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op2, MIRBuilder); 4649 if (!WidenedOp1 || !WidenedOp2) { 4650 LLVM_DEBUG(dbgs() << "Could not emit a vector from scalar value"); 4651 return nullptr; 4652 } 4653 4654 // Now do the insert of the upper element. 4655 unsigned InsertOpc, InsSubRegIdx; 4656 std::tie(InsertOpc, InsSubRegIdx) = 4657 getInsertVecEltOpInfo(FPRBank, ScalarTy.getSizeInBits()); 4658 4659 if (!Dst) 4660 Dst = MRI.createVirtualRegister(DstRC); 4661 auto InsElt = 4662 MIRBuilder 4663 .buildInstr(InsertOpc, {*Dst}, {WidenedOp1->getOperand(0).getReg()}) 4664 .addImm(1) /* Lane index */ 4665 .addUse(WidenedOp2->getOperand(0).getReg()) 4666 .addImm(0); 4667 constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI); 4668 return &*InsElt; 4669 } 4670 4671 MachineInstr * 4672 AArch64InstructionSelector::emitCSINC(Register Dst, Register Src1, 4673 Register Src2, AArch64CC::CondCode Pred, 4674 MachineIRBuilder &MIRBuilder) const { 4675 auto &MRI = *MIRBuilder.getMRI(); 4676 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Dst); 4677 // If we used a register class, then this won't necessarily have an LLT. 4678 // Compute the size based off whether or not we have a class or bank. 4679 unsigned Size; 4680 if (const auto *RC = dyn_cast<const TargetRegisterClass *>(RegClassOrBank)) 4681 Size = TRI.getRegSizeInBits(*RC); 4682 else 4683 Size = MRI.getType(Dst).getSizeInBits(); 4684 // Some opcodes use s1. 4685 assert(Size <= 64 && "Expected 64 bits or less only!"); 4686 static const unsigned OpcTable[2] = {AArch64::CSINCWr, AArch64::CSINCXr}; 4687 unsigned Opc = OpcTable[Size == 64]; 4688 auto CSINC = MIRBuilder.buildInstr(Opc, {Dst}, {Src1, Src2}).addImm(Pred); 4689 constrainSelectedInstRegOperands(*CSINC, TII, TRI, RBI); 4690 return &*CSINC; 4691 } 4692 4693 MachineInstr *AArch64InstructionSelector::emitCarryIn(MachineInstr &I, 4694 Register CarryReg) { 4695 MachineRegisterInfo *MRI = MIB.getMRI(); 4696 unsigned Opcode = I.getOpcode(); 4697 4698 // If the instruction is a SUB, we need to negate the carry, 4699 // because borrowing is indicated by carry-flag == 0. 
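// Illustrative summary (assuming the carry vreg only ever holds 0 or 1, as
// produced when legalizing wide add/sub): the value is transferred into the
// NZCV C flag with a flag-setting subtract, roughly
//   subs wdead, wcarry, #1    ; add/ADCS path:  C == 1 iff carry was 1
//   subs wdead, wzr, wcarry   ; sub/SBCS path:  C == 0 iff carry was 1
// where wdead stands for the dead scratch register created below.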
4700 bool NeedsNegatedCarry = 4701 (Opcode == TargetOpcode::G_USUBE || Opcode == TargetOpcode::G_SSUBE); 4702 4703 // If the previous instruction will already produce the correct carry, do not 4704 // emit a carry generating instruction. E.g. for G_UADDE/G_USUBE sequences 4705 // generated during legalization of wide add/sub. This optimization depends on 4706 // these sequences not being interrupted by other instructions. 4707 // We have to select the previous instruction before the carry-using 4708 // instruction is deleted by the calling function, otherwise the previous 4709 // instruction might become dead and would get deleted. 4710 MachineInstr *SrcMI = MRI->getVRegDef(CarryReg); 4711 if (SrcMI == I.getPrevNode()) { 4712 if (auto *CarrySrcMI = dyn_cast<GAddSubCarryOut>(SrcMI)) { 4713 bool ProducesNegatedCarry = CarrySrcMI->isSub(); 4714 if (NeedsNegatedCarry == ProducesNegatedCarry && 4715 CarrySrcMI->isUnsigned() && 4716 CarrySrcMI->getCarryOutReg() == CarryReg && 4717 selectAndRestoreState(*SrcMI)) 4718 return nullptr; 4719 } 4720 } 4721 4722 Register DeadReg = MRI->createVirtualRegister(&AArch64::GPR32RegClass); 4723 4724 if (NeedsNegatedCarry) { 4725 // (0 - Carry) sets !C in NZCV when Carry == 1 4726 Register ZReg = AArch64::WZR; 4727 return emitInstr(AArch64::SUBSWrr, {DeadReg}, {ZReg, CarryReg}, MIB); 4728 } 4729 4730 // (Carry - 1) sets !C in NZCV when Carry == 0 4731 auto Fns = select12BitValueWithLeftShift(1); 4732 return emitInstr(AArch64::SUBSWri, {DeadReg}, {CarryReg}, MIB, Fns); 4733 } 4734 4735 bool AArch64InstructionSelector::selectOverflowOp(MachineInstr &I, 4736 MachineRegisterInfo &MRI) { 4737 auto &CarryMI = cast<GAddSubCarryOut>(I); 4738 4739 if (auto *CarryInMI = dyn_cast<GAddSubCarryInOut>(&I)) { 4740 // Set NZCV carry according to carry-in VReg 4741 emitCarryIn(I, CarryInMI->getCarryInReg()); 4742 } 4743 4744 // Emit the operation and get the correct condition code. 4745 auto OpAndCC = emitOverflowOp(I.getOpcode(), CarryMI.getDstReg(), 4746 CarryMI.getLHS(), CarryMI.getRHS(), MIB); 4747 4748 Register CarryOutReg = CarryMI.getCarryOutReg(); 4749 4750 // Don't convert carry-out to VReg if it is never used 4751 if (!MRI.use_nodbg_empty(CarryOutReg)) { 4752 // Now, put the overflow result in the register given by the first operand 4753 // to the overflow op. CSINC increments the result when the predicate is 4754 // false, so to get the increment when it's true, we need to use the 4755 // inverse. In this case, we want to increment when carry is set. 
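// Roughly:
//   csinc wcarryout, wzr, wzr, <inverted cc>
// which materializes 1 when the condition returned by emitOverflowOp holds
// and 0 otherwise.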
4756 Register ZReg = AArch64::WZR; 4757 emitCSINC(/*Dst=*/CarryOutReg, /*Src1=*/ZReg, /*Src2=*/ZReg, 4758 getInvertedCondCode(OpAndCC.second), MIB); 4759 } 4760 4761 I.eraseFromParent(); 4762 return true; 4763 } 4764 4765 std::pair<MachineInstr *, AArch64CC::CondCode> 4766 AArch64InstructionSelector::emitOverflowOp(unsigned Opcode, Register Dst, 4767 MachineOperand &LHS, 4768 MachineOperand &RHS, 4769 MachineIRBuilder &MIRBuilder) const { 4770 switch (Opcode) { 4771 default: 4772 llvm_unreachable("Unexpected opcode!"); 4773 case TargetOpcode::G_SADDO: 4774 return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS); 4775 case TargetOpcode::G_UADDO: 4776 return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::HS); 4777 case TargetOpcode::G_SSUBO: 4778 return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS); 4779 case TargetOpcode::G_USUBO: 4780 return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::LO); 4781 case TargetOpcode::G_SADDE: 4782 return std::make_pair(emitADCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS); 4783 case TargetOpcode::G_UADDE: 4784 return std::make_pair(emitADCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::HS); 4785 case TargetOpcode::G_SSUBE: 4786 return std::make_pair(emitSBCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS); 4787 case TargetOpcode::G_USUBE: 4788 return std::make_pair(emitSBCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::LO); 4789 } 4790 } 4791 4792 /// Returns true if @p Val is a tree of AND/OR/CMP operations that can be 4793 /// expressed as a conjunction. 4794 /// \param CanNegate Set to true if we can negate the whole sub-tree just by 4795 /// changing the conditions on the CMP tests. 4796 /// (this means we can call emitConjunctionRec() with 4797 /// Negate==true on this sub-tree) 4798 /// \param MustBeFirst Set to true if this subtree needs to be negated and we 4799 /// cannot do the negation naturally. We are required to 4800 /// emit the subtree first in this case. 4801 /// \param WillNegate Is true if are called when the result of this 4802 /// subexpression must be negated. This happens when the 4803 /// outer expression is an OR. We can use this fact to know 4804 /// that we have a double negation (or (or ...) ...) that 4805 /// can be implemented for free. 4806 static bool canEmitConjunction(Register Val, bool &CanNegate, bool &MustBeFirst, 4807 bool WillNegate, MachineRegisterInfo &MRI, 4808 unsigned Depth = 0) { 4809 if (!MRI.hasOneNonDBGUse(Val)) 4810 return false; 4811 MachineInstr *ValDef = MRI.getVRegDef(Val); 4812 unsigned Opcode = ValDef->getOpcode(); 4813 if (isa<GAnyCmp>(ValDef)) { 4814 CanNegate = true; 4815 MustBeFirst = false; 4816 return true; 4817 } 4818 // Protect against exponential runtime and stack overflow. 4819 if (Depth > 6) 4820 return false; 4821 if (Opcode == TargetOpcode::G_AND || Opcode == TargetOpcode::G_OR) { 4822 bool IsOR = Opcode == TargetOpcode::G_OR; 4823 Register O0 = ValDef->getOperand(1).getReg(); 4824 Register O1 = ValDef->getOperand(2).getReg(); 4825 bool CanNegateL; 4826 bool MustBeFirstL; 4827 if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, MRI, Depth + 1)) 4828 return false; 4829 bool CanNegateR; 4830 bool MustBeFirstR; 4831 if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, MRI, Depth + 1)) 4832 return false; 4833 4834 if (MustBeFirstL && MustBeFirstR) 4835 return false; 4836 4837 if (IsOR) { 4838 // For an OR expression we need to be able to naturally negate at least 4839 // one side or we cannot do the transformation at all. 
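// (By De Morgan, (a || b) is emitted as !(!a && !b), so at least one leaf
// must be able to absorb a negation simply by flipping its condition code.)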
4840 if (!CanNegateL && !CanNegateR) 4841 return false; 4842 // If we the result of the OR will be negated and we can naturally negate 4843 // the leaves, then this sub-tree as a whole negates naturally. 4844 CanNegate = WillNegate && CanNegateL && CanNegateR; 4845 // If we cannot naturally negate the whole sub-tree, then this must be 4846 // emitted first. 4847 MustBeFirst = !CanNegate; 4848 } else { 4849 assert(Opcode == TargetOpcode::G_AND && "Must be G_AND"); 4850 // We cannot naturally negate an AND operation. 4851 CanNegate = false; 4852 MustBeFirst = MustBeFirstL || MustBeFirstR; 4853 } 4854 return true; 4855 } 4856 return false; 4857 } 4858 4859 MachineInstr *AArch64InstructionSelector::emitConditionalComparison( 4860 Register LHS, Register RHS, CmpInst::Predicate CC, 4861 AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC, 4862 MachineIRBuilder &MIB) const { 4863 auto &MRI = *MIB.getMRI(); 4864 LLT OpTy = MRI.getType(LHS); 4865 unsigned CCmpOpc; 4866 std::optional<ValueAndVReg> C; 4867 if (CmpInst::isIntPredicate(CC)) { 4868 assert(OpTy.getSizeInBits() == 32 || OpTy.getSizeInBits() == 64); 4869 C = getIConstantVRegValWithLookThrough(RHS, MRI); 4870 if (!C || C->Value.sgt(31) || C->Value.slt(-31)) 4871 CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWr : AArch64::CCMPXr; 4872 else if (C->Value.ule(31)) 4873 CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWi : AArch64::CCMPXi; 4874 else 4875 CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMNWi : AArch64::CCMNXi; 4876 } else { 4877 assert(OpTy.getSizeInBits() == 16 || OpTy.getSizeInBits() == 32 || 4878 OpTy.getSizeInBits() == 64); 4879 switch (OpTy.getSizeInBits()) { 4880 case 16: 4881 assert(STI.hasFullFP16() && "Expected Full FP16 for fp16 comparisons"); 4882 CCmpOpc = AArch64::FCCMPHrr; 4883 break; 4884 case 32: 4885 CCmpOpc = AArch64::FCCMPSrr; 4886 break; 4887 case 64: 4888 CCmpOpc = AArch64::FCCMPDrr; 4889 break; 4890 default: 4891 return nullptr; 4892 } 4893 } 4894 AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC); 4895 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC); 4896 auto CCmp = 4897 MIB.buildInstr(CCmpOpc, {}, {LHS}); 4898 if (CCmpOpc == AArch64::CCMPWi || CCmpOpc == AArch64::CCMPXi) 4899 CCmp.addImm(C->Value.getZExtValue()); 4900 else if (CCmpOpc == AArch64::CCMNWi || CCmpOpc == AArch64::CCMNXi) 4901 CCmp.addImm(C->Value.abs().getZExtValue()); 4902 else 4903 CCmp.addReg(RHS); 4904 CCmp.addImm(NZCV).addImm(Predicate); 4905 constrainSelectedInstRegOperands(*CCmp, TII, TRI, RBI); 4906 return &*CCmp; 4907 } 4908 4909 MachineInstr *AArch64InstructionSelector::emitConjunctionRec( 4910 Register Val, AArch64CC::CondCode &OutCC, bool Negate, Register CCOp, 4911 AArch64CC::CondCode Predicate, MachineIRBuilder &MIB) const { 4912 // We're at a tree leaf, produce a conditional comparison operation. 4913 auto &MRI = *MIB.getMRI(); 4914 MachineInstr *ValDef = MRI.getVRegDef(Val); 4915 unsigned Opcode = ValDef->getOpcode(); 4916 if (auto *Cmp = dyn_cast<GAnyCmp>(ValDef)) { 4917 Register LHS = Cmp->getLHSReg(); 4918 Register RHS = Cmp->getRHSReg(); 4919 CmpInst::Predicate CC = Cmp->getCond(); 4920 if (Negate) 4921 CC = CmpInst::getInversePredicate(CC); 4922 if (isa<GICmp>(Cmp)) { 4923 OutCC = changeICMPPredToAArch64CC(CC); 4924 } else { 4925 // Handle special FP cases. 4926 AArch64CC::CondCode ExtraCC; 4927 changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC); 4928 // Some floating point conditions can't be tested with a single condition 4929 // code. 
Construct an additional comparison in this case. 4930 if (ExtraCC != AArch64CC::AL) { 4931 MachineInstr *ExtraCmp; 4932 if (!CCOp) 4933 ExtraCmp = emitFPCompare(LHS, RHS, MIB, CC); 4934 else 4935 ExtraCmp = 4936 emitConditionalComparison(LHS, RHS, CC, Predicate, ExtraCC, MIB); 4937 CCOp = ExtraCmp->getOperand(0).getReg(); 4938 Predicate = ExtraCC; 4939 } 4940 } 4941 4942 // Produce a normal comparison if we are first in the chain 4943 if (!CCOp) { 4944 auto Dst = MRI.cloneVirtualRegister(LHS); 4945 if (isa<GICmp>(Cmp)) 4946 return emitSUBS(Dst, Cmp->getOperand(2), Cmp->getOperand(3), MIB); 4947 return emitFPCompare(Cmp->getOperand(2).getReg(), 4948 Cmp->getOperand(3).getReg(), MIB); 4949 } 4950 // Otherwise produce a ccmp. 4951 return emitConditionalComparison(LHS, RHS, CC, Predicate, OutCC, MIB); 4952 } 4953 assert(MRI.hasOneNonDBGUse(Val) && "Valid conjunction/disjunction tree"); 4954 4955 bool IsOR = Opcode == TargetOpcode::G_OR; 4956 4957 Register LHS = ValDef->getOperand(1).getReg(); 4958 bool CanNegateL; 4959 bool MustBeFirstL; 4960 bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR, MRI); 4961 assert(ValidL && "Valid conjunction/disjunction tree"); 4962 (void)ValidL; 4963 4964 Register RHS = ValDef->getOperand(2).getReg(); 4965 bool CanNegateR; 4966 bool MustBeFirstR; 4967 bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR, MRI); 4968 assert(ValidR && "Valid conjunction/disjunction tree"); 4969 (void)ValidR; 4970 4971 // Swap sub-tree that must come first to the right side. 4972 if (MustBeFirstL) { 4973 assert(!MustBeFirstR && "Valid conjunction/disjunction tree"); 4974 std::swap(LHS, RHS); 4975 std::swap(CanNegateL, CanNegateR); 4976 std::swap(MustBeFirstL, MustBeFirstR); 4977 } 4978 4979 bool NegateR; 4980 bool NegateAfterR; 4981 bool NegateL; 4982 bool NegateAfterAll; 4983 if (Opcode == TargetOpcode::G_OR) { 4984 // Swap the sub-tree that we can negate naturally to the left. 4985 if (!CanNegateL) { 4986 assert(CanNegateR && "at least one side must be negatable"); 4987 assert(!MustBeFirstR && "invalid conjunction/disjunction tree"); 4988 assert(!Negate); 4989 std::swap(LHS, RHS); 4990 NegateR = false; 4991 NegateAfterR = true; 4992 } else { 4993 // Negate the left sub-tree if possible, otherwise negate the result. 4994 NegateR = CanNegateR; 4995 NegateAfterR = !CanNegateR; 4996 } 4997 NegateL = true; 4998 NegateAfterAll = !Negate; 4999 } else { 5000 assert(Opcode == TargetOpcode::G_AND && 5001 "Valid conjunction/disjunction tree"); 5002 assert(!Negate && "Valid conjunction/disjunction tree"); 5003 5004 NegateL = false; 5005 NegateR = false; 5006 NegateAfterR = false; 5007 NegateAfterAll = false; 5008 } 5009 5010 // Emit sub-trees. 
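// Illustratively, a simple (G_ICMP a) && (G_ICMP b) tree ends up as roughly:
//   subs/fcmp ...   ; RHS leaf, plain compare (no CCOp yet)
//   ccmp      ...   ; LHS leaf, predicated on the RHS condition
// with OutCC naming the condition the caller finally tests.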
5011 AArch64CC::CondCode RHSCC; 5012 MachineInstr *CmpR = 5013 emitConjunctionRec(RHS, RHSCC, NegateR, CCOp, Predicate, MIB); 5014 if (NegateAfterR) 5015 RHSCC = AArch64CC::getInvertedCondCode(RHSCC); 5016 MachineInstr *CmpL = emitConjunctionRec( 5017 LHS, OutCC, NegateL, CmpR->getOperand(0).getReg(), RHSCC, MIB); 5018 if (NegateAfterAll) 5019 OutCC = AArch64CC::getInvertedCondCode(OutCC); 5020 return CmpL; 5021 } 5022 5023 MachineInstr *AArch64InstructionSelector::emitConjunction( 5024 Register Val, AArch64CC::CondCode &OutCC, MachineIRBuilder &MIB) const { 5025 bool DummyCanNegate; 5026 bool DummyMustBeFirst; 5027 if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false, 5028 *MIB.getMRI())) 5029 return nullptr; 5030 return emitConjunctionRec(Val, OutCC, false, Register(), AArch64CC::AL, MIB); 5031 } 5032 5033 bool AArch64InstructionSelector::tryOptSelectConjunction(GSelect &SelI, 5034 MachineInstr &CondMI) { 5035 AArch64CC::CondCode AArch64CC; 5036 MachineInstr *ConjMI = emitConjunction(SelI.getCondReg(), AArch64CC, MIB); 5037 if (!ConjMI) 5038 return false; 5039 5040 emitSelect(SelI.getReg(0), SelI.getTrueReg(), SelI.getFalseReg(), AArch64CC, MIB); 5041 SelI.eraseFromParent(); 5042 return true; 5043 } 5044 5045 bool AArch64InstructionSelector::tryOptSelect(GSelect &I) { 5046 MachineRegisterInfo &MRI = *MIB.getMRI(); 5047 // We want to recognize this pattern: 5048 // 5049 // $z = G_FCMP pred, $x, $y 5050 // ... 5051 // $w = G_SELECT $z, $a, $b 5052 // 5053 // Where the value of $z is *only* ever used by the G_SELECT (possibly with 5054 // some copies/truncs in between.) 5055 // 5056 // If we see this, then we can emit something like this: 5057 // 5058 // fcmp $x, $y 5059 // fcsel $w, $a, $b, pred 5060 // 5061 // Rather than emitting both of the rather long sequences in the standard 5062 // G_FCMP/G_SELECT select methods. 5063 5064 // First, check if the condition is defined by a compare. 5065 MachineInstr *CondDef = MRI.getVRegDef(I.getOperand(1).getReg()); 5066 5067 // We can only fold if all of the defs have one use. 5068 Register CondDefReg = CondDef->getOperand(0).getReg(); 5069 if (!MRI.hasOneNonDBGUse(CondDefReg)) { 5070 // Unless it's another select. 5071 for (const MachineInstr &UI : MRI.use_nodbg_instructions(CondDefReg)) { 5072 if (CondDef == &UI) 5073 continue; 5074 if (UI.getOpcode() != TargetOpcode::G_SELECT) 5075 return false; 5076 } 5077 } 5078 5079 // Is the condition defined by a compare? 5080 unsigned CondOpc = CondDef->getOpcode(); 5081 if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP) { 5082 if (tryOptSelectConjunction(I, *CondDef)) 5083 return true; 5084 return false; 5085 } 5086 5087 AArch64CC::CondCode CondCode; 5088 if (CondOpc == TargetOpcode::G_ICMP) { 5089 auto Pred = 5090 static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate()); 5091 CondCode = changeICMPPredToAArch64CC(Pred); 5092 emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3), 5093 CondDef->getOperand(1), MIB); 5094 } else { 5095 // Get the condition code for the select. 5096 auto Pred = 5097 static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate()); 5098 AArch64CC::CondCode CondCode2; 5099 changeFCMPPredToAArch64CC(Pred, CondCode, CondCode2); 5100 5101 // changeFCMPPredToAArch64CC sets CondCode2 to AL when we require two 5102 // instructions to emit the comparison. 5103 // TODO: Handle FCMP_UEQ and FCMP_ONE. After that, this check will be 5104 // unnecessary. 
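// In other words: bail out if a second, non-AL condition code is needed,
// since that cannot be expressed with the single fcmp + fcsel sequence this
// optimization wants to emit.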
5105 if (CondCode2 != AArch64CC::AL) 5106 return false; 5107 5108 if (!emitFPCompare(CondDef->getOperand(2).getReg(), 5109 CondDef->getOperand(3).getReg(), MIB)) { 5110 LLVM_DEBUG(dbgs() << "Couldn't emit compare for select!\n"); 5111 return false; 5112 } 5113 } 5114 5115 // Emit the select. 5116 emitSelect(I.getOperand(0).getReg(), I.getOperand(2).getReg(), 5117 I.getOperand(3).getReg(), CondCode, MIB); 5118 I.eraseFromParent(); 5119 return true; 5120 } 5121 5122 MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare( 5123 MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate, 5124 MachineIRBuilder &MIRBuilder) const { 5125 assert(LHS.isReg() && RHS.isReg() && Predicate.isPredicate() && 5126 "Unexpected MachineOperand"); 5127 MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); 5128 // We want to find this sort of thing: 5129 // x = G_SUB 0, y 5130 // G_ICMP z, x 5131 // 5132 // In this case, we can fold the G_SUB into the G_ICMP using a CMN instead. 5133 // e.g: 5134 // 5135 // cmn z, y 5136 5137 // Check if the RHS or LHS of the G_ICMP is defined by a SUB 5138 MachineInstr *LHSDef = getDefIgnoringCopies(LHS.getReg(), MRI); 5139 MachineInstr *RHSDef = getDefIgnoringCopies(RHS.getReg(), MRI); 5140 auto P = static_cast<CmpInst::Predicate>(Predicate.getPredicate()); 5141 // Given this: 5142 // 5143 // x = G_SUB 0, y 5144 // G_ICMP x, z 5145 // 5146 // Produce this: 5147 // 5148 // cmn y, z 5149 if (isCMN(LHSDef, P, MRI)) 5150 return emitCMN(LHSDef->getOperand(2), RHS, MIRBuilder); 5151 5152 // Same idea here, but with the RHS of the compare instead: 5153 // 5154 // Given this: 5155 // 5156 // x = G_SUB 0, y 5157 // G_ICMP z, x 5158 // 5159 // Produce this: 5160 // 5161 // cmn z, y 5162 if (isCMN(RHSDef, P, MRI)) 5163 return emitCMN(LHS, RHSDef->getOperand(2), MIRBuilder); 5164 5165 // Given this: 5166 // 5167 // z = G_AND x, y 5168 // G_ICMP z, 0 5169 // 5170 // Produce this if the compare is signed: 5171 // 5172 // tst x, y 5173 if (!CmpInst::isUnsigned(P) && LHSDef && 5174 LHSDef->getOpcode() == TargetOpcode::G_AND) { 5175 // Make sure that the RHS is 0. 5176 auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS.getReg(), MRI); 5177 if (!ValAndVReg || ValAndVReg->Value != 0) 5178 return nullptr; 5179 5180 return emitTST(LHSDef->getOperand(1), 5181 LHSDef->getOperand(2), MIRBuilder); 5182 } 5183 5184 return nullptr; 5185 } 5186 5187 bool AArch64InstructionSelector::selectShuffleVector( 5188 MachineInstr &I, MachineRegisterInfo &MRI) { 5189 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 5190 Register Src1Reg = I.getOperand(1).getReg(); 5191 const LLT Src1Ty = MRI.getType(Src1Reg); 5192 Register Src2Reg = I.getOperand(2).getReg(); 5193 const LLT Src2Ty = MRI.getType(Src2Reg); 5194 ArrayRef<int> Mask = I.getOperand(3).getShuffleMask(); 5195 5196 MachineBasicBlock &MBB = *I.getParent(); 5197 MachineFunction &MF = *MBB.getParent(); 5198 LLVMContext &Ctx = MF.getFunction().getContext(); 5199 5200 // G_SHUFFLE_VECTOR is weird in that the source operands can be scalars, if 5201 // it's originated from a <1 x T> type. Those should have been lowered into 5202 // G_BUILD_VECTOR earlier. 5203 if (!Src1Ty.isVector() || !Src2Ty.isVector()) { 5204 LLVM_DEBUG(dbgs() << "Could not select a \"scalar\" G_SHUFFLE_VECTOR\n"); 5205 return false; 5206 } 5207 5208 unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8; 5209 5210 SmallVector<Constant *, 64> CstIdxs; 5211 for (int Val : Mask) { 5212 // For now, any undef indexes we'll just assume to be 0. 
This should be 5213 // optimized in future, e.g. to select DUP etc. 5214 Val = Val < 0 ? 0 : Val; 5215 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { 5216 unsigned Offset = Byte + Val * BytesPerElt; 5217 CstIdxs.emplace_back(ConstantInt::get(Type::getInt8Ty(Ctx), Offset)); 5218 } 5219 } 5220 5221 // Use a constant pool to load the index vector for TBL. 5222 Constant *CPVal = ConstantVector::get(CstIdxs); 5223 MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIB); 5224 if (!IndexLoad) { 5225 LLVM_DEBUG(dbgs() << "Could not load from a constant pool"); 5226 return false; 5227 } 5228 5229 if (DstTy.getSizeInBits() != 128) { 5230 assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty"); 5231 // This case can be done with TBL1. 5232 MachineInstr *Concat = 5233 emitVectorConcat(std::nullopt, Src1Reg, Src2Reg, MIB); 5234 if (!Concat) { 5235 LLVM_DEBUG(dbgs() << "Could not do vector concat for tbl1"); 5236 return false; 5237 } 5238 5239 // The constant pool load will be 64 bits, so need to convert to FPR128 reg. 5240 IndexLoad = emitScalarToVector(64, &AArch64::FPR128RegClass, 5241 IndexLoad->getOperand(0).getReg(), MIB); 5242 5243 auto TBL1 = MIB.buildInstr( 5244 AArch64::TBLv16i8One, {&AArch64::FPR128RegClass}, 5245 {Concat->getOperand(0).getReg(), IndexLoad->getOperand(0).getReg()}); 5246 constrainSelectedInstRegOperands(*TBL1, TII, TRI, RBI); 5247 5248 auto Copy = 5249 MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {}) 5250 .addReg(TBL1.getReg(0), 0, AArch64::dsub); 5251 RBI.constrainGenericRegister(Copy.getReg(0), AArch64::FPR64RegClass, MRI); 5252 I.eraseFromParent(); 5253 return true; 5254 } 5255 5256 // For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive 5257 // Q registers for regalloc. 5258 SmallVector<Register, 2> Regs = {Src1Reg, Src2Reg}; 5259 auto RegSeq = createQTuple(Regs, MIB); 5260 auto TBL2 = MIB.buildInstr(AArch64::TBLv16i8Two, {I.getOperand(0)}, 5261 {RegSeq, IndexLoad->getOperand(0)}); 5262 constrainSelectedInstRegOperands(*TBL2, TII, TRI, RBI); 5263 I.eraseFromParent(); 5264 return true; 5265 } 5266 5267 MachineInstr *AArch64InstructionSelector::emitLaneInsert( 5268 std::optional<Register> DstReg, Register SrcReg, Register EltReg, 5269 unsigned LaneIdx, const RegisterBank &RB, 5270 MachineIRBuilder &MIRBuilder) const { 5271 MachineInstr *InsElt = nullptr; 5272 const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass; 5273 MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); 5274 5275 // Create a register to define with the insert if one wasn't passed in. 
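// The element is inserted either from another vector register (FPR case,
// after widening it with emitScalarToVector) or directly from a GPR; the
// INSvi*-style opcode is chosen by getInsertVecEltOpInfo below.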
5276 if (!DstReg) 5277 DstReg = MRI.createVirtualRegister(DstRC); 5278 5279 unsigned EltSize = MRI.getType(EltReg).getSizeInBits(); 5280 unsigned Opc = getInsertVecEltOpInfo(RB, EltSize).first; 5281 5282 if (RB.getID() == AArch64::FPRRegBankID) { 5283 auto InsSub = emitScalarToVector(EltSize, DstRC, EltReg, MIRBuilder); 5284 InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg}) 5285 .addImm(LaneIdx) 5286 .addUse(InsSub->getOperand(0).getReg()) 5287 .addImm(0); 5288 } else { 5289 InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg}) 5290 .addImm(LaneIdx) 5291 .addUse(EltReg); 5292 } 5293 5294 constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI); 5295 return InsElt; 5296 } 5297 5298 bool AArch64InstructionSelector::selectUSMovFromExtend( 5299 MachineInstr &MI, MachineRegisterInfo &MRI) { 5300 if (MI.getOpcode() != TargetOpcode::G_SEXT && 5301 MI.getOpcode() != TargetOpcode::G_ZEXT && 5302 MI.getOpcode() != TargetOpcode::G_ANYEXT) 5303 return false; 5304 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SEXT; 5305 const Register DefReg = MI.getOperand(0).getReg(); 5306 const LLT DstTy = MRI.getType(DefReg); 5307 unsigned DstSize = DstTy.getSizeInBits(); 5308 5309 if (DstSize != 32 && DstSize != 64) 5310 return false; 5311 5312 MachineInstr *Extract = getOpcodeDef(TargetOpcode::G_EXTRACT_VECTOR_ELT, 5313 MI.getOperand(1).getReg(), MRI); 5314 int64_t Lane; 5315 if (!Extract || !mi_match(Extract->getOperand(2).getReg(), MRI, m_ICst(Lane))) 5316 return false; 5317 Register Src0 = Extract->getOperand(1).getReg(); 5318 5319 const LLT VecTy = MRI.getType(Src0); 5320 if (VecTy.isScalableVector()) 5321 return false; 5322 5323 if (VecTy.getSizeInBits() != 128) { 5324 const MachineInstr *ScalarToVector = emitScalarToVector( 5325 VecTy.getSizeInBits(), &AArch64::FPR128RegClass, Src0, MIB); 5326 assert(ScalarToVector && "Didn't expect emitScalarToVector to fail!"); 5327 Src0 = ScalarToVector->getOperand(0).getReg(); 5328 } 5329 5330 unsigned Opcode; 5331 if (DstSize == 64 && VecTy.getScalarSizeInBits() == 32) 5332 Opcode = IsSigned ? AArch64::SMOVvi32to64 : AArch64::UMOVvi32; 5333 else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 16) 5334 Opcode = IsSigned ? AArch64::SMOVvi16to64 : AArch64::UMOVvi16; 5335 else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 8) 5336 Opcode = IsSigned ? AArch64::SMOVvi8to64 : AArch64::UMOVvi8; 5337 else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 16) 5338 Opcode = IsSigned ? AArch64::SMOVvi16to32 : AArch64::UMOVvi16; 5339 else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 8) 5340 Opcode = IsSigned ? 
AArch64::SMOVvi8to32 : AArch64::UMOVvi8; 5341 else 5342 llvm_unreachable("Unexpected type combo for S/UMov!"); 5343 5344 // We may need to generate one of these, depending on the type and sign of the 5345 // input: 5346 // DstReg = SMOV Src0, Lane; 5347 // NewReg = UMOV Src0, Lane; DstReg = SUBREG_TO_REG NewReg, sub_32; 5348 MachineInstr *ExtI = nullptr; 5349 if (DstSize == 64 && !IsSigned) { 5350 Register NewReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); 5351 MIB.buildInstr(Opcode, {NewReg}, {Src0}).addImm(Lane); 5352 ExtI = MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {}) 5353 .addImm(0) 5354 .addUse(NewReg) 5355 .addImm(AArch64::sub_32); 5356 RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI); 5357 } else 5358 ExtI = MIB.buildInstr(Opcode, {DefReg}, {Src0}).addImm(Lane); 5359 5360 constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 5361 MI.eraseFromParent(); 5362 return true; 5363 } 5364 5365 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm8( 5366 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder) { 5367 unsigned int Op; 5368 if (DstSize == 128) { 5369 if (Bits.getHiBits(64) != Bits.getLoBits(64)) 5370 return nullptr; 5371 Op = AArch64::MOVIv16b_ns; 5372 } else { 5373 Op = AArch64::MOVIv8b_ns; 5374 } 5375 5376 uint64_t Val = Bits.zextOrTrunc(64).getZExtValue(); 5377 5378 if (AArch64_AM::isAdvSIMDModImmType9(Val)) { 5379 Val = AArch64_AM::encodeAdvSIMDModImmType9(Val); 5380 auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val); 5381 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI); 5382 return &*Mov; 5383 } 5384 return nullptr; 5385 } 5386 5387 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm16( 5388 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder, 5389 bool Inv) { 5390 5391 unsigned int Op; 5392 if (DstSize == 128) { 5393 if (Bits.getHiBits(64) != Bits.getLoBits(64)) 5394 return nullptr; 5395 Op = Inv ? AArch64::MVNIv8i16 : AArch64::MOVIv8i16; 5396 } else { 5397 Op = Inv ? AArch64::MVNIv4i16 : AArch64::MOVIv4i16; 5398 } 5399 5400 uint64_t Val = Bits.zextOrTrunc(64).getZExtValue(); 5401 uint64_t Shift; 5402 5403 if (AArch64_AM::isAdvSIMDModImmType5(Val)) { 5404 Val = AArch64_AM::encodeAdvSIMDModImmType5(Val); 5405 Shift = 0; 5406 } else if (AArch64_AM::isAdvSIMDModImmType6(Val)) { 5407 Val = AArch64_AM::encodeAdvSIMDModImmType6(Val); 5408 Shift = 8; 5409 } else 5410 return nullptr; 5411 5412 auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val).addImm(Shift); 5413 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI); 5414 return &*Mov; 5415 } 5416 5417 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm32( 5418 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder, 5419 bool Inv) { 5420 5421 unsigned int Op; 5422 if (DstSize == 128) { 5423 if (Bits.getHiBits(64) != Bits.getLoBits(64)) 5424 return nullptr; 5425 Op = Inv ? AArch64::MVNIv4i32 : AArch64::MOVIv4i32; 5426 } else { 5427 Op = Inv ? 
AArch64::MVNIv2i32 : AArch64::MOVIv2i32; 5428 } 5429 5430 uint64_t Val = Bits.zextOrTrunc(64).getZExtValue(); 5431 uint64_t Shift; 5432 5433 if ((AArch64_AM::isAdvSIMDModImmType1(Val))) { 5434 Val = AArch64_AM::encodeAdvSIMDModImmType1(Val); 5435 Shift = 0; 5436 } else if ((AArch64_AM::isAdvSIMDModImmType2(Val))) { 5437 Val = AArch64_AM::encodeAdvSIMDModImmType2(Val); 5438 Shift = 8; 5439 } else if ((AArch64_AM::isAdvSIMDModImmType3(Val))) { 5440 Val = AArch64_AM::encodeAdvSIMDModImmType3(Val); 5441 Shift = 16; 5442 } else if ((AArch64_AM::isAdvSIMDModImmType4(Val))) { 5443 Val = AArch64_AM::encodeAdvSIMDModImmType4(Val); 5444 Shift = 24; 5445 } else 5446 return nullptr; 5447 5448 auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val).addImm(Shift); 5449 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI); 5450 return &*Mov; 5451 } 5452 5453 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm64( 5454 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder) { 5455 5456 unsigned int Op; 5457 if (DstSize == 128) { 5458 if (Bits.getHiBits(64) != Bits.getLoBits(64)) 5459 return nullptr; 5460 Op = AArch64::MOVIv2d_ns; 5461 } else { 5462 Op = AArch64::MOVID; 5463 } 5464 5465 uint64_t Val = Bits.zextOrTrunc(64).getZExtValue(); 5466 if (AArch64_AM::isAdvSIMDModImmType10(Val)) { 5467 Val = AArch64_AM::encodeAdvSIMDModImmType10(Val); 5468 auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val); 5469 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI); 5470 return &*Mov; 5471 } 5472 return nullptr; 5473 } 5474 5475 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm321s( 5476 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder, 5477 bool Inv) { 5478 5479 unsigned int Op; 5480 if (DstSize == 128) { 5481 if (Bits.getHiBits(64) != Bits.getLoBits(64)) 5482 return nullptr; 5483 Op = Inv ? AArch64::MVNIv4s_msl : AArch64::MOVIv4s_msl; 5484 } else { 5485 Op = Inv ? 
AArch64::MVNIv2s_msl : AArch64::MOVIv2s_msl; 5486 } 5487 5488 uint64_t Val = Bits.zextOrTrunc(64).getZExtValue(); 5489 uint64_t Shift; 5490 5491 if (AArch64_AM::isAdvSIMDModImmType7(Val)) { 5492 Val = AArch64_AM::encodeAdvSIMDModImmType7(Val); 5493 Shift = 264; 5494 } else if (AArch64_AM::isAdvSIMDModImmType8(Val)) { 5495 Val = AArch64_AM::encodeAdvSIMDModImmType8(Val); 5496 Shift = 272; 5497 } else 5498 return nullptr; 5499 5500 auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val).addImm(Shift); 5501 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI); 5502 return &*Mov; 5503 } 5504 5505 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImmFP( 5506 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder) { 5507 5508 unsigned int Op; 5509 bool IsWide = false; 5510 if (DstSize == 128) { 5511 if (Bits.getHiBits(64) != Bits.getLoBits(64)) 5512 return nullptr; 5513 Op = AArch64::FMOVv4f32_ns; 5514 IsWide = true; 5515 } else { 5516 Op = AArch64::FMOVv2f32_ns; 5517 } 5518 5519 uint64_t Val = Bits.zextOrTrunc(64).getZExtValue(); 5520 5521 if (AArch64_AM::isAdvSIMDModImmType11(Val)) { 5522 Val = AArch64_AM::encodeAdvSIMDModImmType11(Val); 5523 } else if (IsWide && AArch64_AM::isAdvSIMDModImmType12(Val)) { 5524 Val = AArch64_AM::encodeAdvSIMDModImmType12(Val); 5525 Op = AArch64::FMOVv2f64_ns; 5526 } else 5527 return nullptr; 5528 5529 auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val); 5530 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI); 5531 return &*Mov; 5532 } 5533 5534 bool AArch64InstructionSelector::selectIndexedExtLoad( 5535 MachineInstr &MI, MachineRegisterInfo &MRI) { 5536 auto &ExtLd = cast<GIndexedAnyExtLoad>(MI); 5537 Register Dst = ExtLd.getDstReg(); 5538 Register WriteBack = ExtLd.getWritebackReg(); 5539 Register Base = ExtLd.getBaseReg(); 5540 Register Offset = ExtLd.getOffsetReg(); 5541 LLT Ty = MRI.getType(Dst); 5542 assert(Ty.getSizeInBits() <= 64); // Only for scalar GPRs. 5543 unsigned MemSizeBits = ExtLd.getMMO().getMemoryType().getSizeInBits(); 5544 bool IsPre = ExtLd.isPre(); 5545 bool IsSExt = isa<GIndexedSExtLoad>(ExtLd); 5546 bool InsertIntoXReg = false; 5547 bool IsDst64 = Ty.getSizeInBits() == 64; 5548 5549 unsigned Opc = 0; 5550 LLT NewLdDstTy; 5551 LLT s32 = LLT::scalar(32); 5552 LLT s64 = LLT::scalar(64); 5553 5554 if (MemSizeBits == 8) { 5555 if (IsSExt) { 5556 if (IsDst64) 5557 Opc = IsPre ? AArch64::LDRSBXpre : AArch64::LDRSBXpost; 5558 else 5559 Opc = IsPre ? AArch64::LDRSBWpre : AArch64::LDRSBWpost; 5560 NewLdDstTy = IsDst64 ? s64 : s32; 5561 } else { 5562 Opc = IsPre ? AArch64::LDRBBpre : AArch64::LDRBBpost; 5563 InsertIntoXReg = IsDst64; 5564 NewLdDstTy = s32; 5565 } 5566 } else if (MemSizeBits == 16) { 5567 if (IsSExt) { 5568 if (IsDst64) 5569 Opc = IsPre ? AArch64::LDRSHXpre : AArch64::LDRSHXpost; 5570 else 5571 Opc = IsPre ? AArch64::LDRSHWpre : AArch64::LDRSHWpost; 5572 NewLdDstTy = IsDst64 ? s64 : s32; 5573 } else { 5574 Opc = IsPre ? AArch64::LDRHHpre : AArch64::LDRHHpost; 5575 InsertIntoXReg = IsDst64; 5576 NewLdDstTy = s32; 5577 } 5578 } else if (MemSizeBits == 32) { 5579 if (IsSExt) { 5580 Opc = IsPre ? AArch64::LDRSWpre : AArch64::LDRSWpost; 5581 NewLdDstTy = s64; 5582 } else { 5583 Opc = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost; 5584 InsertIntoXReg = IsDst64; 5585 NewLdDstTy = s32; 5586 } 5587 } else { 5588 llvm_unreachable("Unexpected size for indexed load"); 5589 } 5590 5591 if (RBI.getRegBank(Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID) 5592 return false; // We should be on gpr. 
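// e.g. a post-indexed zero-extending byte load becomes something like
//   ldrb w0, [x1], #1
// followed by a SUBREG_TO_REG when the destination is 64 bits wide
// (InsertIntoXReg); the sign-extending variants use the LDRS* opcodes above.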
5593 5594 auto Cst = getIConstantVRegVal(Offset, MRI); 5595 if (!Cst) 5596 return false; // Shouldn't happen, but just in case. 5597 5598 auto LdMI = MIB.buildInstr(Opc, {WriteBack, NewLdDstTy}, {Base}) 5599 .addImm(Cst->getSExtValue()); 5600 LdMI.cloneMemRefs(ExtLd); 5601 constrainSelectedInstRegOperands(*LdMI, TII, TRI, RBI); 5602 // Make sure to select the load with the MemTy as the dest type, and then 5603 // insert into X reg if needed. 5604 if (InsertIntoXReg) { 5605 // Generate a SUBREG_TO_REG. 5606 auto SubToReg = MIB.buildInstr(TargetOpcode::SUBREG_TO_REG, {Dst}, {}) 5607 .addImm(0) 5608 .addUse(LdMI.getReg(1)) 5609 .addImm(AArch64::sub_32); 5610 RBI.constrainGenericRegister(SubToReg.getReg(0), AArch64::GPR64RegClass, 5611 MRI); 5612 } else { 5613 auto Copy = MIB.buildCopy(Dst, LdMI.getReg(1)); 5614 selectCopy(*Copy, TII, MRI, TRI, RBI); 5615 } 5616 MI.eraseFromParent(); 5617 5618 return true; 5619 } 5620 5621 bool AArch64InstructionSelector::selectIndexedLoad(MachineInstr &MI, 5622 MachineRegisterInfo &MRI) { 5623 auto &Ld = cast<GIndexedLoad>(MI); 5624 Register Dst = Ld.getDstReg(); 5625 Register WriteBack = Ld.getWritebackReg(); 5626 Register Base = Ld.getBaseReg(); 5627 Register Offset = Ld.getOffsetReg(); 5628 assert(MRI.getType(Dst).getSizeInBits() <= 128 && 5629 "Unexpected type for indexed load"); 5630 unsigned MemSize = Ld.getMMO().getMemoryType().getSizeInBytes(); 5631 5632 if (MemSize < MRI.getType(Dst).getSizeInBytes()) 5633 return selectIndexedExtLoad(MI, MRI); 5634 5635 unsigned Opc = 0; 5636 if (Ld.isPre()) { 5637 static constexpr unsigned GPROpcodes[] = { 5638 AArch64::LDRBBpre, AArch64::LDRHHpre, AArch64::LDRWpre, 5639 AArch64::LDRXpre}; 5640 static constexpr unsigned FPROpcodes[] = { 5641 AArch64::LDRBpre, AArch64::LDRHpre, AArch64::LDRSpre, AArch64::LDRDpre, 5642 AArch64::LDRQpre}; 5643 if (RBI.getRegBank(Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID) 5644 Opc = FPROpcodes[Log2_32(MemSize)]; 5645 else 5646 Opc = GPROpcodes[Log2_32(MemSize)]; 5647 } else { 5648 static constexpr unsigned GPROpcodes[] = { 5649 AArch64::LDRBBpost, AArch64::LDRHHpost, AArch64::LDRWpost, 5650 AArch64::LDRXpost}; 5651 static constexpr unsigned FPROpcodes[] = { 5652 AArch64::LDRBpost, AArch64::LDRHpost, AArch64::LDRSpost, 5653 AArch64::LDRDpost, AArch64::LDRQpost}; 5654 if (RBI.getRegBank(Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID) 5655 Opc = FPROpcodes[Log2_32(MemSize)]; 5656 else 5657 Opc = GPROpcodes[Log2_32(MemSize)]; 5658 } 5659 auto Cst = getIConstantVRegVal(Offset, MRI); 5660 if (!Cst) 5661 return false; // Shouldn't happen, but just in case. 
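// The selected instruction defines both the loaded value and the updated
// base, e.g. (pre-indexed):  ldr x0, [x1, #16]!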
5662 auto LdMI = 5663 MIB.buildInstr(Opc, {WriteBack, Dst}, {Base}).addImm(Cst->getSExtValue()); 5664 LdMI.cloneMemRefs(Ld); 5665 constrainSelectedInstRegOperands(*LdMI, TII, TRI, RBI); 5666 MI.eraseFromParent(); 5667 return true; 5668 } 5669 5670 bool AArch64InstructionSelector::selectIndexedStore(GIndexedStore &I, 5671 MachineRegisterInfo &MRI) { 5672 Register Dst = I.getWritebackReg(); 5673 Register Val = I.getValueReg(); 5674 Register Base = I.getBaseReg(); 5675 Register Offset = I.getOffsetReg(); 5676 LLT ValTy = MRI.getType(Val); 5677 assert(ValTy.getSizeInBits() <= 128 && "Unexpected type for indexed store"); 5678 5679 unsigned Opc = 0; 5680 if (I.isPre()) { 5681 static constexpr unsigned GPROpcodes[] = { 5682 AArch64::STRBBpre, AArch64::STRHHpre, AArch64::STRWpre, 5683 AArch64::STRXpre}; 5684 static constexpr unsigned FPROpcodes[] = { 5685 AArch64::STRBpre, AArch64::STRHpre, AArch64::STRSpre, AArch64::STRDpre, 5686 AArch64::STRQpre}; 5687 5688 if (RBI.getRegBank(Val, MRI, TRI)->getID() == AArch64::FPRRegBankID) 5689 Opc = FPROpcodes[Log2_32(ValTy.getSizeInBytes())]; 5690 else 5691 Opc = GPROpcodes[Log2_32(ValTy.getSizeInBytes())]; 5692 } else { 5693 static constexpr unsigned GPROpcodes[] = { 5694 AArch64::STRBBpost, AArch64::STRHHpost, AArch64::STRWpost, 5695 AArch64::STRXpost}; 5696 static constexpr unsigned FPROpcodes[] = { 5697 AArch64::STRBpost, AArch64::STRHpost, AArch64::STRSpost, 5698 AArch64::STRDpost, AArch64::STRQpost}; 5699 5700 if (RBI.getRegBank(Val, MRI, TRI)->getID() == AArch64::FPRRegBankID) 5701 Opc = FPROpcodes[Log2_32(ValTy.getSizeInBytes())]; 5702 else 5703 Opc = GPROpcodes[Log2_32(ValTy.getSizeInBytes())]; 5704 } 5705 5706 auto Cst = getIConstantVRegVal(Offset, MRI); 5707 if (!Cst) 5708 return false; // Shouldn't happen, but just in case. 
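// As with the indexed loads above, the writeback register is defined by the
// store itself, e.g. (post-indexed):  str w0, [x1], #4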
5709 auto Str = 5710 MIB.buildInstr(Opc, {Dst}, {Val, Base}).addImm(Cst->getSExtValue()); 5711 Str.cloneMemRefs(I); 5712 constrainSelectedInstRegOperands(*Str, TII, TRI, RBI); 5713 I.eraseFromParent(); 5714 return true; 5715 } 5716 5717 MachineInstr * 5718 AArch64InstructionSelector::emitConstantVector(Register Dst, Constant *CV, 5719 MachineIRBuilder &MIRBuilder, 5720 MachineRegisterInfo &MRI) { 5721 LLT DstTy = MRI.getType(Dst); 5722 unsigned DstSize = DstTy.getSizeInBits(); 5723 if (CV->isNullValue()) { 5724 if (DstSize == 128) { 5725 auto Mov = 5726 MIRBuilder.buildInstr(AArch64::MOVIv2d_ns, {Dst}, {}).addImm(0); 5727 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI); 5728 return &*Mov; 5729 } 5730 5731 if (DstSize == 64) { 5732 auto Mov = 5733 MIRBuilder 5734 .buildInstr(AArch64::MOVIv2d_ns, {&AArch64::FPR128RegClass}, {}) 5735 .addImm(0); 5736 auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {Dst}, {}) 5737 .addReg(Mov.getReg(0), 0, AArch64::dsub); 5738 RBI.constrainGenericRegister(Dst, AArch64::FPR64RegClass, MRI); 5739 return &*Copy; 5740 } 5741 } 5742 5743 if (CV->getSplatValue()) { 5744 APInt DefBits = APInt::getSplat( 5745 DstSize, CV->getUniqueInteger().trunc(DstTy.getScalarSizeInBits())); 5746 auto TryMOVIWithBits = [&](APInt DefBits) -> MachineInstr * { 5747 MachineInstr *NewOp; 5748 bool Inv = false; 5749 if ((NewOp = tryAdvSIMDModImm64(Dst, DstSize, DefBits, MIRBuilder)) || 5750 (NewOp = 5751 tryAdvSIMDModImm32(Dst, DstSize, DefBits, MIRBuilder, Inv)) || 5752 (NewOp = 5753 tryAdvSIMDModImm321s(Dst, DstSize, DefBits, MIRBuilder, Inv)) || 5754 (NewOp = 5755 tryAdvSIMDModImm16(Dst, DstSize, DefBits, MIRBuilder, Inv)) || 5756 (NewOp = tryAdvSIMDModImm8(Dst, DstSize, DefBits, MIRBuilder)) || 5757 (NewOp = tryAdvSIMDModImmFP(Dst, DstSize, DefBits, MIRBuilder))) 5758 return NewOp; 5759 5760 DefBits = ~DefBits; 5761 Inv = true; 5762 if ((NewOp = 5763 tryAdvSIMDModImm32(Dst, DstSize, DefBits, MIRBuilder, Inv)) || 5764 (NewOp = 5765 tryAdvSIMDModImm321s(Dst, DstSize, DefBits, MIRBuilder, Inv)) || 5766 (NewOp = tryAdvSIMDModImm16(Dst, DstSize, DefBits, MIRBuilder, Inv))) 5767 return NewOp; 5768 return nullptr; 5769 }; 5770 5771 if (auto *NewOp = TryMOVIWithBits(DefBits)) 5772 return NewOp; 5773 5774 // See if a fneg of the constant can be materialized with a MOVI, etc 5775 auto TryWithFNeg = [&](APInt DefBits, int NumBits, 5776 unsigned NegOpc) -> MachineInstr * { 5777 // FNegate each sub-element of the constant 5778 APInt Neg = APInt::getHighBitsSet(NumBits, 1).zext(DstSize); 5779 APInt NegBits(DstSize, 0); 5780 unsigned NumElts = DstSize / NumBits; 5781 for (unsigned i = 0; i < NumElts; i++) 5782 NegBits |= Neg << (NumBits * i); 5783 NegBits = DefBits ^ NegBits; 5784 5785 // Try to create the new constants with MOVI, and if so generate a fneg 5786 // for it. 
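// For instance, a lane pattern of 0x80000001 is not a valid MOVI/MVNI/FMOV
// immediate, but with the sign bits flipped 0x00000001 is, so we can emit
// MOVI #1 and then FNEG the result to recover the original constant.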
5787 if (auto *NewOp = TryMOVIWithBits(NegBits)) { 5788 Register NewDst = MRI.createVirtualRegister(&AArch64::FPR128RegClass); 5789 NewOp->getOperand(0).setReg(NewDst); 5790 return MIRBuilder.buildInstr(NegOpc, {Dst}, {NewDst}); 5791 } 5792 return nullptr; 5793 }; 5794 MachineInstr *R; 5795 if ((R = TryWithFNeg(DefBits, 32, AArch64::FNEGv4f32)) || 5796 (R = TryWithFNeg(DefBits, 64, AArch64::FNEGv2f64)) || 5797 (STI.hasFullFP16() && 5798 (R = TryWithFNeg(DefBits, 16, AArch64::FNEGv8f16)))) 5799 return R; 5800 } 5801 5802 auto *CPLoad = emitLoadFromConstantPool(CV, MIRBuilder); 5803 if (!CPLoad) { 5804 LLVM_DEBUG(dbgs() << "Could not generate cp load for constant vector!"); 5805 return nullptr; 5806 } 5807 5808 auto Copy = MIRBuilder.buildCopy(Dst, CPLoad->getOperand(0)); 5809 RBI.constrainGenericRegister( 5810 Dst, *MRI.getRegClass(CPLoad->getOperand(0).getReg()), MRI); 5811 return &*Copy; 5812 } 5813 5814 bool AArch64InstructionSelector::tryOptConstantBuildVec( 5815 MachineInstr &I, LLT DstTy, MachineRegisterInfo &MRI) { 5816 assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR); 5817 unsigned DstSize = DstTy.getSizeInBits(); 5818 assert(DstSize <= 128 && "Unexpected build_vec type!"); 5819 if (DstSize < 32) 5820 return false; 5821 // Check if we're building a constant vector, in which case we want to 5822 // generate a constant pool load instead of a vector insert sequence. 5823 SmallVector<Constant *, 16> Csts; 5824 for (unsigned Idx = 1; Idx < I.getNumOperands(); ++Idx) { 5825 // Try to find G_CONSTANT or G_FCONSTANT 5826 auto *OpMI = 5827 getOpcodeDef(TargetOpcode::G_CONSTANT, I.getOperand(Idx).getReg(), MRI); 5828 if (OpMI) 5829 Csts.emplace_back( 5830 const_cast<ConstantInt *>(OpMI->getOperand(1).getCImm())); 5831 else if ((OpMI = getOpcodeDef(TargetOpcode::G_FCONSTANT, 5832 I.getOperand(Idx).getReg(), MRI))) 5833 Csts.emplace_back( 5834 const_cast<ConstantFP *>(OpMI->getOperand(1).getFPImm())); 5835 else 5836 return false; 5837 } 5838 Constant *CV = ConstantVector::get(Csts); 5839 if (!emitConstantVector(I.getOperand(0).getReg(), CV, MIB, MRI)) 5840 return false; 5841 I.eraseFromParent(); 5842 return true; 5843 } 5844 5845 bool AArch64InstructionSelector::tryOptBuildVecToSubregToReg( 5846 MachineInstr &I, MachineRegisterInfo &MRI) { 5847 // Given: 5848 // %vec = G_BUILD_VECTOR %elt, %undef, %undef, ... %undef 5849 // 5850 // Select the G_BUILD_VECTOR as a SUBREG_TO_REG from %elt. 5851 Register Dst = I.getOperand(0).getReg(); 5852 Register EltReg = I.getOperand(1).getReg(); 5853 LLT EltTy = MRI.getType(EltReg); 5854 // If the index isn't on the same bank as its elements, then this can't be a 5855 // SUBREG_TO_REG. 
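// When the banks do match, the end result is e.g. (illustrative):
//   %vec:fpr(<2 x s32>) = G_BUILD_VECTOR %elt(s32), %undef(s32)
//   ==> %vec = SUBREG_TO_REG 0, %elt, %subreg.ssub   (FPR64 <- FPR32)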
5856 const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI); 5857 const RegisterBank &DstRB = *RBI.getRegBank(Dst, MRI, TRI); 5858 if (EltRB != DstRB) 5859 return false; 5860 if (any_of(drop_begin(I.operands(), 2), [&MRI](const MachineOperand &Op) { 5861 return !getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, Op.getReg(), MRI); 5862 })) 5863 return false; 5864 unsigned SubReg; 5865 const TargetRegisterClass *EltRC = getRegClassForTypeOnBank(EltTy, EltRB); 5866 if (!EltRC) 5867 return false; 5868 const TargetRegisterClass *DstRC = 5869 getRegClassForTypeOnBank(MRI.getType(Dst), DstRB); 5870 if (!DstRC) 5871 return false; 5872 if (!getSubRegForClass(EltRC, TRI, SubReg)) 5873 return false; 5874 auto SubregToReg = MIB.buildInstr(AArch64::SUBREG_TO_REG, {Dst}, {}) 5875 .addImm(0) 5876 .addUse(EltReg) 5877 .addImm(SubReg); 5878 I.eraseFromParent(); 5879 constrainSelectedInstRegOperands(*SubregToReg, TII, TRI, RBI); 5880 return RBI.constrainGenericRegister(Dst, *DstRC, MRI); 5881 } 5882 5883 bool AArch64InstructionSelector::selectBuildVector(MachineInstr &I, 5884 MachineRegisterInfo &MRI) { 5885 assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR); 5886 // Until we port more of the optimized selections, for now just use a vector 5887 // insert sequence. 5888 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); 5889 const LLT EltTy = MRI.getType(I.getOperand(1).getReg()); 5890 unsigned EltSize = EltTy.getSizeInBits(); 5891 5892 if (tryOptConstantBuildVec(I, DstTy, MRI)) 5893 return true; 5894 if (tryOptBuildVecToSubregToReg(I, MRI)) 5895 return true; 5896 5897 if (EltSize != 8 && EltSize != 16 && EltSize != 32 && EltSize != 64) 5898 return false; // Don't support all element types yet. 5899 const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI); 5900 5901 const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass; 5902 MachineInstr *ScalarToVec = 5903 emitScalarToVector(DstTy.getElementType().getSizeInBits(), DstRC, 5904 I.getOperand(1).getReg(), MIB); 5905 if (!ScalarToVec) 5906 return false; 5907 5908 Register DstVec = ScalarToVec->getOperand(0).getReg(); 5909 unsigned DstSize = DstTy.getSizeInBits(); 5910 5911 // Keep track of the last MI we inserted. Later on, we might be able to save 5912 // a copy using it. 5913 MachineInstr *PrevMI = ScalarToVec; 5914 for (unsigned i = 2, e = DstSize / EltSize + 1; i < e; ++i) { 5915 // Note that if we don't do a subregister copy, we can end up making an 5916 // extra register. 5917 Register OpReg = I.getOperand(i).getReg(); 5918 // Do not emit inserts for undefs 5919 if (!getOpcodeDef<GImplicitDef>(OpReg, MRI)) { 5920 PrevMI = &*emitLaneInsert(std::nullopt, DstVec, OpReg, i - 1, RB, MIB); 5921 DstVec = PrevMI->getOperand(0).getReg(); 5922 } 5923 } 5924 5925 // If DstTy's size in bits is less than 128, then emit a subregister copy 5926 // from DstVec to the last register we've defined. 5927 if (DstSize < 128) { 5928 // Force this to be FPR using the destination vector. 5929 const TargetRegisterClass *RC = 5930 getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(DstVec, MRI, TRI)); 5931 if (!RC) 5932 return false; 5933 if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) { 5934 LLVM_DEBUG(dbgs() << "Unsupported register class!\n"); 5935 return false; 5936 } 5937 5938 unsigned SubReg = 0; 5939 if (!getSubRegForClass(RC, TRI, SubReg)) 5940 return false; 5941 if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) { 5942 LLVM_DEBUG(dbgs() << "Unsupported destination size! 
(" << DstSize 5943 << "\n"); 5944 return false; 5945 } 5946 5947 Register Reg = MRI.createVirtualRegister(RC); 5948 Register DstReg = I.getOperand(0).getReg(); 5949 5950 MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}).addReg(DstVec, 0, SubReg); 5951 MachineOperand &RegOp = I.getOperand(1); 5952 RegOp.setReg(Reg); 5953 RBI.constrainGenericRegister(DstReg, *RC, MRI); 5954 } else { 5955 // We either have a vector with all elements (except the first one) undef or 5956 // at least one non-undef non-first element. In the first case, we need to 5957 // constrain the output register ourselves as we may have generated an 5958 // INSERT_SUBREG operation which is a generic operation for which the 5959 // output regclass cannot be automatically chosen. 5960 // 5961 // In the second case, there is no need to do this as it may generate an 5962 // instruction like INSvi32gpr where the regclass can be automatically 5963 // chosen. 5964 // 5965 // Also, we save a copy by re-using the destination register on the final 5966 // insert. 5967 PrevMI->getOperand(0).setReg(I.getOperand(0).getReg()); 5968 constrainSelectedInstRegOperands(*PrevMI, TII, TRI, RBI); 5969 5970 Register DstReg = PrevMI->getOperand(0).getReg(); 5971 if (PrevMI == ScalarToVec && DstReg.isVirtual()) { 5972 const TargetRegisterClass *RC = 5973 getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(DstVec, MRI, TRI)); 5974 RBI.constrainGenericRegister(DstReg, *RC, MRI); 5975 } 5976 } 5977 5978 I.eraseFromParent(); 5979 return true; 5980 } 5981 5982 bool AArch64InstructionSelector::selectVectorLoadIntrinsic(unsigned Opc, 5983 unsigned NumVecs, 5984 MachineInstr &I) { 5985 assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS); 5986 assert(Opc && "Expected an opcode?"); 5987 assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors"); 5988 auto &MRI = *MIB.getMRI(); 5989 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 5990 unsigned Size = Ty.getSizeInBits(); 5991 assert((Size == 64 || Size == 128) && 5992 "Destination must be 64 bits or 128 bits?"); 5993 unsigned SubReg = Size == 64 ? AArch64::dsub0 : AArch64::qsub0; 5994 auto Ptr = I.getOperand(I.getNumOperands() - 1).getReg(); 5995 assert(MRI.getType(Ptr).isPointer() && "Expected a pointer type?"); 5996 auto Load = MIB.buildInstr(Opc, {Ty}, {Ptr}); 5997 Load.cloneMemRefs(I); 5998 constrainSelectedInstRegOperands(*Load, TII, TRI, RBI); 5999 Register SelectedLoadDst = Load->getOperand(0).getReg(); 6000 for (unsigned Idx = 0; Idx < NumVecs; ++Idx) { 6001 auto Vec = MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(Idx)}, {}) 6002 .addReg(SelectedLoadDst, 0, SubReg + Idx); 6003 // Emit the subreg copies and immediately select them. 6004 // FIXME: We should refactor our copy code into an emitCopy helper and 6005 // clean up uses of this pattern elsewhere in the selector. 
6006 selectCopy(*Vec, TII, MRI, TRI, RBI); 6007 } 6008 return true; 6009 } 6010 6011 bool AArch64InstructionSelector::selectVectorLoadLaneIntrinsic( 6012 unsigned Opc, unsigned NumVecs, MachineInstr &I) { 6013 assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS); 6014 assert(Opc && "Expected an opcode?"); 6015 assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors"); 6016 auto &MRI = *MIB.getMRI(); 6017 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 6018 bool Narrow = Ty.getSizeInBits() == 64; 6019 6020 auto FirstSrcRegIt = I.operands_begin() + NumVecs + 1; 6021 SmallVector<Register, 4> Regs(NumVecs); 6022 std::transform(FirstSrcRegIt, FirstSrcRegIt + NumVecs, Regs.begin(), 6023 [](auto MO) { return MO.getReg(); }); 6024 6025 if (Narrow) { 6026 transform(Regs, Regs.begin(), [this](Register Reg) { 6027 return emitScalarToVector(64, &AArch64::FPR128RegClass, Reg, MIB) 6028 ->getOperand(0) 6029 .getReg(); 6030 }); 6031 Ty = Ty.multiplyElements(2); 6032 } 6033 6034 Register Tuple = createQTuple(Regs, MIB); 6035 auto LaneNo = getIConstantVRegVal((FirstSrcRegIt + NumVecs)->getReg(), MRI); 6036 if (!LaneNo) 6037 return false; 6038 6039 Register Ptr = (FirstSrcRegIt + NumVecs + 1)->getReg(); 6040 auto Load = MIB.buildInstr(Opc, {Ty}, {}) 6041 .addReg(Tuple) 6042 .addImm(LaneNo->getZExtValue()) 6043 .addReg(Ptr); 6044 Load.cloneMemRefs(I); 6045 constrainSelectedInstRegOperands(*Load, TII, TRI, RBI); 6046 Register SelectedLoadDst = Load->getOperand(0).getReg(); 6047 unsigned SubReg = AArch64::qsub0; 6048 for (unsigned Idx = 0; Idx < NumVecs; ++Idx) { 6049 auto Vec = MIB.buildInstr(TargetOpcode::COPY, 6050 {Narrow ? DstOp(&AArch64::FPR128RegClass) 6051 : DstOp(I.getOperand(Idx).getReg())}, 6052 {}) 6053 .addReg(SelectedLoadDst, 0, SubReg + Idx); 6054 Register WideReg = Vec.getReg(0); 6055 // Emit the subreg copies and immediately select them. 6056 selectCopy(*Vec, TII, MRI, TRI, RBI); 6057 if (Narrow && 6058 !emitNarrowVector(I.getOperand(Idx).getReg(), WideReg, MIB, MRI)) 6059 return false; 6060 } 6061 return true; 6062 } 6063 6064 void AArch64InstructionSelector::selectVectorStoreIntrinsic(MachineInstr &I, 6065 unsigned NumVecs, 6066 unsigned Opc) { 6067 MachineRegisterInfo &MRI = I.getParent()->getParent()->getRegInfo(); 6068 LLT Ty = MRI.getType(I.getOperand(1).getReg()); 6069 Register Ptr = I.getOperand(1 + NumVecs).getReg(); 6070 6071 SmallVector<Register, 2> Regs(NumVecs); 6072 std::transform(I.operands_begin() + 1, I.operands_begin() + 1 + NumVecs, 6073 Regs.begin(), [](auto MO) { return MO.getReg(); }); 6074 6075 Register Tuple = Ty.getSizeInBits() == 128 ? 
createQTuple(Regs, MIB) 6076 : createDTuple(Regs, MIB); 6077 auto Store = MIB.buildInstr(Opc, {}, {Tuple, Ptr}); 6078 Store.cloneMemRefs(I); 6079 constrainSelectedInstRegOperands(*Store, TII, TRI, RBI); 6080 } 6081 6082 bool AArch64InstructionSelector::selectVectorStoreLaneIntrinsic( 6083 MachineInstr &I, unsigned NumVecs, unsigned Opc) { 6084 MachineRegisterInfo &MRI = I.getParent()->getParent()->getRegInfo(); 6085 LLT Ty = MRI.getType(I.getOperand(1).getReg()); 6086 bool Narrow = Ty.getSizeInBits() == 64; 6087 6088 SmallVector<Register, 2> Regs(NumVecs); 6089 std::transform(I.operands_begin() + 1, I.operands_begin() + 1 + NumVecs, 6090 Regs.begin(), [](auto MO) { return MO.getReg(); }); 6091 6092 if (Narrow) 6093 transform(Regs, Regs.begin(), [this](Register Reg) { 6094 return emitScalarToVector(64, &AArch64::FPR128RegClass, Reg, MIB) 6095 ->getOperand(0) 6096 .getReg(); 6097 }); 6098 6099 Register Tuple = createQTuple(Regs, MIB); 6100 6101 auto LaneNo = getIConstantVRegVal(I.getOperand(1 + NumVecs).getReg(), MRI); 6102 if (!LaneNo) 6103 return false; 6104 Register Ptr = I.getOperand(1 + NumVecs + 1).getReg(); 6105 auto Store = MIB.buildInstr(Opc, {}, {}) 6106 .addReg(Tuple) 6107 .addImm(LaneNo->getZExtValue()) 6108 .addReg(Ptr); 6109 Store.cloneMemRefs(I); 6110 constrainSelectedInstRegOperands(*Store, TII, TRI, RBI); 6111 return true; 6112 } 6113 6114 bool AArch64InstructionSelector::selectIntrinsicWithSideEffects( 6115 MachineInstr &I, MachineRegisterInfo &MRI) { 6116 // Find the intrinsic ID. 6117 unsigned IntrinID = cast<GIntrinsic>(I).getIntrinsicID(); 6118 6119 const LLT S8 = LLT::scalar(8); 6120 const LLT S16 = LLT::scalar(16); 6121 const LLT S32 = LLT::scalar(32); 6122 const LLT S64 = LLT::scalar(64); 6123 const LLT P0 = LLT::pointer(0, 64); 6124 // Select the instruction. 6125 switch (IntrinID) { 6126 default: 6127 return false; 6128 case Intrinsic::aarch64_ldxp: 6129 case Intrinsic::aarch64_ldaxp: { 6130 auto NewI = MIB.buildInstr( 6131 IntrinID == Intrinsic::aarch64_ldxp ? 
AArch64::LDXPX : AArch64::LDAXPX, 6132 {I.getOperand(0).getReg(), I.getOperand(1).getReg()}, 6133 {I.getOperand(3)}); 6134 NewI.cloneMemRefs(I); 6135 constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI); 6136 break; 6137 } 6138 case Intrinsic::aarch64_neon_ld1x2: { 6139 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 6140 unsigned Opc = 0; 6141 if (Ty == LLT::fixed_vector(8, S8)) 6142 Opc = AArch64::LD1Twov8b; 6143 else if (Ty == LLT::fixed_vector(16, S8)) 6144 Opc = AArch64::LD1Twov16b; 6145 else if (Ty == LLT::fixed_vector(4, S16)) 6146 Opc = AArch64::LD1Twov4h; 6147 else if (Ty == LLT::fixed_vector(8, S16)) 6148 Opc = AArch64::LD1Twov8h; 6149 else if (Ty == LLT::fixed_vector(2, S32)) 6150 Opc = AArch64::LD1Twov2s; 6151 else if (Ty == LLT::fixed_vector(4, S32)) 6152 Opc = AArch64::LD1Twov4s; 6153 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 6154 Opc = AArch64::LD1Twov2d; 6155 else if (Ty == S64 || Ty == P0) 6156 Opc = AArch64::LD1Twov1d; 6157 else 6158 llvm_unreachable("Unexpected type for ld1x2!"); 6159 selectVectorLoadIntrinsic(Opc, 2, I); 6160 break; 6161 } 6162 case Intrinsic::aarch64_neon_ld1x3: { 6163 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 6164 unsigned Opc = 0; 6165 if (Ty == LLT::fixed_vector(8, S8)) 6166 Opc = AArch64::LD1Threev8b; 6167 else if (Ty == LLT::fixed_vector(16, S8)) 6168 Opc = AArch64::LD1Threev16b; 6169 else if (Ty == LLT::fixed_vector(4, S16)) 6170 Opc = AArch64::LD1Threev4h; 6171 else if (Ty == LLT::fixed_vector(8, S16)) 6172 Opc = AArch64::LD1Threev8h; 6173 else if (Ty == LLT::fixed_vector(2, S32)) 6174 Opc = AArch64::LD1Threev2s; 6175 else if (Ty == LLT::fixed_vector(4, S32)) 6176 Opc = AArch64::LD1Threev4s; 6177 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 6178 Opc = AArch64::LD1Threev2d; 6179 else if (Ty == S64 || Ty == P0) 6180 Opc = AArch64::LD1Threev1d; 6181 else 6182 llvm_unreachable("Unexpected type for ld1x3!"); 6183 selectVectorLoadIntrinsic(Opc, 3, I); 6184 break; 6185 } 6186 case Intrinsic::aarch64_neon_ld1x4: { 6187 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 6188 unsigned Opc = 0; 6189 if (Ty == LLT::fixed_vector(8, S8)) 6190 Opc = AArch64::LD1Fourv8b; 6191 else if (Ty == LLT::fixed_vector(16, S8)) 6192 Opc = AArch64::LD1Fourv16b; 6193 else if (Ty == LLT::fixed_vector(4, S16)) 6194 Opc = AArch64::LD1Fourv4h; 6195 else if (Ty == LLT::fixed_vector(8, S16)) 6196 Opc = AArch64::LD1Fourv8h; 6197 else if (Ty == LLT::fixed_vector(2, S32)) 6198 Opc = AArch64::LD1Fourv2s; 6199 else if (Ty == LLT::fixed_vector(4, S32)) 6200 Opc = AArch64::LD1Fourv4s; 6201 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 6202 Opc = AArch64::LD1Fourv2d; 6203 else if (Ty == S64 || Ty == P0) 6204 Opc = AArch64::LD1Fourv1d; 6205 else 6206 llvm_unreachable("Unexpected type for ld1x4!"); 6207 selectVectorLoadIntrinsic(Opc, 4, I); 6208 break; 6209 } 6210 case Intrinsic::aarch64_neon_ld2: { 6211 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 6212 unsigned Opc = 0; 6213 if (Ty == LLT::fixed_vector(8, S8)) 6214 Opc = AArch64::LD2Twov8b; 6215 else if (Ty == LLT::fixed_vector(16, S8)) 6216 Opc = AArch64::LD2Twov16b; 6217 else if (Ty == LLT::fixed_vector(4, S16)) 6218 Opc = AArch64::LD2Twov4h; 6219 else if (Ty == LLT::fixed_vector(8, S16)) 6220 Opc = AArch64::LD2Twov8h; 6221 else if (Ty == LLT::fixed_vector(2, S32)) 6222 Opc = AArch64::LD2Twov2s; 6223 else if (Ty == LLT::fixed_vector(4, S32)) 6224 Opc = AArch64::LD2Twov4s; 6225 else if (Ty == LLT::fixed_vector(2, S64) || Ty == 
LLT::fixed_vector(2, P0)) 6226 Opc = AArch64::LD2Twov2d; 6227 else if (Ty == S64 || Ty == P0) 6228 Opc = AArch64::LD1Twov1d; 6229 else 6230 llvm_unreachable("Unexpected type for ld2!"); 6231 selectVectorLoadIntrinsic(Opc, 2, I); 6232 break; 6233 } 6234 case Intrinsic::aarch64_neon_ld2lane: { 6235 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 6236 unsigned Opc; 6237 if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8)) 6238 Opc = AArch64::LD2i8; 6239 else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16)) 6240 Opc = AArch64::LD2i16; 6241 else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32)) 6242 Opc = AArch64::LD2i32; 6243 else if (Ty == LLT::fixed_vector(2, S64) || 6244 Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0) 6245 Opc = AArch64::LD2i64; 6246 else 6247 llvm_unreachable("Unexpected type for st2lane!"); 6248 if (!selectVectorLoadLaneIntrinsic(Opc, 2, I)) 6249 return false; 6250 break; 6251 } 6252 case Intrinsic::aarch64_neon_ld2r: { 6253 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 6254 unsigned Opc = 0; 6255 if (Ty == LLT::fixed_vector(8, S8)) 6256 Opc = AArch64::LD2Rv8b; 6257 else if (Ty == LLT::fixed_vector(16, S8)) 6258 Opc = AArch64::LD2Rv16b; 6259 else if (Ty == LLT::fixed_vector(4, S16)) 6260 Opc = AArch64::LD2Rv4h; 6261 else if (Ty == LLT::fixed_vector(8, S16)) 6262 Opc = AArch64::LD2Rv8h; 6263 else if (Ty == LLT::fixed_vector(2, S32)) 6264 Opc = AArch64::LD2Rv2s; 6265 else if (Ty == LLT::fixed_vector(4, S32)) 6266 Opc = AArch64::LD2Rv4s; 6267 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 6268 Opc = AArch64::LD2Rv2d; 6269 else if (Ty == S64 || Ty == P0) 6270 Opc = AArch64::LD2Rv1d; 6271 else 6272 llvm_unreachable("Unexpected type for ld2r!"); 6273 selectVectorLoadIntrinsic(Opc, 2, I); 6274 break; 6275 } 6276 case Intrinsic::aarch64_neon_ld3: { 6277 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 6278 unsigned Opc = 0; 6279 if (Ty == LLT::fixed_vector(8, S8)) 6280 Opc = AArch64::LD3Threev8b; 6281 else if (Ty == LLT::fixed_vector(16, S8)) 6282 Opc = AArch64::LD3Threev16b; 6283 else if (Ty == LLT::fixed_vector(4, S16)) 6284 Opc = AArch64::LD3Threev4h; 6285 else if (Ty == LLT::fixed_vector(8, S16)) 6286 Opc = AArch64::LD3Threev8h; 6287 else if (Ty == LLT::fixed_vector(2, S32)) 6288 Opc = AArch64::LD3Threev2s; 6289 else if (Ty == LLT::fixed_vector(4, S32)) 6290 Opc = AArch64::LD3Threev4s; 6291 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 6292 Opc = AArch64::LD3Threev2d; 6293 else if (Ty == S64 || Ty == P0) 6294 Opc = AArch64::LD1Threev1d; 6295 else 6296 llvm_unreachable("Unexpected type for ld3!"); 6297 selectVectorLoadIntrinsic(Opc, 3, I); 6298 break; 6299 } 6300 case Intrinsic::aarch64_neon_ld3lane: { 6301 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 6302 unsigned Opc; 6303 if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8)) 6304 Opc = AArch64::LD3i8; 6305 else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16)) 6306 Opc = AArch64::LD3i16; 6307 else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32)) 6308 Opc = AArch64::LD3i32; 6309 else if (Ty == LLT::fixed_vector(2, S64) || 6310 Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0) 6311 Opc = AArch64::LD3i64; 6312 else 6313 llvm_unreachable("Unexpected type for st3lane!"); 6314 if (!selectVectorLoadLaneIntrinsic(Opc, 3, I)) 6315 return false; 6316 break; 6317 } 6318 case Intrinsic::aarch64_neon_ld3r: { 6319 LLT Ty = 
MRI.getType(I.getOperand(0).getReg()); 6320 unsigned Opc = 0; 6321 if (Ty == LLT::fixed_vector(8, S8)) 6322 Opc = AArch64::LD3Rv8b; 6323 else if (Ty == LLT::fixed_vector(16, S8)) 6324 Opc = AArch64::LD3Rv16b; 6325 else if (Ty == LLT::fixed_vector(4, S16)) 6326 Opc = AArch64::LD3Rv4h; 6327 else if (Ty == LLT::fixed_vector(8, S16)) 6328 Opc = AArch64::LD3Rv8h; 6329 else if (Ty == LLT::fixed_vector(2, S32)) 6330 Opc = AArch64::LD3Rv2s; 6331 else if (Ty == LLT::fixed_vector(4, S32)) 6332 Opc = AArch64::LD3Rv4s; 6333 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 6334 Opc = AArch64::LD3Rv2d; 6335 else if (Ty == S64 || Ty == P0) 6336 Opc = AArch64::LD3Rv1d; 6337 else 6338 llvm_unreachable("Unexpected type for ld3r!"); 6339 selectVectorLoadIntrinsic(Opc, 3, I); 6340 break; 6341 } 6342 case Intrinsic::aarch64_neon_ld4: { 6343 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 6344 unsigned Opc = 0; 6345 if (Ty == LLT::fixed_vector(8, S8)) 6346 Opc = AArch64::LD4Fourv8b; 6347 else if (Ty == LLT::fixed_vector(16, S8)) 6348 Opc = AArch64::LD4Fourv16b; 6349 else if (Ty == LLT::fixed_vector(4, S16)) 6350 Opc = AArch64::LD4Fourv4h; 6351 else if (Ty == LLT::fixed_vector(8, S16)) 6352 Opc = AArch64::LD4Fourv8h; 6353 else if (Ty == LLT::fixed_vector(2, S32)) 6354 Opc = AArch64::LD4Fourv2s; 6355 else if (Ty == LLT::fixed_vector(4, S32)) 6356 Opc = AArch64::LD4Fourv4s; 6357 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 6358 Opc = AArch64::LD4Fourv2d; 6359 else if (Ty == S64 || Ty == P0) 6360 Opc = AArch64::LD1Fourv1d; 6361 else 6362 llvm_unreachable("Unexpected type for ld4!"); 6363 selectVectorLoadIntrinsic(Opc, 4, I); 6364 break; 6365 } 6366 case Intrinsic::aarch64_neon_ld4lane: { 6367 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 6368 unsigned Opc; 6369 if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8)) 6370 Opc = AArch64::LD4i8; 6371 else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16)) 6372 Opc = AArch64::LD4i16; 6373 else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32)) 6374 Opc = AArch64::LD4i32; 6375 else if (Ty == LLT::fixed_vector(2, S64) || 6376 Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0) 6377 Opc = AArch64::LD4i64; 6378 else 6379 llvm_unreachable("Unexpected type for st4lane!"); 6380 if (!selectVectorLoadLaneIntrinsic(Opc, 4, I)) 6381 return false; 6382 break; 6383 } 6384 case Intrinsic::aarch64_neon_ld4r: { 6385 LLT Ty = MRI.getType(I.getOperand(0).getReg()); 6386 unsigned Opc = 0; 6387 if (Ty == LLT::fixed_vector(8, S8)) 6388 Opc = AArch64::LD4Rv8b; 6389 else if (Ty == LLT::fixed_vector(16, S8)) 6390 Opc = AArch64::LD4Rv16b; 6391 else if (Ty == LLT::fixed_vector(4, S16)) 6392 Opc = AArch64::LD4Rv4h; 6393 else if (Ty == LLT::fixed_vector(8, S16)) 6394 Opc = AArch64::LD4Rv8h; 6395 else if (Ty == LLT::fixed_vector(2, S32)) 6396 Opc = AArch64::LD4Rv2s; 6397 else if (Ty == LLT::fixed_vector(4, S32)) 6398 Opc = AArch64::LD4Rv4s; 6399 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 6400 Opc = AArch64::LD4Rv2d; 6401 else if (Ty == S64 || Ty == P0) 6402 Opc = AArch64::LD4Rv1d; 6403 else 6404 llvm_unreachable("Unexpected type for ld4r!"); 6405 selectVectorLoadIntrinsic(Opc, 4, I); 6406 break; 6407 } 6408 case Intrinsic::aarch64_neon_st1x2: { 6409 LLT Ty = MRI.getType(I.getOperand(1).getReg()); 6410 unsigned Opc; 6411 if (Ty == LLT::fixed_vector(8, S8)) 6412 Opc = AArch64::ST1Twov8b; 6413 else if (Ty == LLT::fixed_vector(16, S8)) 6414 Opc = 
AArch64::ST1Twov16b; 6415 else if (Ty == LLT::fixed_vector(4, S16)) 6416 Opc = AArch64::ST1Twov4h; 6417 else if (Ty == LLT::fixed_vector(8, S16)) 6418 Opc = AArch64::ST1Twov8h; 6419 else if (Ty == LLT::fixed_vector(2, S32)) 6420 Opc = AArch64::ST1Twov2s; 6421 else if (Ty == LLT::fixed_vector(4, S32)) 6422 Opc = AArch64::ST1Twov4s; 6423 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 6424 Opc = AArch64::ST1Twov2d; 6425 else if (Ty == S64 || Ty == P0) 6426 Opc = AArch64::ST1Twov1d; 6427 else 6428 llvm_unreachable("Unexpected type for st1x2!"); 6429 selectVectorStoreIntrinsic(I, 2, Opc); 6430 break; 6431 } 6432 case Intrinsic::aarch64_neon_st1x3: { 6433 LLT Ty = MRI.getType(I.getOperand(1).getReg()); 6434 unsigned Opc; 6435 if (Ty == LLT::fixed_vector(8, S8)) 6436 Opc = AArch64::ST1Threev8b; 6437 else if (Ty == LLT::fixed_vector(16, S8)) 6438 Opc = AArch64::ST1Threev16b; 6439 else if (Ty == LLT::fixed_vector(4, S16)) 6440 Opc = AArch64::ST1Threev4h; 6441 else if (Ty == LLT::fixed_vector(8, S16)) 6442 Opc = AArch64::ST1Threev8h; 6443 else if (Ty == LLT::fixed_vector(2, S32)) 6444 Opc = AArch64::ST1Threev2s; 6445 else if (Ty == LLT::fixed_vector(4, S32)) 6446 Opc = AArch64::ST1Threev4s; 6447 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 6448 Opc = AArch64::ST1Threev2d; 6449 else if (Ty == S64 || Ty == P0) 6450 Opc = AArch64::ST1Threev1d; 6451 else 6452 llvm_unreachable("Unexpected type for st1x3!"); 6453 selectVectorStoreIntrinsic(I, 3, Opc); 6454 break; 6455 } 6456 case Intrinsic::aarch64_neon_st1x4: { 6457 LLT Ty = MRI.getType(I.getOperand(1).getReg()); 6458 unsigned Opc; 6459 if (Ty == LLT::fixed_vector(8, S8)) 6460 Opc = AArch64::ST1Fourv8b; 6461 else if (Ty == LLT::fixed_vector(16, S8)) 6462 Opc = AArch64::ST1Fourv16b; 6463 else if (Ty == LLT::fixed_vector(4, S16)) 6464 Opc = AArch64::ST1Fourv4h; 6465 else if (Ty == LLT::fixed_vector(8, S16)) 6466 Opc = AArch64::ST1Fourv8h; 6467 else if (Ty == LLT::fixed_vector(2, S32)) 6468 Opc = AArch64::ST1Fourv2s; 6469 else if (Ty == LLT::fixed_vector(4, S32)) 6470 Opc = AArch64::ST1Fourv4s; 6471 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 6472 Opc = AArch64::ST1Fourv2d; 6473 else if (Ty == S64 || Ty == P0) 6474 Opc = AArch64::ST1Fourv1d; 6475 else 6476 llvm_unreachable("Unexpected type for st1x4!"); 6477 selectVectorStoreIntrinsic(I, 4, Opc); 6478 break; 6479 } 6480 case Intrinsic::aarch64_neon_st2: { 6481 LLT Ty = MRI.getType(I.getOperand(1).getReg()); 6482 unsigned Opc; 6483 if (Ty == LLT::fixed_vector(8, S8)) 6484 Opc = AArch64::ST2Twov8b; 6485 else if (Ty == LLT::fixed_vector(16, S8)) 6486 Opc = AArch64::ST2Twov16b; 6487 else if (Ty == LLT::fixed_vector(4, S16)) 6488 Opc = AArch64::ST2Twov4h; 6489 else if (Ty == LLT::fixed_vector(8, S16)) 6490 Opc = AArch64::ST2Twov8h; 6491 else if (Ty == LLT::fixed_vector(2, S32)) 6492 Opc = AArch64::ST2Twov2s; 6493 else if (Ty == LLT::fixed_vector(4, S32)) 6494 Opc = AArch64::ST2Twov4s; 6495 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 6496 Opc = AArch64::ST2Twov2d; 6497 else if (Ty == S64 || Ty == P0) 6498 Opc = AArch64::ST1Twov1d; 6499 else 6500 llvm_unreachable("Unexpected type for st2!"); 6501 selectVectorStoreIntrinsic(I, 2, Opc); 6502 break; 6503 } 6504 case Intrinsic::aarch64_neon_st3: { 6505 LLT Ty = MRI.getType(I.getOperand(1).getReg()); 6506 unsigned Opc; 6507 if (Ty == LLT::fixed_vector(8, S8)) 6508 Opc = AArch64::ST3Threev8b; 6509 else if (Ty == LLT::fixed_vector(16, S8)) 6510 Opc 
= AArch64::ST3Threev16b; 6511 else if (Ty == LLT::fixed_vector(4, S16)) 6512 Opc = AArch64::ST3Threev4h; 6513 else if (Ty == LLT::fixed_vector(8, S16)) 6514 Opc = AArch64::ST3Threev8h; 6515 else if (Ty == LLT::fixed_vector(2, S32)) 6516 Opc = AArch64::ST3Threev2s; 6517 else if (Ty == LLT::fixed_vector(4, S32)) 6518 Opc = AArch64::ST3Threev4s; 6519 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 6520 Opc = AArch64::ST3Threev2d; 6521 else if (Ty == S64 || Ty == P0) 6522 Opc = AArch64::ST1Threev1d; 6523 else 6524 llvm_unreachable("Unexpected type for st3!"); 6525 selectVectorStoreIntrinsic(I, 3, Opc); 6526 break; 6527 } 6528 case Intrinsic::aarch64_neon_st4: { 6529 LLT Ty = MRI.getType(I.getOperand(1).getReg()); 6530 unsigned Opc; 6531 if (Ty == LLT::fixed_vector(8, S8)) 6532 Opc = AArch64::ST4Fourv8b; 6533 else if (Ty == LLT::fixed_vector(16, S8)) 6534 Opc = AArch64::ST4Fourv16b; 6535 else if (Ty == LLT::fixed_vector(4, S16)) 6536 Opc = AArch64::ST4Fourv4h; 6537 else if (Ty == LLT::fixed_vector(8, S16)) 6538 Opc = AArch64::ST4Fourv8h; 6539 else if (Ty == LLT::fixed_vector(2, S32)) 6540 Opc = AArch64::ST4Fourv2s; 6541 else if (Ty == LLT::fixed_vector(4, S32)) 6542 Opc = AArch64::ST4Fourv4s; 6543 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0)) 6544 Opc = AArch64::ST4Fourv2d; 6545 else if (Ty == S64 || Ty == P0) 6546 Opc = AArch64::ST1Fourv1d; 6547 else 6548 llvm_unreachable("Unexpected type for st4!"); 6549 selectVectorStoreIntrinsic(I, 4, Opc); 6550 break; 6551 } 6552 case Intrinsic::aarch64_neon_st2lane: { 6553 LLT Ty = MRI.getType(I.getOperand(1).getReg()); 6554 unsigned Opc; 6555 if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8)) 6556 Opc = AArch64::ST2i8; 6557 else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16)) 6558 Opc = AArch64::ST2i16; 6559 else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32)) 6560 Opc = AArch64::ST2i32; 6561 else if (Ty == LLT::fixed_vector(2, S64) || 6562 Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0) 6563 Opc = AArch64::ST2i64; 6564 else 6565 llvm_unreachable("Unexpected type for st2lane!"); 6566 if (!selectVectorStoreLaneIntrinsic(I, 2, Opc)) 6567 return false; 6568 break; 6569 } 6570 case Intrinsic::aarch64_neon_st3lane: { 6571 LLT Ty = MRI.getType(I.getOperand(1).getReg()); 6572 unsigned Opc; 6573 if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8)) 6574 Opc = AArch64::ST3i8; 6575 else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16)) 6576 Opc = AArch64::ST3i16; 6577 else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32)) 6578 Opc = AArch64::ST3i32; 6579 else if (Ty == LLT::fixed_vector(2, S64) || 6580 Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0) 6581 Opc = AArch64::ST3i64; 6582 else 6583 llvm_unreachable("Unexpected type for st3lane!"); 6584 if (!selectVectorStoreLaneIntrinsic(I, 3, Opc)) 6585 return false; 6586 break; 6587 } 6588 case Intrinsic::aarch64_neon_st4lane: { 6589 LLT Ty = MRI.getType(I.getOperand(1).getReg()); 6590 unsigned Opc; 6591 if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8)) 6592 Opc = AArch64::ST4i8; 6593 else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16)) 6594 Opc = AArch64::ST4i16; 6595 else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32)) 6596 Opc = AArch64::ST4i32; 6597 else if (Ty == LLT::fixed_vector(2, S64) || 6598 Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty 
== P0) 6599 Opc = AArch64::ST4i64; 6600 else 6601 llvm_unreachable("Unexpected type for st4lane!"); 6602 if (!selectVectorStoreLaneIntrinsic(I, 4, Opc)) 6603 return false; 6604 break; 6605 } 6606 case Intrinsic::aarch64_mops_memset_tag: { 6607 // Transform 6608 // %dst:gpr(p0) = \ 6609 // G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.mops.memset.tag), 6610 // \ %dst:gpr(p0), %val:gpr(s64), %n:gpr(s64) 6611 // where %dst is updated, into 6612 // (%Rd:GPR64common, %Rn:GPR64) = \ 6613 // MOPSMemorySetTaggingPseudo \ 6614 // %Rd:GPR64common, %Rn:GPR64, %Rm:GPR64 6615 // where Rd and Rn are tied. 6616 // It is expected that %val has been extended to s64 in legalization. 6617 // Note that the order of the size/value operands is swapped. 6618 6619 Register DstDef = I.getOperand(0).getReg(); 6620 // I.getOperand(1) is the intrinsic function 6621 Register DstUse = I.getOperand(2).getReg(); 6622 Register ValUse = I.getOperand(3).getReg(); 6623 Register SizeUse = I.getOperand(4).getReg(); 6624 6625 // MOPSMemorySetTaggingPseudo has two defs; the intrinsic call has only one. 6626 // Therefore an additional virtual register is required for the updated size 6627 // operand. This value is not accessible via the semantics of the intrinsic. 6628 Register SizeDef = MRI.createGenericVirtualRegister(LLT::scalar(64)); 6629 6630 auto Memset = MIB.buildInstr(AArch64::MOPSMemorySetTaggingPseudo, 6631 {DstDef, SizeDef}, {DstUse, SizeUse, ValUse}); 6632 Memset.cloneMemRefs(I); 6633 constrainSelectedInstRegOperands(*Memset, TII, TRI, RBI); 6634 break; 6635 } 6636 } 6637 6638 I.eraseFromParent(); 6639 return true; 6640 } 6641 6642 bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I, 6643 MachineRegisterInfo &MRI) { 6644 unsigned IntrinID = cast<GIntrinsic>(I).getIntrinsicID(); 6645 6646 switch (IntrinID) { 6647 default: 6648 break; 6649 case Intrinsic::aarch64_crypto_sha1h: { 6650 Register DstReg = I.getOperand(0).getReg(); 6651 Register SrcReg = I.getOperand(2).getReg(); 6652 6653 // FIXME: Should this be an assert? 6654 if (MRI.getType(DstReg).getSizeInBits() != 32 || 6655 MRI.getType(SrcReg).getSizeInBits() != 32) 6656 return false; 6657 6658 // The operation has to happen on FPRs. Set up some new FPR registers for 6659 // the source and destination if they are on GPRs. 6660 if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) { 6661 SrcReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass); 6662 MIB.buildCopy({SrcReg}, {I.getOperand(2)}); 6663 6664 // Make sure the copy ends up getting constrained properly. 6665 RBI.constrainGenericRegister(I.getOperand(2).getReg(), 6666 AArch64::GPR32RegClass, MRI); 6667 } 6668 6669 if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) 6670 DstReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass); 6671 6672 // Actually insert the instruction. 6673 auto SHA1Inst = MIB.buildInstr(AArch64::SHA1Hrr, {DstReg}, {SrcReg}); 6674 constrainSelectedInstRegOperands(*SHA1Inst, TII, TRI, RBI); 6675 6676 // Did we create a new register for the destination? 6677 if (DstReg != I.getOperand(0).getReg()) { 6678 // Yep. Copy the result of the instruction back into the original 6679 // destination.
6680 MIB.buildCopy({I.getOperand(0)}, {DstReg}); 6681 RBI.constrainGenericRegister(I.getOperand(0).getReg(), 6682 AArch64::GPR32RegClass, MRI); 6683 } 6684 6685 I.eraseFromParent(); 6686 return true; 6687 } 6688 case Intrinsic::ptrauth_resign: { 6689 Register DstReg = I.getOperand(0).getReg(); 6690 Register ValReg = I.getOperand(2).getReg(); 6691 uint64_t AUTKey = I.getOperand(3).getImm(); 6692 Register AUTDisc = I.getOperand(4).getReg(); 6693 uint64_t PACKey = I.getOperand(5).getImm(); 6694 Register PACDisc = I.getOperand(6).getReg(); 6695 6696 Register AUTAddrDisc = AUTDisc; 6697 uint16_t AUTConstDiscC = 0; 6698 std::tie(AUTConstDiscC, AUTAddrDisc) = 6699 extractPtrauthBlendDiscriminators(AUTDisc, MRI); 6700 6701 Register PACAddrDisc = PACDisc; 6702 uint16_t PACConstDiscC = 0; 6703 std::tie(PACConstDiscC, PACAddrDisc) = 6704 extractPtrauthBlendDiscriminators(PACDisc, MRI); 6705 6706 MIB.buildCopy({AArch64::X16}, {ValReg}); 6707 MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X17}, {}); 6708 MIB.buildInstr(AArch64::AUTPAC) 6709 .addImm(AUTKey) 6710 .addImm(AUTConstDiscC) 6711 .addUse(AUTAddrDisc) 6712 .addImm(PACKey) 6713 .addImm(PACConstDiscC) 6714 .addUse(PACAddrDisc) 6715 .constrainAllUses(TII, TRI, RBI); 6716 MIB.buildCopy({DstReg}, Register(AArch64::X16)); 6717 6718 RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI); 6719 I.eraseFromParent(); 6720 return true; 6721 } 6722 case Intrinsic::ptrauth_auth: { 6723 Register DstReg = I.getOperand(0).getReg(); 6724 Register ValReg = I.getOperand(2).getReg(); 6725 uint64_t AUTKey = I.getOperand(3).getImm(); 6726 Register AUTDisc = I.getOperand(4).getReg(); 6727 6728 Register AUTAddrDisc = AUTDisc; 6729 uint16_t AUTConstDiscC = 0; 6730 std::tie(AUTConstDiscC, AUTAddrDisc) = 6731 extractPtrauthBlendDiscriminators(AUTDisc, MRI); 6732 6733 MIB.buildCopy({AArch64::X16}, {ValReg}); 6734 MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X17}, {}); 6735 MIB.buildInstr(AArch64::AUT) 6736 .addImm(AUTKey) 6737 .addImm(AUTConstDiscC) 6738 .addUse(AUTAddrDisc) 6739 .constrainAllUses(TII, TRI, RBI); 6740 MIB.buildCopy({DstReg}, Register(AArch64::X16)); 6741 6742 RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI); 6743 I.eraseFromParent(); 6744 return true; 6745 } 6746 case Intrinsic::frameaddress: 6747 case Intrinsic::returnaddress: { 6748 MachineFunction &MF = *I.getParent()->getParent(); 6749 MachineFrameInfo &MFI = MF.getFrameInfo(); 6750 6751 unsigned Depth = I.getOperand(2).getImm(); 6752 Register DstReg = I.getOperand(0).getReg(); 6753 RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI); 6754 6755 if (Depth == 0 && IntrinID == Intrinsic::returnaddress) { 6756 if (!MFReturnAddr) { 6757 // Insert the copy from LR/X30 into the entry block, before it can be 6758 // clobbered by anything. 
6759 MFI.setReturnAddressIsTaken(true); 6760 MFReturnAddr = getFunctionLiveInPhysReg( 6761 MF, TII, AArch64::LR, AArch64::GPR64RegClass, I.getDebugLoc()); 6762 } 6763 6764 if (STI.hasPAuth()) { 6765 MIB.buildInstr(AArch64::XPACI, {DstReg}, {MFReturnAddr}); 6766 } else { 6767 MIB.buildCopy({Register(AArch64::LR)}, {MFReturnAddr}); 6768 MIB.buildInstr(AArch64::XPACLRI); 6769 MIB.buildCopy({DstReg}, {Register(AArch64::LR)}); 6770 } 6771 6772 I.eraseFromParent(); 6773 return true; 6774 } 6775 6776 MFI.setFrameAddressIsTaken(true); 6777 Register FrameAddr(AArch64::FP); 6778 while (Depth--) { 6779 Register NextFrame = MRI.createVirtualRegister(&AArch64::GPR64spRegClass); 6780 auto Ldr = 6781 MIB.buildInstr(AArch64::LDRXui, {NextFrame}, {FrameAddr}).addImm(0); 6782 constrainSelectedInstRegOperands(*Ldr, TII, TRI, RBI); 6783 FrameAddr = NextFrame; 6784 } 6785 6786 if (IntrinID == Intrinsic::frameaddress) 6787 MIB.buildCopy({DstReg}, {FrameAddr}); 6788 else { 6789 MFI.setReturnAddressIsTaken(true); 6790 6791 if (STI.hasPAuth()) { 6792 Register TmpReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); 6793 MIB.buildInstr(AArch64::LDRXui, {TmpReg}, {FrameAddr}).addImm(1); 6794 MIB.buildInstr(AArch64::XPACI, {DstReg}, {TmpReg}); 6795 } else { 6796 MIB.buildInstr(AArch64::LDRXui, {Register(AArch64::LR)}, {FrameAddr}) 6797 .addImm(1); 6798 MIB.buildInstr(AArch64::XPACLRI); 6799 MIB.buildCopy({DstReg}, {Register(AArch64::LR)}); 6800 } 6801 } 6802 6803 I.eraseFromParent(); 6804 return true; 6805 } 6806 case Intrinsic::aarch64_neon_tbl2: 6807 SelectTable(I, MRI, 2, AArch64::TBLv8i8Two, AArch64::TBLv16i8Two, false); 6808 return true; 6809 case Intrinsic::aarch64_neon_tbl3: 6810 SelectTable(I, MRI, 3, AArch64::TBLv8i8Three, AArch64::TBLv16i8Three, 6811 false); 6812 return true; 6813 case Intrinsic::aarch64_neon_tbl4: 6814 SelectTable(I, MRI, 4, AArch64::TBLv8i8Four, AArch64::TBLv16i8Four, false); 6815 return true; 6816 case Intrinsic::aarch64_neon_tbx2: 6817 SelectTable(I, MRI, 2, AArch64::TBXv8i8Two, AArch64::TBXv16i8Two, true); 6818 return true; 6819 case Intrinsic::aarch64_neon_tbx3: 6820 SelectTable(I, MRI, 3, AArch64::TBXv8i8Three, AArch64::TBXv16i8Three, true); 6821 return true; 6822 case Intrinsic::aarch64_neon_tbx4: 6823 SelectTable(I, MRI, 4, AArch64::TBXv8i8Four, AArch64::TBXv16i8Four, true); 6824 return true; 6825 case Intrinsic::swift_async_context_addr: 6826 auto Sub = MIB.buildInstr(AArch64::SUBXri, {I.getOperand(0).getReg()}, 6827 {Register(AArch64::FP)}) 6828 .addImm(8) 6829 .addImm(0); 6830 constrainSelectedInstRegOperands(*Sub, TII, TRI, RBI); 6831 6832 MF->getFrameInfo().setFrameAddressIsTaken(true); 6833 MF->getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true); 6834 I.eraseFromParent(); 6835 return true; 6836 } 6837 return false; 6838 } 6839 6840 // G_PTRAUTH_GLOBAL_VALUE lowering 6841 // 6842 // We have 3 lowering alternatives to choose from: 6843 // - MOVaddrPAC: similar to MOVaddr, with added PAC. 6844 // If the GV doesn't need a GOT load (i.e., is locally defined) 6845 // materialize the pointer using adrp+add+pac. See LowerMOVaddrPAC. 6846 // 6847 // - LOADgotPAC: similar to LOADgot, with added PAC. 6848 // If the GV needs a GOT load, materialize the pointer using the usual 6849 // GOT adrp+ldr, +pac. Pointers in GOT are assumed to be not signed, the GOT 6850 // section is assumed to be read-only (for example, via relro mechanism). See 6851 // LowerMOVaddrPAC. 
6852 // 6853 // - LOADauthptrstatic: similar to LOADgot, but uses a 6854 // special stub slot instead of a GOT slot. 6855 // Load a signed pointer for symbol 'sym' from a stub slot named 6856 // 'sym$auth_ptr$key$disc' filled by the dynamic linker during relocation 6857 // resolution. This usually lowers to adrp+ldr, but also emits an entry into 6858 // .data with an 6859 // @AUTH relocation. See LowerLOADauthptrstatic. 6860 // 6861 // All 3 are pseudos that are expanded late to longer sequences: this lets us 6862 // provide integrity guarantees on the to-be-signed intermediate values. 6863 // 6864 // LOADauthptrstatic is undesirable because it requires a large section filled 6865 // with often similarly-signed pointers, making it a good harvesting target. 6866 // Thus, it's only used for ptrauth references to extern_weak to avoid null 6867 // checks. 6868 6869 bool AArch64InstructionSelector::selectPtrAuthGlobalValue( 6870 MachineInstr &I, MachineRegisterInfo &MRI) const { 6871 Register DefReg = I.getOperand(0).getReg(); 6872 Register Addr = I.getOperand(1).getReg(); 6873 uint64_t Key = I.getOperand(2).getImm(); 6874 Register AddrDisc = I.getOperand(3).getReg(); 6875 uint64_t Disc = I.getOperand(4).getImm(); 6876 int64_t Offset = 0; 6877 6878 if (Key > AArch64PACKey::LAST) 6879 report_fatal_error("key in ptrauth global out of range [0, " + 6880 Twine((int)AArch64PACKey::LAST) + "]"); 6881 6882 // Blend only works if the integer discriminator is 16-bit wide. 6883 if (!isUInt<16>(Disc)) 6884 report_fatal_error( 6885 "constant discriminator in ptrauth global out of range [0, 0xffff]"); 6886 6887 // Choosing between 3 lowering alternatives is target-specific. 6888 if (!STI.isTargetELF() && !STI.isTargetMachO()) 6889 report_fatal_error("ptrauth global lowering only supported on MachO/ELF"); 6890 6891 if (!MRI.hasOneDef(Addr)) 6892 return false; 6893 6894 // First match any offset we take from the real global. 6895 const MachineInstr *DefMI = &*MRI.def_instr_begin(Addr); 6896 if (DefMI->getOpcode() == TargetOpcode::G_PTR_ADD) { 6897 Register OffsetReg = DefMI->getOperand(2).getReg(); 6898 if (!MRI.hasOneDef(OffsetReg)) 6899 return false; 6900 const MachineInstr &OffsetMI = *MRI.def_instr_begin(OffsetReg); 6901 if (OffsetMI.getOpcode() != TargetOpcode::G_CONSTANT) 6902 return false; 6903 6904 Addr = DefMI->getOperand(1).getReg(); 6905 if (!MRI.hasOneDef(Addr)) 6906 return false; 6907 6908 DefMI = &*MRI.def_instr_begin(Addr); 6909 Offset = OffsetMI.getOperand(1).getCImm()->getSExtValue(); 6910 } 6911 6912 // We should be left with a genuine unauthenticated GlobalValue. 6913 const GlobalValue *GV; 6914 if (DefMI->getOpcode() == TargetOpcode::G_GLOBAL_VALUE) { 6915 GV = DefMI->getOperand(1).getGlobal(); 6916 Offset += DefMI->getOperand(1).getOffset(); 6917 } else if (DefMI->getOpcode() == AArch64::G_ADD_LOW) { 6918 GV = DefMI->getOperand(2).getGlobal(); 6919 Offset += DefMI->getOperand(2).getOffset(); 6920 } else { 6921 return false; 6922 } 6923 6924 MachineIRBuilder MIB(I); 6925 6926 // Classify the reference to determine whether it needs a GOT load.
6927 unsigned OpFlags = STI.ClassifyGlobalReference(GV, TM); 6928 const bool NeedsGOTLoad = ((OpFlags & AArch64II::MO_GOT) != 0); 6929 assert(((OpFlags & (~AArch64II::MO_GOT)) == 0) && 6930 "unsupported non-GOT op flags on ptrauth global reference"); 6931 assert((!GV->hasExternalWeakLinkage() || NeedsGOTLoad) && 6932 "unsupported non-GOT reference to weak ptrauth global"); 6933 6934 std::optional<APInt> AddrDiscVal = getIConstantVRegVal(AddrDisc, MRI); 6935 bool HasAddrDisc = !AddrDiscVal || *AddrDiscVal != 0; 6936 6937 // Non-extern_weak: 6938 // - No GOT load needed -> MOVaddrPAC 6939 // - GOT load for non-extern_weak -> LOADgotPAC 6940 // Note that we disallow extern_weak refs to avoid null checks later. 6941 if (!GV->hasExternalWeakLinkage()) { 6942 MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X16}, {}); 6943 MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X17}, {}); 6944 MIB.buildInstr(NeedsGOTLoad ? AArch64::LOADgotPAC : AArch64::MOVaddrPAC) 6945 .addGlobalAddress(GV, Offset) 6946 .addImm(Key) 6947 .addReg(HasAddrDisc ? AddrDisc : AArch64::XZR) 6948 .addImm(Disc) 6949 .constrainAllUses(TII, TRI, RBI); 6950 MIB.buildCopy(DefReg, Register(AArch64::X16)); 6951 RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI); 6952 I.eraseFromParent(); 6953 return true; 6954 } 6955 6956 // extern_weak -> LOADauthptrstatic 6957 6958 // Offsets and extern_weak don't mix well: ptrauth aside, you'd get the 6959 // offset alone as a pointer if the symbol wasn't available, which would 6960 // probably break null checks in users. Ptrauth complicates things further: 6961 // error out. 6962 if (Offset != 0) 6963 report_fatal_error( 6964 "unsupported non-zero offset in weak ptrauth global reference"); 6965 6966 if (HasAddrDisc) 6967 report_fatal_error("unsupported weak addr-div ptrauth global"); 6968 6969 MIB.buildInstr(AArch64::LOADauthptrstatic, {DefReg}, {}) 6970 .addGlobalAddress(GV, Offset) 6971 .addImm(Key) 6972 .addImm(Disc); 6973 RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI); 6974 6975 I.eraseFromParent(); 6976 return true; 6977 } 6978 6979 void AArch64InstructionSelector::SelectTable(MachineInstr &I, 6980 MachineRegisterInfo &MRI, 6981 unsigned NumVec, unsigned Opc1, 6982 unsigned Opc2, bool isExt) { 6983 Register DstReg = I.getOperand(0).getReg(); 6984 unsigned Opc = MRI.getType(DstReg) == LLT::fixed_vector(8, 8) ? 
Opc1 : Opc2; 6985 6986 // Create the REG_SEQUENCE 6987 SmallVector<Register, 4> Regs; 6988 for (unsigned i = 0; i < NumVec; i++) 6989 Regs.push_back(I.getOperand(i + 2 + isExt).getReg()); 6990 Register RegSeq = createQTuple(Regs, MIB); 6991 6992 Register IdxReg = I.getOperand(2 + NumVec + isExt).getReg(); 6993 MachineInstrBuilder Instr; 6994 if (isExt) { 6995 Register Reg = I.getOperand(2).getReg(); 6996 Instr = MIB.buildInstr(Opc, {DstReg}, {Reg, RegSeq, IdxReg}); 6997 } else 6998 Instr = MIB.buildInstr(Opc, {DstReg}, {RegSeq, IdxReg}); 6999 constrainSelectedInstRegOperands(*Instr, TII, TRI, RBI); 7000 I.eraseFromParent(); 7001 } 7002 7003 InstructionSelector::ComplexRendererFns 7004 AArch64InstructionSelector::selectShiftA_32(const MachineOperand &Root) const { 7005 auto MaybeImmed = getImmedFromMO(Root); 7006 if (MaybeImmed == std::nullopt || *MaybeImmed > 31) 7007 return std::nullopt; 7008 uint64_t Enc = (32 - *MaybeImmed) & 0x1f; 7009 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; 7010 } 7011 7012 InstructionSelector::ComplexRendererFns 7013 AArch64InstructionSelector::selectShiftB_32(const MachineOperand &Root) const { 7014 auto MaybeImmed = getImmedFromMO(Root); 7015 if (MaybeImmed == std::nullopt || *MaybeImmed > 31) 7016 return std::nullopt; 7017 uint64_t Enc = 31 - *MaybeImmed; 7018 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; 7019 } 7020 7021 InstructionSelector::ComplexRendererFns 7022 AArch64InstructionSelector::selectShiftA_64(const MachineOperand &Root) const { 7023 auto MaybeImmed = getImmedFromMO(Root); 7024 if (MaybeImmed == std::nullopt || *MaybeImmed > 63) 7025 return std::nullopt; 7026 uint64_t Enc = (64 - *MaybeImmed) & 0x3f; 7027 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; 7028 } 7029 7030 InstructionSelector::ComplexRendererFns 7031 AArch64InstructionSelector::selectShiftB_64(const MachineOperand &Root) const { 7032 auto MaybeImmed = getImmedFromMO(Root); 7033 if (MaybeImmed == std::nullopt || *MaybeImmed > 63) 7034 return std::nullopt; 7035 uint64_t Enc = 63 - *MaybeImmed; 7036 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; 7037 } 7038 7039 /// Helper to select an immediate value that can be represented as a 12-bit 7040 /// value shifted left by either 0 or 12. If it is possible to do so, return 7041 /// the immediate and shift value. If not, return std::nullopt. 7042 /// 7043 /// Used by selectArithImmed and selectNegArithImmed. 7044 InstructionSelector::ComplexRendererFns 7045 AArch64InstructionSelector::select12BitValueWithLeftShift( 7046 uint64_t Immed) const { 7047 unsigned ShiftAmt; 7048 if (Immed >> 12 == 0) { 7049 ShiftAmt = 0; 7050 } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) { 7051 ShiftAmt = 12; 7052 Immed = Immed >> 12; 7053 } else 7054 return std::nullopt; 7055 7056 unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt); 7057 return {{ 7058 [=](MachineInstrBuilder &MIB) { MIB.addImm(Immed); }, 7059 [=](MachineInstrBuilder &MIB) { MIB.addImm(ShVal); }, 7060 }}; 7061 } 7062 7063 /// SelectArithImmed - Select an immediate value that can be represented as 7064 /// a 12-bit value shifted left by either 0 or 12. If so, return true with 7065 /// Val set to the 12-bit value and Shift set to the shifter operand. 
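/// As an illustrative sketch (the example values here are chosen for
/// illustration, not taken from the source): 0xabc is rendered as imm=0xabc
/// with LSL #0, and 0xabc000 as imm=0xabc with LSL #12, since its low 12 bits
/// are clear and it fits in 24 bits. Values such as 0x1abc or 0x1000000
/// cannot be expressed as a 12-bit value shifted by 0 or 12 and are rejected.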
7066 InstructionSelector::ComplexRendererFns 7067 AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const { 7068 // This function is called from the addsub_shifted_imm ComplexPattern, 7069 // which lists [imm] as the list of opcode it's interested in, however 7070 // we still need to check whether the operand is actually an immediate 7071 // here because the ComplexPattern opcode list is only used in 7072 // root-level opcode matching. 7073 auto MaybeImmed = getImmedFromMO(Root); 7074 if (MaybeImmed == std::nullopt) 7075 return std::nullopt; 7076 return select12BitValueWithLeftShift(*MaybeImmed); 7077 } 7078 7079 /// SelectNegArithImmed - As above, but negates the value before trying to 7080 /// select it. 7081 InstructionSelector::ComplexRendererFns 7082 AArch64InstructionSelector::selectNegArithImmed(MachineOperand &Root) const { 7083 // We need a register here, because we need to know if we have a 64 or 32 7084 // bit immediate. 7085 if (!Root.isReg()) 7086 return std::nullopt; 7087 auto MaybeImmed = getImmedFromMO(Root); 7088 if (MaybeImmed == std::nullopt) 7089 return std::nullopt; 7090 uint64_t Immed = *MaybeImmed; 7091 7092 // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0" 7093 // have the opposite effect on the C flag, so this pattern mustn't match under 7094 // those circumstances. 7095 if (Immed == 0) 7096 return std::nullopt; 7097 7098 // Check if we're dealing with a 32-bit type on the root or a 64-bit type on 7099 // the root. 7100 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); 7101 if (MRI.getType(Root.getReg()).getSizeInBits() == 32) 7102 Immed = ~((uint32_t)Immed) + 1; 7103 else 7104 Immed = ~Immed + 1ULL; 7105 7106 if (Immed & 0xFFFFFFFFFF000000ULL) 7107 return std::nullopt; 7108 7109 Immed &= 0xFFFFFFULL; 7110 return select12BitValueWithLeftShift(Immed); 7111 } 7112 7113 /// Checks if we are sure that folding MI into load/store addressing mode is 7114 /// beneficial or not. 7115 /// 7116 /// Returns: 7117 /// - true if folding MI would be beneficial. 7118 /// - false if folding MI would be bad. 7119 /// - std::nullopt if it is not sure whether folding MI is beneficial. 7120 /// 7121 /// \p MI can be the offset operand of G_PTR_ADD, e.g. G_SHL in the example: 7122 /// 7123 /// %13:gpr(s64) = G_CONSTANT i64 1 7124 /// %8:gpr(s64) = G_SHL %6, %13(s64) 7125 /// %9:gpr(p0) = G_PTR_ADD %0, %8(s64) 7126 /// %12:gpr(s32) = G_LOAD %9(p0) :: (load (s16)) 7127 std::optional<bool> AArch64InstructionSelector::isWorthFoldingIntoAddrMode( 7128 MachineInstr &MI, const MachineRegisterInfo &MRI) const { 7129 if (MI.getOpcode() == AArch64::G_SHL) { 7130 // Address operands with shifts are free, except for running on subtargets 7131 // with AddrLSLSlow14. 7132 if (const auto ValAndVeg = getIConstantVRegValWithLookThrough( 7133 MI.getOperand(2).getReg(), MRI)) { 7134 const APInt ShiftVal = ValAndVeg->Value; 7135 7136 // Don't fold if we know this will be slow. 7137 return !(STI.hasAddrLSLSlow14() && (ShiftVal == 1 || ShiftVal == 4)); 7138 } 7139 } 7140 return std::nullopt; 7141 } 7142 7143 /// Return true if it is worth folding MI into an extended register. That is, 7144 /// if it's safe to pull it into the addressing mode of a load or store as a 7145 /// shift. 7146 /// \p IsAddrOperand whether the def of MI is used as an address operand 7147 /// (e.g. feeding into an LDR/STR). 
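/// Illustrative example (not from the original source): a G_SHL by a constant
/// whose only users are loads/stores can be folded into each access, e.g.
/// ldr x0, [xBase, xIdx, lsl #3], rather than materializing the shifted value
/// once in a register and reusing it.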
7148 bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg( 7149 MachineInstr &MI, const MachineRegisterInfo &MRI, 7150 bool IsAddrOperand) const { 7151 7152 // Always fold if there is one use, or if we're optimizing for size. 7153 Register DefReg = MI.getOperand(0).getReg(); 7154 if (MRI.hasOneNonDBGUse(DefReg) || 7155 MI.getParent()->getParent()->getFunction().hasOptSize()) 7156 return true; 7157 7158 if (IsAddrOperand) { 7159 // If we are already sure that folding MI is good or bad, return the result. 7160 if (const auto Worth = isWorthFoldingIntoAddrMode(MI, MRI)) 7161 return *Worth; 7162 7163 // Fold G_PTR_ADD if its offset operand can be folded 7164 if (MI.getOpcode() == AArch64::G_PTR_ADD) { 7165 MachineInstr *OffsetInst = 7166 getDefIgnoringCopies(MI.getOperand(2).getReg(), MRI); 7167 7168 // Note, we already know G_PTR_ADD is used by at least two instructions. 7169 // If we are also sure about whether folding is beneficial or not, 7170 // return the result. 7171 if (const auto Worth = isWorthFoldingIntoAddrMode(*OffsetInst, MRI)) 7172 return *Worth; 7173 } 7174 } 7175 7176 // FIXME: Consider checking HasALULSLFast as appropriate. 7177 7178 // We have a fastpath, so folding a shift in and potentially computing it 7179 // many times may be beneficial. Check if this is only used in memory ops. 7180 // If it is, then we should fold. 7181 return all_of(MRI.use_nodbg_instructions(DefReg), 7182 [](MachineInstr &Use) { return Use.mayLoadOrStore(); }); 7183 } 7184 7185 static bool isSignExtendShiftType(AArch64_AM::ShiftExtendType Type) { 7186 switch (Type) { 7187 case AArch64_AM::SXTB: 7188 case AArch64_AM::SXTH: 7189 case AArch64_AM::SXTW: 7190 return true; 7191 default: 7192 return false; 7193 } 7194 } 7195 7196 InstructionSelector::ComplexRendererFns 7197 AArch64InstructionSelector::selectExtendedSHL( 7198 MachineOperand &Root, MachineOperand &Base, MachineOperand &Offset, 7199 unsigned SizeInBytes, bool WantsExt) const { 7200 assert(Base.isReg() && "Expected base to be a register operand"); 7201 assert(Offset.isReg() && "Expected offset to be a register operand"); 7202 7203 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); 7204 MachineInstr *OffsetInst = MRI.getVRegDef(Offset.getReg()); 7205 7206 unsigned OffsetOpc = OffsetInst->getOpcode(); 7207 bool LookedThroughZExt = false; 7208 if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) { 7209 // Try to look through a ZEXT. 7210 if (OffsetOpc != TargetOpcode::G_ZEXT || !WantsExt) 7211 return std::nullopt; 7212 7213 OffsetInst = MRI.getVRegDef(OffsetInst->getOperand(1).getReg()); 7214 OffsetOpc = OffsetInst->getOpcode(); 7215 LookedThroughZExt = true; 7216 7217 if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) 7218 return std::nullopt; 7219 } 7220 // Make sure that the memory op is a valid size. 7221 int64_t LegalShiftVal = Log2_32(SizeInBytes); 7222 if (LegalShiftVal == 0) 7223 return std::nullopt; 7224 if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI, true)) 7225 return std::nullopt; 7226 7227 // Now, try to find the specific G_CONSTANT. Start by assuming that the 7228 // register we will offset is the LHS, and the register containing the 7229 // constant is the RHS. 7230 Register OffsetReg = OffsetInst->getOperand(1).getReg(); 7231 Register ConstantReg = OffsetInst->getOperand(2).getReg(); 7232 auto ValAndVReg = getIConstantVRegValWithLookThrough(ConstantReg, MRI); 7233 if (!ValAndVReg) { 7234 // We didn't get a constant on the RHS. 
If the opcode is a shift, then 7235 // we're done. 7236 if (OffsetOpc == TargetOpcode::G_SHL) 7237 return std::nullopt; 7238 7239 // If we have a G_MUL, we can use either register. Try looking at the RHS. 7240 std::swap(OffsetReg, ConstantReg); 7241 ValAndVReg = getIConstantVRegValWithLookThrough(ConstantReg, MRI); 7242 if (!ValAndVReg) 7243 return std::nullopt; 7244 } 7245 7246 // The value must fit into 3 bits, and must be positive. Make sure that is 7247 // true. 7248 int64_t ImmVal = ValAndVReg->Value.getSExtValue(); 7249 7250 // Since we're going to pull this into a shift, the constant value must be 7251 // a power of 2. If we got a multiply, then we need to check this. 7252 if (OffsetOpc == TargetOpcode::G_MUL) { 7253 if (!llvm::has_single_bit<uint32_t>(ImmVal)) 7254 return std::nullopt; 7255 7256 // Got a power of 2. So, the amount we'll shift is the log base-2 of that. 7257 ImmVal = Log2_32(ImmVal); 7258 } 7259 7260 if ((ImmVal & 0x7) != ImmVal) 7261 return std::nullopt; 7262 7263 // We are only allowed to shift by LegalShiftVal. This shift value is built 7264 // into the instruction, so we can't just use whatever we want. 7265 if (ImmVal != LegalShiftVal) 7266 return std::nullopt; 7267 7268 unsigned SignExtend = 0; 7269 if (WantsExt) { 7270 // Check if the offset is defined by an extend, unless we looked through a 7271 // G_ZEXT earlier. 7272 if (!LookedThroughZExt) { 7273 MachineInstr *ExtInst = getDefIgnoringCopies(OffsetReg, MRI); 7274 auto Ext = getExtendTypeForInst(*ExtInst, MRI, true); 7275 if (Ext == AArch64_AM::InvalidShiftExtend) 7276 return std::nullopt; 7277 7278 SignExtend = isSignExtendShiftType(Ext) ? 1 : 0; 7279 // We only support SXTW for signed extension here. 7280 if (SignExtend && Ext != AArch64_AM::SXTW) 7281 return std::nullopt; 7282 OffsetReg = ExtInst->getOperand(1).getReg(); 7283 } 7284 7285 // Need a 32-bit wide register here. 7286 MachineIRBuilder MIB(*MRI.getVRegDef(Root.getReg())); 7287 OffsetReg = moveScalarRegClass(OffsetReg, AArch64::GPR32RegClass, MIB); 7288 } 7289 7290 // We can use the LHS of the GEP as the base, and the LHS of the shift as an 7291 // offset. Signify that we are shifting by setting the shift flag to 1. 7292 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(Base.getReg()); }, 7293 [=](MachineInstrBuilder &MIB) { MIB.addUse(OffsetReg); }, 7294 [=](MachineInstrBuilder &MIB) { 7295 // Need to add both immediates here to make sure that they are both 7296 // added to the instruction. 7297 MIB.addImm(SignExtend); 7298 MIB.addImm(1); 7299 }}}; 7300 } 7301 7302 /// This is used for computing addresses like this: 7303 /// 7304 /// ldr x1, [x2, x3, lsl #3] 7305 /// 7306 /// Where x2 is the base register, and x3 is an offset register. The shift-left 7307 /// is a constant value specific to this load instruction. That is, we'll never 7308 /// see anything other than a 3 here (which corresponds to the size of the 7309 /// element being loaded.) 
7310 InstructionSelector::ComplexRendererFns 7311 AArch64InstructionSelector::selectAddrModeShiftedExtendXReg( 7312 MachineOperand &Root, unsigned SizeInBytes) const { 7313 if (!Root.isReg()) 7314 return std::nullopt; 7315 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); 7316 7317 // We want to find something like this: 7318 // 7319 // val = G_CONSTANT LegalShiftVal 7320 // shift = G_SHL off_reg val 7321 // ptr = G_PTR_ADD base_reg shift 7322 // x = G_LOAD ptr 7323 // 7324 // And fold it into this addressing mode: 7325 // 7326 // ldr x, [base_reg, off_reg, lsl #LegalShiftVal] 7327 7328 // Check if we can find the G_PTR_ADD. 7329 MachineInstr *PtrAdd = 7330 getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI); 7331 if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI, true)) 7332 return std::nullopt; 7333 7334 // Now, try to find an opcode which matches our specific offset. 7335 // We want a G_SHL or a G_MUL. 7336 MachineInstr *OffsetInst = 7337 getDefIgnoringCopies(PtrAdd->getOperand(2).getReg(), MRI); 7338 return selectExtendedSHL(Root, PtrAdd->getOperand(1), 7339 OffsetInst->getOperand(0), SizeInBytes, 7340 /*WantsExt=*/false); 7341 } 7342 7343 /// This is used for computing addresses like this: 7344 /// 7345 /// ldr x1, [x2, x3] 7346 /// 7347 /// Where x2 is the base register, and x3 is an offset register. 7348 /// 7349 /// When it is possible (or profitable) to fold a G_PTR_ADD into the address 7350 /// calculation, this will do so. Otherwise, it will return std::nullopt. 7351 InstructionSelector::ComplexRendererFns 7352 AArch64InstructionSelector::selectAddrModeRegisterOffset( 7353 MachineOperand &Root) const { 7354 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); 7355 7356 // We need a GEP. 7357 MachineInstr *Gep = MRI.getVRegDef(Root.getReg()); 7358 if (Gep->getOpcode() != TargetOpcode::G_PTR_ADD) 7359 return std::nullopt; 7360 7361 // If this is used more than once, let's not bother folding. 7362 // TODO: Check if they are memory ops. If they are, then we can still fold 7363 // without having to recompute anything. 7364 if (!MRI.hasOneNonDBGUse(Gep->getOperand(0).getReg())) 7365 return std::nullopt; 7366 7367 // Base is the GEP's LHS, offset is its RHS. 7368 return {{[=](MachineInstrBuilder &MIB) { 7369 MIB.addUse(Gep->getOperand(1).getReg()); 7370 }, 7371 [=](MachineInstrBuilder &MIB) { 7372 MIB.addUse(Gep->getOperand(2).getReg()); 7373 }, 7374 [=](MachineInstrBuilder &MIB) { 7375 // Need to add both immediates here to make sure that they are both 7376 // added to the instruction. 7377 MIB.addImm(0); 7378 MIB.addImm(0); 7379 }}}; 7380 } 7381 7382 /// This is intended to be equivalent to selectAddrModeXRO in 7383 /// AArch64ISelDAGToDAG. It's used for selecting X register offset loads. 7384 InstructionSelector::ComplexRendererFns 7385 AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root, 7386 unsigned SizeInBytes) const { 7387 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); 7388 if (!Root.isReg()) 7389 return std::nullopt; 7390 MachineInstr *PtrAdd = 7391 getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI); 7392 if (!PtrAdd) 7393 return std::nullopt; 7394 7395 // Check for immediates which cannot be encoded in the [base + imm] 7396 // addressing mode, and can't be encoded in an add/sub.
If this happens, we'll 7397 // end up with code like: 7398 // 7399 // mov x0, wide 7400 // add x1, base, x0 7401 // ldr x2, [x1, x0] 7402 // 7403 // In this situation, we can use the [base, xreg] addressing mode to save an 7404 // add/sub: 7405 // 7406 // mov x0, wide 7407 // ldr x2, [base, x0] 7408 auto ValAndVReg = 7409 getIConstantVRegValWithLookThrough(PtrAdd->getOperand(2).getReg(), MRI); 7410 if (ValAndVReg) { 7411 unsigned Scale = Log2_32(SizeInBytes); 7412 int64_t ImmOff = ValAndVReg->Value.getSExtValue(); 7413 7414 // Skip immediates that can be selected in the load/store addressing 7415 // mode. 7416 if (ImmOff % SizeInBytes == 0 && ImmOff >= 0 && 7417 ImmOff < (0x1000 << Scale)) 7418 return std::nullopt; 7419 7420 // Helper lambda to decide whether or not it is preferable to emit an add. 7421 auto isPreferredADD = [](int64_t ImmOff) { 7422 // Constants in [0x0, 0xfff] can be encoded in an add. 7423 if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL) 7424 return true; 7425 7426 // Can it be encoded in an add lsl #12? 7427 if ((ImmOff & 0xffffffffff000fffLL) != 0x0LL) 7428 return false; 7429 7430 // It can be encoded in an add lsl #12, but we may not want to. If it is 7431 // possible to select this as a single movz, then prefer that. A single 7432 // movz is faster than an add with a shift. 7433 return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL && 7434 (ImmOff & 0xffffffffffff0fffLL) != 0x0LL; 7435 }; 7436 7437 // If the immediate can be encoded in a single add/sub, then bail out. 7438 if (isPreferredADD(ImmOff) || isPreferredADD(-ImmOff)) 7439 return std::nullopt; 7440 } 7441 7442 // Try to fold shifts into the addressing mode. 7443 auto AddrModeFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes); 7444 if (AddrModeFns) 7445 return AddrModeFns; 7446 7447 // If that doesn't work, see if it's possible to fold in registers from 7448 // a GEP. 7449 return selectAddrModeRegisterOffset(Root); 7450 } 7451 7452 /// This is used for computing addresses like this: 7453 /// 7454 /// ldr x0, [xBase, wOffset, sxtw #LegalShiftVal] 7455 /// 7456 /// Where we have a 64-bit base register, a 32-bit offset register, and an 7457 /// extend (which may or may not be signed). 7458 InstructionSelector::ComplexRendererFns 7459 AArch64InstructionSelector::selectAddrModeWRO(MachineOperand &Root, 7460 unsigned SizeInBytes) const { 7461 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); 7462 7463 MachineInstr *PtrAdd = 7464 getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI); 7465 if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI, true)) 7466 return std::nullopt; 7467 7468 MachineOperand &LHS = PtrAdd->getOperand(1); 7469 MachineOperand &RHS = PtrAdd->getOperand(2); 7470 MachineInstr *OffsetInst = getDefIgnoringCopies(RHS.getReg(), MRI); 7471 7472 // The first case is the same as selectAddrModeXRO, except we need an extend. 7473 // In this case, we try to find a shift and extend, and fold them into the 7474 // addressing mode. 7475 // 7476 // E.g. 7477 // 7478 // off_reg = G_Z/S/ANYEXT ext_reg 7479 // val = G_CONSTANT LegalShiftVal 7480 // shift = G_SHL off_reg val 7481 // ptr = G_PTR_ADD base_reg shift 7482 // x = G_LOAD ptr 7483 // 7484 // In this case we can get a load like this: 7485 // 7486 // ldr x0, [base_reg, ext_reg, sxtw #LegalShiftVal] 7487 auto ExtendedShl = selectExtendedSHL(Root, LHS, OffsetInst->getOperand(0), 7488 SizeInBytes, /*WantsExt=*/true); 7489 if (ExtendedShl) 7490 return ExtendedShl; 7491 7492 // There was no shift.
We can try and fold a G_Z/S/ANYEXT in alone though. 7493 // 7494 // e.g. 7495 // ldr something, [base_reg, ext_reg, sxtw] 7496 if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI, true)) 7497 return std::nullopt; 7498 7499 // Check if this is an extend. We'll get an extend type if it is. 7500 AArch64_AM::ShiftExtendType Ext = 7501 getExtendTypeForInst(*OffsetInst, MRI, /*IsLoadStore=*/true); 7502 if (Ext == AArch64_AM::InvalidShiftExtend) 7503 return std::nullopt; 7504 7505 // Need a 32-bit wide register. 7506 MachineIRBuilder MIB(*PtrAdd); 7507 Register ExtReg = moveScalarRegClass(OffsetInst->getOperand(1).getReg(), 7508 AArch64::GPR32RegClass, MIB); 7509 unsigned SignExtend = Ext == AArch64_AM::SXTW; 7510 7511 // Base is LHS, offset is ExtReg. 7512 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(LHS.getReg()); }, 7513 [=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); }, 7514 [=](MachineInstrBuilder &MIB) { 7515 MIB.addImm(SignExtend); 7516 MIB.addImm(0); 7517 }}}; 7518 } 7519 7520 /// Select a "register plus unscaled signed 9-bit immediate" address. This 7521 /// should only match when there is an offset that is not valid for a scaled 7522 /// immediate addressing mode. The "Size" argument is the size in bytes of the 7523 /// memory reference, which is needed here to know what is valid for a scaled 7524 /// immediate. 7525 InstructionSelector::ComplexRendererFns 7526 AArch64InstructionSelector::selectAddrModeUnscaled(MachineOperand &Root, 7527 unsigned Size) const { 7528 MachineRegisterInfo &MRI = 7529 Root.getParent()->getParent()->getParent()->getRegInfo(); 7530 7531 if (!Root.isReg()) 7532 return std::nullopt; 7533 7534 if (!isBaseWithConstantOffset(Root, MRI)) 7535 return std::nullopt; 7536 7537 MachineInstr *RootDef = MRI.getVRegDef(Root.getReg()); 7538 7539 MachineOperand &OffImm = RootDef->getOperand(2); 7540 if (!OffImm.isReg()) 7541 return std::nullopt; 7542 MachineInstr *RHS = MRI.getVRegDef(OffImm.getReg()); 7543 if (RHS->getOpcode() != TargetOpcode::G_CONSTANT) 7544 return std::nullopt; 7545 int64_t RHSC; 7546 MachineOperand &RHSOp1 = RHS->getOperand(1); 7547 if (!RHSOp1.isCImm() || RHSOp1.getCImm()->getBitWidth() > 64) 7548 return std::nullopt; 7549 RHSC = RHSOp1.getCImm()->getSExtValue(); 7550 7551 if (RHSC >= -256 && RHSC < 256) { 7552 MachineOperand &Base = RootDef->getOperand(1); 7553 return {{ 7554 [=](MachineInstrBuilder &MIB) { MIB.add(Base); }, 7555 [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC); }, 7556 }}; 7557 } 7558 return std::nullopt; 7559 } 7560 7561 InstructionSelector::ComplexRendererFns 7562 AArch64InstructionSelector::tryFoldAddLowIntoImm(MachineInstr &RootDef, 7563 unsigned Size, 7564 MachineRegisterInfo &MRI) const { 7565 if (RootDef.getOpcode() != AArch64::G_ADD_LOW) 7566 return std::nullopt; 7567 MachineInstr &Adrp = *MRI.getVRegDef(RootDef.getOperand(1).getReg()); 7568 if (Adrp.getOpcode() != AArch64::ADRP) 7569 return std::nullopt; 7570 7571 // TODO: add heuristics like isWorthFoldingADDlow() from SelectionDAG. 
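// Illustrative sketch of what this fold buys (the assembly below is chosen
// for illustration, not taken from the source): instead of
//   adrp x8, g
//   add  x8, x8, :lo12:g
//   ldr  w0, [x8]
// the low 12 bits of the address can be folded into the load itself:
//   adrp x8, g
//   ldr  w0, [x8, :lo12:g]
// This is only sound when the :lo12: offset is a multiple of the access size,
// which is what the offset and alignment checks below enforce.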
7572 auto Offset = Adrp.getOperand(1).getOffset(); 7573 if (Offset % Size != 0) 7574 return std::nullopt; 7575 7576 auto GV = Adrp.getOperand(1).getGlobal(); 7577 if (GV->isThreadLocal()) 7578 return std::nullopt; 7579 7580 auto &MF = *RootDef.getParent()->getParent(); 7581 if (GV->getPointerAlignment(MF.getDataLayout()) < Size) 7582 return std::nullopt; 7583 7584 unsigned OpFlags = STI.ClassifyGlobalReference(GV, MF.getTarget()); 7585 MachineIRBuilder MIRBuilder(RootDef); 7586 Register AdrpReg = Adrp.getOperand(0).getReg(); 7587 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(AdrpReg); }, 7588 [=](MachineInstrBuilder &MIB) { 7589 MIB.addGlobalAddress(GV, Offset, 7590 OpFlags | AArch64II::MO_PAGEOFF | 7591 AArch64II::MO_NC); 7592 }}}; 7593 } 7594 7595 /// Select a "register plus scaled unsigned 12-bit immediate" address. The 7596 /// "Size" argument is the size in bytes of the memory reference, which 7597 /// determines the scale. 7598 InstructionSelector::ComplexRendererFns 7599 AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root, 7600 unsigned Size) const { 7601 MachineFunction &MF = *Root.getParent()->getParent()->getParent(); 7602 MachineRegisterInfo &MRI = MF.getRegInfo(); 7603 7604 if (!Root.isReg()) 7605 return std::nullopt; 7606 7607 MachineInstr *RootDef = MRI.getVRegDef(Root.getReg()); 7608 if (RootDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) { 7609 return {{ 7610 [=](MachineInstrBuilder &MIB) { MIB.add(RootDef->getOperand(1)); }, 7611 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, 7612 }}; 7613 } 7614 7615 CodeModel::Model CM = MF.getTarget().getCodeModel(); 7616 // Check if we can fold in the ADD of small code model ADRP + ADD address. 7617 if (CM == CodeModel::Small) { 7618 auto OpFns = tryFoldAddLowIntoImm(*RootDef, Size, MRI); 7619 if (OpFns) 7620 return OpFns; 7621 } 7622 7623 if (isBaseWithConstantOffset(Root, MRI)) { 7624 MachineOperand &LHS = RootDef->getOperand(1); 7625 MachineOperand &RHS = RootDef->getOperand(2); 7626 MachineInstr *LHSDef = MRI.getVRegDef(LHS.getReg()); 7627 MachineInstr *RHSDef = MRI.getVRegDef(RHS.getReg()); 7628 7629 int64_t RHSC = (int64_t)RHSDef->getOperand(1).getCImm()->getZExtValue(); 7630 unsigned Scale = Log2_32(Size); 7631 if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) { 7632 if (LHSDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) 7633 return {{ 7634 [=](MachineInstrBuilder &MIB) { MIB.add(LHSDef->getOperand(1)); }, 7635 [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); }, 7636 }}; 7637 7638 return {{ 7639 [=](MachineInstrBuilder &MIB) { MIB.add(LHS); }, 7640 [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); }, 7641 }}; 7642 } 7643 } 7644 7645 // Before falling back to our general case, check if the unscaled 7646 // instructions can handle this. If so, that's preferable. 7647 if (selectAddrModeUnscaled(Root, Size)) 7648 return std::nullopt; 7649 7650 return {{ 7651 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, 7652 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, 7653 }}; 7654 } 7655 7656 /// Given a shift instruction, return the correct shift type for that 7657 /// instruction. 

/// Given a shift instruction, return the correct shift type for that
/// instruction.
static AArch64_AM::ShiftExtendType getShiftTypeForInst(MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return AArch64_AM::InvalidShiftExtend;
  case TargetOpcode::G_SHL:
    return AArch64_AM::LSL;
  case TargetOpcode::G_LSHR:
    return AArch64_AM::LSR;
  case TargetOpcode::G_ASHR:
    return AArch64_AM::ASR;
  case TargetOpcode::G_ROTR:
    return AArch64_AM::ROR;
  }
}

/// Select a "shifted register" operand. If the value is not shifted, set the
/// shift operand to a default value of "lsl 0".
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root,
                                                  bool AllowROR) const {
  if (!Root.isReg())
    return std::nullopt;
  MachineRegisterInfo &MRI =
      Root.getParent()->getParent()->getParent()->getRegInfo();

  // Check if the operand is defined by an instruction which corresponds to
  // a ShiftExtendType. E.g. a G_SHL, G_LSHR, etc.
  MachineInstr *ShiftInst = MRI.getVRegDef(Root.getReg());
  AArch64_AM::ShiftExtendType ShType = getShiftTypeForInst(*ShiftInst);
  if (ShType == AArch64_AM::InvalidShiftExtend)
    return std::nullopt;
  if (ShType == AArch64_AM::ROR && !AllowROR)
    return std::nullopt;
  if (!isWorthFoldingIntoExtendedReg(*ShiftInst, MRI, false))
    return std::nullopt;

  // Need an immediate on the RHS.
  MachineOperand &ShiftRHS = ShiftInst->getOperand(2);
  auto Immed = getImmedFromMO(ShiftRHS);
  if (!Immed)
    return std::nullopt;

  // We have something that we can fold. Fold in the shift's LHS and RHS into
  // the instruction.
  MachineOperand &ShiftLHS = ShiftInst->getOperand(1);
  Register ShiftReg = ShiftLHS.getReg();

  unsigned NumBits = MRI.getType(ShiftReg).getSizeInBits();
  unsigned Val = *Immed & (NumBits - 1);
  unsigned ShiftVal = AArch64_AM::getShifterImm(ShType, Val);

  return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ShiftReg); },
           [=](MachineInstrBuilder &MIB) { MIB.addImm(ShiftVal); }}};
}

AArch64_AM::ShiftExtendType AArch64InstructionSelector::getExtendTypeForInst(
    MachineInstr &MI, MachineRegisterInfo &MRI, bool IsLoadStore) const {
  unsigned Opc = MI.getOpcode();

  // Handle explicit extend instructions first.
  if (Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG) {
    unsigned Size;
    if (Opc == TargetOpcode::G_SEXT)
      Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    else
      Size = MI.getOperand(2).getImm();
    assert(Size != 64 && "Extend from 64 bits?");
    switch (Size) {
    case 8:
      return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTB;
    case 16:
      return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTH;
    case 32:
      return AArch64_AM::SXTW;
    default:
      return AArch64_AM::InvalidShiftExtend;
    }
  }

  if (Opc == TargetOpcode::G_ZEXT || Opc == TargetOpcode::G_ANYEXT) {
    unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    assert(Size != 64 && "Extend from 64 bits?");
    switch (Size) {
    case 8:
      return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTB;
    case 16:
      return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTH;
    case 32:
      return AArch64_AM::UXTW;
    default:
      return AArch64_AM::InvalidShiftExtend;
    }
  }
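
  // Note that for loads and stores only a 32-to-64-bit extend (UXTW/SXTW) can
  // be folded into the register-offset addressing mode, which is why the 8-
  // and 16-bit cases above return InvalidShiftExtend when IsLoadStore is set.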

  // Don't have an explicit extend. Try to handle a G_AND with a constant mask
  // on the RHS.
  if (Opc != TargetOpcode::G_AND)
    return AArch64_AM::InvalidShiftExtend;

  std::optional<uint64_t> MaybeAndMask = getImmedFromMO(MI.getOperand(2));
  if (!MaybeAndMask)
    return AArch64_AM::InvalidShiftExtend;
  uint64_t AndMask = *MaybeAndMask;
  switch (AndMask) {
  default:
    return AArch64_AM::InvalidShiftExtend;
  case 0xFF:
    return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend;
  case 0xFFFF:
    return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend;
  case 0xFFFFFFFF:
    return AArch64_AM::UXTW;
  }
}

Register AArch64InstructionSelector::moveScalarRegClass(
    Register Reg, const TargetRegisterClass &RC, MachineIRBuilder &MIB) const {
  MachineRegisterInfo &MRI = *MIB.getMRI();
  auto Ty = MRI.getType(Reg);
  assert(!Ty.isVector() && "Expected scalars only!");
  if (Ty.getSizeInBits() == TRI.getRegSizeInBits(RC))
    return Reg;

  // Create a copy and immediately select it.
  // FIXME: We should have an emitCopy function?
  auto Copy = MIB.buildCopy({&RC}, {Reg});
  selectCopy(*Copy, TII, MRI, TRI, RBI);
  return Copy.getReg(0);
}

/// Select an "extended register" operand. This operand folds in an extend
/// followed by an optional left shift.
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectArithExtendedRegister(
    MachineOperand &Root) const {
  if (!Root.isReg())
    return std::nullopt;
  MachineRegisterInfo &MRI =
      Root.getParent()->getParent()->getParent()->getRegInfo();

  uint64_t ShiftVal = 0;
  Register ExtReg;
  AArch64_AM::ShiftExtendType Ext;
  MachineInstr *RootDef = getDefIgnoringCopies(Root.getReg(), MRI);
  if (!RootDef)
    return std::nullopt;

  if (!isWorthFoldingIntoExtendedReg(*RootDef, MRI, false))
    return std::nullopt;

  // Check if we can fold a shift and an extend.
  if (RootDef->getOpcode() == TargetOpcode::G_SHL) {
    // Look for a constant on the RHS of the shift.
    MachineOperand &RHS = RootDef->getOperand(2);
    std::optional<uint64_t> MaybeShiftVal = getImmedFromMO(RHS);
    if (!MaybeShiftVal)
      return std::nullopt;
    ShiftVal = *MaybeShiftVal;
    if (ShiftVal > 4)
      return std::nullopt;
    // Look for a valid extend instruction on the LHS of the shift.
    MachineOperand &LHS = RootDef->getOperand(1);
    MachineInstr *ExtDef = getDefIgnoringCopies(LHS.getReg(), MRI);
    if (!ExtDef)
      return std::nullopt;
    Ext = getExtendTypeForInst(*ExtDef, MRI);
    if (Ext == AArch64_AM::InvalidShiftExtend)
      return std::nullopt;
    ExtReg = ExtDef->getOperand(1).getReg();
  } else {
    // Didn't get a shift. Try just folding an extend.
    Ext = getExtendTypeForInst(*RootDef, MRI);
    if (Ext == AArch64_AM::InvalidShiftExtend)
      return std::nullopt;
    ExtReg = RootDef->getOperand(1).getReg();

    // If we have a 32 bit instruction which zeroes out the high half of a
    // register, we get an implicit zero extend for free. Check if we have one.
    // FIXME: We actually emit the extend right now even though we don't have
    // to.
    if (Ext == AArch64_AM::UXTW && MRI.getType(ExtReg).getSizeInBits() == 32) {
      MachineInstr *ExtInst = MRI.getVRegDef(ExtReg);
      if (isDef32(*ExtInst))
        return std::nullopt;
    }
  }
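
  // At this point we have an extend (and possibly a left shift) that can be
  // folded into the arithmetic instruction, e.g.
  //   add x0, x1, w2, sxtw #2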

  // We require a GPR32 here. Narrow the ExtReg if needed using a subregister
  // copy.
  MachineIRBuilder MIB(*RootDef);
  ExtReg = moveScalarRegClass(ExtReg, AArch64::GPR32RegClass, MIB);

  return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); },
           [=](MachineInstrBuilder &MIB) {
             MIB.addImm(getArithExtendImm(Ext, ShiftVal));
           }}};
}

InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectExtractHigh(MachineOperand &Root) const {
  if (!Root.isReg())
    return std::nullopt;
  MachineRegisterInfo &MRI =
      Root.getParent()->getParent()->getParent()->getRegInfo();

  auto Extract = getDefSrcRegIgnoringCopies(Root.getReg(), MRI);
  while (Extract && Extract->MI->getOpcode() == TargetOpcode::G_BITCAST &&
         STI.isLittleEndian())
    Extract =
        getDefSrcRegIgnoringCopies(Extract->MI->getOperand(1).getReg(), MRI);
  if (!Extract)
    return std::nullopt;

  if (Extract->MI->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) {
    if (Extract->Reg == Extract->MI->getOperand(1).getReg()) {
      Register ExtReg = Extract->MI->getOperand(2).getReg();
      return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); }}};
    }
  }
  if (Extract->MI->getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT) {
    LLT SrcTy = MRI.getType(Extract->MI->getOperand(1).getReg());
    auto LaneIdx = getIConstantVRegValWithLookThrough(
        Extract->MI->getOperand(2).getReg(), MRI);
    if (LaneIdx && SrcTy == LLT::fixed_vector(2, 64) &&
        LaneIdx->Value.getSExtValue() == 1) {
      Register ExtReg = Extract->MI->getOperand(1).getReg();
      return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); }}};
    }
  }

  return std::nullopt;
}

void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  std::optional<int64_t> CstVal =
      getIConstantVRegSExtVal(MI.getOperand(0).getReg(), MRI);
  assert(CstVal && "Expected constant value");
  MIB.addImm(*CstVal);
}

void AArch64InstructionSelector::renderLogicalImm32(
    MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
  assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
  uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 32);
  MIB.addImm(Enc);
}

void AArch64InstructionSelector::renderLogicalImm64(
    MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
  assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
  uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 64);
  MIB.addImm(Enc);
}

void AArch64InstructionSelector::renderUbsanTrap(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_UBSANTRAP && OpIdx == 0 &&
         "Expected G_UBSANTRAP");
  MIB.addImm(MI.getOperand(0).getImm() | ('U' << 8));
}
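
// The renderFPImm* helpers below translate a G_FCONSTANT into the 8-bit
// immediate encoding used by FMOV (immediate). The AArch64_AM::getFP*Imm
// encoders return -1 when the value is not representable and the result is
// added unmodified, so these renderers rely on the matching pattern having
// already checked that the constant is encodable.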
void AArch64InstructionSelector::renderFPImm16(MachineInstrBuilder &MIB,
                                               const MachineInstr &MI,
                                               int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
         "Expected G_FCONSTANT");
  MIB.addImm(
      AArch64_AM::getFP16Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
}

void AArch64InstructionSelector::renderFPImm32(MachineInstrBuilder &MIB,
                                               const MachineInstr &MI,
                                               int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
         "Expected G_FCONSTANT");
  MIB.addImm(
      AArch64_AM::getFP32Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
}

void AArch64InstructionSelector::renderFPImm64(MachineInstrBuilder &MIB,
                                               const MachineInstr &MI,
                                               int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
         "Expected G_FCONSTANT");
  MIB.addImm(
      AArch64_AM::getFP64Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
}

void AArch64InstructionSelector::renderFPImm32SIMDModImmType4(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
         "Expected G_FCONSTANT");
  MIB.addImm(AArch64_AM::encodeAdvSIMDModImmType4(MI.getOperand(1)
                                                      .getFPImm()
                                                      ->getValueAPF()
                                                      .bitcastToAPInt()
                                                      .getZExtValue()));
}

bool AArch64InstructionSelector::isLoadStoreOfNumBytes(
    const MachineInstr &MI, unsigned NumBytes) const {
  if (!MI.mayLoadOrStore())
    return false;
  assert(MI.hasOneMemOperand() &&
         "Expected load/store to have only one mem op!");
  return (*MI.memoperands_begin())->getSize() == NumBytes;
}

bool AArch64InstructionSelector::isDef32(const MachineInstr &MI) const {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  if (MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() != 32)
    return false;

  // Only return true if we know the operation will zero-out the high half of
  // the 64-bit register. Truncates can be subregister copies, which don't
  // zero out the high bits. Copies and other copy-like instructions can be
  // fed by truncates, or could be lowered as subregister copies.
  switch (MI.getOpcode()) {
  default:
    return true;
  case TargetOpcode::COPY:
  case TargetOpcode::G_BITCAST:
  case TargetOpcode::G_TRUNC:
  case TargetOpcode::G_PHI:
    return false;
  }
}

// Perform fixups on the given PHI instruction's operands to force them all
// to be the same as the destination regbank.
static void fixupPHIOpBanks(MachineInstr &MI, MachineRegisterInfo &MRI,
                            const AArch64RegisterBankInfo &RBI) {
  assert(MI.getOpcode() == TargetOpcode::G_PHI && "Expected a G_PHI");
  Register DstReg = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = MRI.getRegBankOrNull(DstReg);
  assert(DstRB && "Expected PHI dst to have regbank assigned");
  MachineIRBuilder MIB(MI);

  // Go through each operand and ensure it has the same regbank.
  for (MachineOperand &MO : llvm::drop_begin(MI.operands())) {
    if (!MO.isReg())
      continue;
    Register OpReg = MO.getReg();
    const RegisterBank *RB = MRI.getRegBankOrNull(OpReg);
    if (RB != DstRB) {
      // Insert a cross-bank copy.
      auto *OpDef = MRI.getVRegDef(OpReg);
      const LLT &Ty = MRI.getType(OpReg);
      MachineBasicBlock &OpDefBB = *OpDef->getParent();

      // Any instruction we insert must appear after all PHIs in the block
      // for the block to be valid MIR.
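      // If OpDef is itself a PHI, std::next would still point at a PHI, so
      // skip ahead to the first non-PHI instruction in that case.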
      MachineBasicBlock::iterator InsertPt = std::next(OpDef->getIterator());
      if (InsertPt != OpDefBB.end() && InsertPt->isPHI())
        InsertPt = OpDefBB.getFirstNonPHI();
      MIB.setInsertPt(*OpDef->getParent(), InsertPt);
      auto Copy = MIB.buildCopy(Ty, OpReg);
      MRI.setRegBank(Copy.getReg(0), *DstRB);
      MO.setReg(Copy.getReg(0));
    }
  }
}

void AArch64InstructionSelector::processPHIs(MachineFunction &MF) {
  // We're looking for PHIs, build a list so we don't invalidate iterators.
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<MachineInstr *, 32> Phis;
  for (auto &BB : MF) {
    for (auto &MI : BB) {
      if (MI.getOpcode() == TargetOpcode::G_PHI)
        Phis.emplace_back(&MI);
    }
  }

  for (auto *MI : Phis) {
    // We need to do some work here if the operand types are < 16 bit and they
    // are split across fpr/gpr banks. Since all types <32b on gpr
    // end up being assigned gpr32 regclasses, we can end up with PHIs here
    // which try to select between a gpr32 and an fpr16. Ideally RBS shouldn't
    // be selecting heterogeneous regbanks for operands if possible, but we
    // still need to be able to deal with it here.
    //
    // To fix this, if we have a gpr-bank operand < 32b in size and at least
    // one other operand is on the fpr bank, then we add cross-bank copies
    // to homogenize the operand banks. For simplicity the bank that we choose
    // to settle on is whatever bank the def operand has. For example:
    //
    // %endbb:
    //   %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2:fpr(s16), %bb2
    // =>
    // %bb2:
    //   ...
    //   %in2_copy:gpr(s16) = COPY %in2:fpr(s16)
    //   ...
    // %endbb:
    //   %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2_copy:gpr(s16), %bb2
    bool HasGPROp = false, HasFPROp = false;
    for (const MachineOperand &MO : llvm::drop_begin(MI->operands())) {
      if (!MO.isReg())
        continue;
      const LLT &Ty = MRI.getType(MO.getReg());
      if (!Ty.isValid() || !Ty.isScalar())
        break;
      if (Ty.getSizeInBits() >= 32)
        break;
      const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg());
      // If for some reason we don't have a regbank yet, don't try anything.
      if (!RB)
        break;

      if (RB->getID() == AArch64::GPRRegBankID)
        HasGPROp = true;
      else
        HasFPROp = true;
    }
    // We have heterogeneous regbanks; we need to fix them up.
    if (HasGPROp && HasFPROp)
      fixupPHIOpBanks(*MI, MRI, RBI);
  }
}

namespace llvm {
InstructionSelector *
createAArch64InstructionSelector(const AArch64TargetMachine &TM,
                                 const AArch64Subtarget &Subtarget,
                                 const AArch64RegisterBankInfo &RBI) {
  return new AArch64InstructionSelector(TM, Subtarget, RBI);
}
}