//=== lib/CodeGen/GlobalISel/AArch64PreLegalizerCombiner.cpp --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// before the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AArch64GlobalISelUtils.h"
#include "AArch64TargetMachine.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Instructions.h"

#define GET_GICOMBINER_DEPS
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_DEPS

#define DEBUG_TYPE "aarch64-prelegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

namespace {

#define GET_GICOMBINER_TYPES
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_TYPES

/// Return true if a G_FCONSTANT instruction is known to be better-represented
/// as a G_CONSTANT.
bool matchFConstantToConstant(MachineInstr &MI, MachineRegisterInfo &MRI) {
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT);
  Register DstReg = MI.getOperand(0).getReg();
  const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
  if (DstSize != 32 && DstSize != 64)
    return false;

  // When we're storing a value, it doesn't matter what register bank it's on.
  // Since not all floating point constants can be materialized using a fmov,
  // it makes more sense to just use a GPR.
  return all_of(MRI.use_nodbg_instructions(DstReg),
                [](const MachineInstr &Use) { return Use.mayStore(); });
}

/// Change a G_FCONSTANT into a G_CONSTANT.
void applyFConstantToConstant(MachineInstr &MI) {
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT);
  MachineIRBuilder MIB(MI);
  const APFloat &ImmValAPF = MI.getOperand(1).getFPImm()->getValueAPF();
  MIB.buildConstant(MI.getOperand(0).getReg(), ImmValAPF.bitcastToAPInt());
  MI.eraseFromParent();
}

/// Try to match a G_ICMP of a G_TRUNC with zero, in which the truncated bits
/// are sign bits. In this case, we can transform the G_ICMP to directly
/// compare the wide value with a zero.
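/// Illustrative sketch (hypothetical virtual registers, not from a real test):
///
///   %trunc:_(s8) = G_TRUNC %wide:_(s32)
///   %zero:_(s8) = G_CONSTANT i8 0
///   %cmp:_(s1) = G_ICMP intpred(eq), %trunc(s8), %zero
///
/// becomes, when the 24 truncated bits are all sign bits of %wide,
///
///   %zero32:_(s32) = G_CONSTANT i32 0
///   %cmp:_(s1) = G_ICMP intpred(eq), %wide(s32), %zero32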
bool matchICmpRedundantTrunc(MachineInstr &MI, MachineRegisterInfo &MRI,
                             GISelKnownBits *KB, Register &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_ICMP && KB);

  auto Pred = (CmpInst::Predicate)MI.getOperand(1).getPredicate();
  if (!ICmpInst::isEquality(Pred))
    return false;

  Register LHS = MI.getOperand(2).getReg();
  LLT LHSTy = MRI.getType(LHS);
  if (!LHSTy.isScalar())
    return false;

  Register RHS = MI.getOperand(3).getReg();
  Register WideReg;

  if (!mi_match(LHS, MRI, m_GTrunc(m_Reg(WideReg))) ||
      !mi_match(RHS, MRI, m_SpecificICst(0)))
    return false;

  LLT WideTy = MRI.getType(WideReg);
  if (KB->computeNumSignBits(WideReg) <=
      WideTy.getSizeInBits() - LHSTy.getSizeInBits())
    return false;

  MatchInfo = WideReg;
  return true;
}

void applyICmpRedundantTrunc(MachineInstr &MI, MachineRegisterInfo &MRI,
                             MachineIRBuilder &Builder,
                             GISelChangeObserver &Observer, Register &WideReg) {
  assert(MI.getOpcode() == TargetOpcode::G_ICMP);

  LLT WideTy = MRI.getType(WideReg);
  // We're going to directly use the wide register as the LHS, and then use an
  // equivalent size zero for RHS.
  Builder.setInstrAndDebugLoc(MI);
  auto WideZero = Builder.buildConstant(WideTy, 0);
  Observer.changingInstr(MI);
  MI.getOperand(2).setReg(WideReg);
  MI.getOperand(3).setReg(WideZero.getReg(0));
  Observer.changedInstr(MI);
}

/// \returns true if it is possible to fold a constant into a G_GLOBAL_VALUE.
///
/// e.g.
///
/// %g = G_GLOBAL_VALUE @x -> %g = G_GLOBAL_VALUE @x + cst
bool matchFoldGlobalOffset(MachineInstr &MI, MachineRegisterInfo &MRI,
                           std::pair<uint64_t, uint64_t> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE);
  MachineFunction &MF = *MI.getMF();
  auto &GlobalOp = MI.getOperand(1);
  auto *GV = GlobalOp.getGlobal();
  if (GV->isThreadLocal())
    return false;

  // Don't allow anything that could represent offsets etc.
  if (MF.getSubtarget<AArch64Subtarget>().ClassifyGlobalReference(
          GV, MF.getTarget()) != AArch64II::MO_NO_FLAG)
    return false;

  // Look for a G_GLOBAL_VALUE only used by G_PTR_ADDs against constants:
  //
  //  %g = G_GLOBAL_VALUE @x
  //  %ptr1 = G_PTR_ADD %g, cst1
  //  %ptr2 = G_PTR_ADD %g, cst2
  //  ...
  //  %ptrN = G_PTR_ADD %g, cstN
  //
  // Identify the *smallest* constant. We want to be able to form this:
  //
  //  %offset_g = G_GLOBAL_VALUE @x + min_cst
  //  %g = G_PTR_ADD %offset_g, -min_cst
  //  %ptr1 = G_PTR_ADD %g, cst1
  //  ...
  Register Dst = MI.getOperand(0).getReg();
  uint64_t MinOffset = -1ull;
  for (auto &UseInstr : MRI.use_nodbg_instructions(Dst)) {
    if (UseInstr.getOpcode() != TargetOpcode::G_PTR_ADD)
      return false;
    auto Cst = getIConstantVRegValWithLookThrough(
        UseInstr.getOperand(2).getReg(), MRI);
    if (!Cst)
      return false;
    MinOffset = std::min(MinOffset, Cst->Value.getZExtValue());
  }

  // Require that the new offset is larger than the existing one to avoid
  // infinite loops.
  uint64_t CurrOffset = GlobalOp.getOffset();
  uint64_t NewOffset = MinOffset + CurrOffset;
  if (NewOffset <= CurrOffset)
    return false;

  // Check whether folding this offset is legal. It must not go out of bounds
  // of the referenced object to avoid violating the code model, and must be
  // smaller than 2^20 because this is the largest offset expressible in all
  // object formats.
  // (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
  // stores an immediate signed 21 bit offset.)
  //
  // This check also prevents us from folding negative offsets, which will end
  // up being treated in the same way as large positive ones. They could also
  // cause code model violations, and aren't really common enough to matter.
  if (NewOffset >= (1 << 20))
    return false;

  Type *T = GV->getValueType();
  if (!T->isSized() ||
      NewOffset > GV->getDataLayout().getTypeAllocSize(T))
    return false;
  MatchInfo = std::make_pair(NewOffset, MinOffset);
  return true;
}

void applyFoldGlobalOffset(MachineInstr &MI, MachineRegisterInfo &MRI,
                           MachineIRBuilder &B, GISelChangeObserver &Observer,
                           std::pair<uint64_t, uint64_t> &MatchInfo) {
  // Change:
  //
  //  %g = G_GLOBAL_VALUE @x
  //  %ptr1 = G_PTR_ADD %g, cst1
  //  %ptr2 = G_PTR_ADD %g, cst2
  //  ...
  //  %ptrN = G_PTR_ADD %g, cstN
  //
  // To:
  //
  //  %offset_g = G_GLOBAL_VALUE @x + min_cst
  //  %g = G_PTR_ADD %offset_g, -min_cst
  //  %ptr1 = G_PTR_ADD %g, cst1
  //  ...
  //  %ptrN = G_PTR_ADD %g, cstN
  //
  // Then, the original G_PTR_ADDs should be folded later on so that they look
  // like this:
  //
  //  %ptrN = G_PTR_ADD %offset_g, cstN - min_cst
  uint64_t Offset, MinOffset;
  std::tie(Offset, MinOffset) = MatchInfo;
  B.setInstrAndDebugLoc(*std::next(MI.getIterator()));
  Observer.changingInstr(MI);
  auto &GlobalOp = MI.getOperand(1);
  auto *GV = GlobalOp.getGlobal();
  GlobalOp.ChangeToGA(GV, Offset, GlobalOp.getTargetFlags());
  Register Dst = MI.getOperand(0).getReg();
  Register NewGVDst = MRI.cloneVirtualRegister(Dst);
  MI.getOperand(0).setReg(NewGVDst);
  Observer.changedInstr(MI);
  B.buildPtrAdd(
      Dst, NewGVDst,
      B.buildConstant(LLT::scalar(64), -static_cast<int64_t>(MinOffset)));
}

// Combines vecreduce_add(mul(ext(x), ext(y))) -> vecreduce_add(udot(x, y))
// Or vecreduce_add(ext(x)) -> vecreduce_add(udot(x, 1))
// Similar to performVecReduceAddCombine in SelectionDAG
bool matchExtAddvToUdotAddv(MachineInstr &MI, MachineRegisterInfo &MRI,
                            const AArch64Subtarget &STI,
                            std::tuple<Register, Register, bool> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
         "Expected a G_VECREDUCE_ADD instruction");
  assert(STI.hasDotProd() && "Target should have Dot Product feature");

  MachineInstr *I1 = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
  Register DstReg = MI.getOperand(0).getReg();
  Register MidReg = I1->getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT MidTy = MRI.getType(MidReg);
  if (DstTy.getScalarSizeInBits() != 32 || MidTy.getScalarSizeInBits() != 32)
    return false;

  LLT SrcTy;
  auto I1Opc = I1->getOpcode();
  if (I1Opc == TargetOpcode::G_MUL) {
    // If the result of this has more than 1 use, then there is no point in
    // creating a udot instruction.
    if (!MRI.hasOneNonDBGUse(MidReg))
      return false;

    MachineInstr *ExtMI1 =
        getDefIgnoringCopies(I1->getOperand(1).getReg(), MRI);
    MachineInstr *ExtMI2 =
        getDefIgnoringCopies(I1->getOperand(2).getReg(), MRI);
    LLT Ext1DstTy = MRI.getType(ExtMI1->getOperand(0).getReg());
    LLT Ext2DstTy = MRI.getType(ExtMI2->getOperand(0).getReg());

    if (ExtMI1->getOpcode() != ExtMI2->getOpcode() || Ext1DstTy != Ext2DstTy)
      return false;
    I1Opc = ExtMI1->getOpcode();
    SrcTy =
        MRI.getType(ExtMI1->getOperand(1).getReg());
    std::get<0>(MatchInfo) = ExtMI1->getOperand(1).getReg();
    std::get<1>(MatchInfo) = ExtMI2->getOperand(1).getReg();
  } else {
    SrcTy = MRI.getType(I1->getOperand(1).getReg());
    std::get<0>(MatchInfo) = I1->getOperand(1).getReg();
    std::get<1>(MatchInfo) = 0;
  }

  if (I1Opc == TargetOpcode::G_ZEXT)
    std::get<2>(MatchInfo) = 0;
  else if (I1Opc == TargetOpcode::G_SEXT)
    std::get<2>(MatchInfo) = 1;
  else
    return false;

  if (SrcTy.getScalarSizeInBits() != 8 || SrcTy.getNumElements() % 8 != 0)
    return false;

  return true;
}

void applyExtAddvToUdotAddv(MachineInstr &MI, MachineRegisterInfo &MRI,
                            MachineIRBuilder &Builder,
                            GISelChangeObserver &Observer,
                            const AArch64Subtarget &STI,
                            std::tuple<Register, Register, bool> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
         "Expected a G_VECREDUCE_ADD instruction");
  assert(STI.hasDotProd() && "Target should have Dot Product feature");

  // Initialise the variables
  unsigned DotOpcode =
      std::get<2>(MatchInfo) ? AArch64::G_SDOT : AArch64::G_UDOT;
  Register Ext1SrcReg = std::get<0>(MatchInfo);

  // If there is only one source register, create a vector of 1s as the second
  // source register so the dot product reduces to a plain sum.
  Register Ext2SrcReg;
  if (std::get<1>(MatchInfo) == 0)
    Ext2SrcReg = Builder.buildConstant(MRI.getType(Ext1SrcReg), 1)
                     ->getOperand(0)
                     .getReg();
  else
    Ext2SrcReg = std::get<1>(MatchInfo);

  // Find out how many DOT instructions are needed
  LLT SrcTy = MRI.getType(Ext1SrcReg);
  LLT MidTy;
  unsigned NumOfDotMI;
  if (SrcTy.getNumElements() % 16 == 0) {
    NumOfDotMI = SrcTy.getNumElements() / 16;
    MidTy = LLT::fixed_vector(4, 32);
  } else if (SrcTy.getNumElements() % 8 == 0) {
    NumOfDotMI = SrcTy.getNumElements() / 8;
    MidTy = LLT::fixed_vector(2, 32);
  } else {
    llvm_unreachable("Source type number of elements is not multiple of 8");
  }

  // Handle case where one DOT instruction is needed
  if (NumOfDotMI == 1) {
    auto Zeroes = Builder.buildConstant(MidTy, 0)->getOperand(0).getReg();
    auto Dot = Builder.buildInstr(DotOpcode, {MidTy},
                                  {Zeroes, Ext1SrcReg, Ext2SrcReg});
    Builder.buildVecReduceAdd(MI.getOperand(0), Dot->getOperand(0));
  } else {
    // If not, pad the last v8 element with 0s to a v16
    SmallVector<Register, 4> Ext1UnmergeReg;
    SmallVector<Register, 4> Ext2UnmergeReg;
    if (SrcTy.getNumElements() % 16 != 0) {
      SmallVector<Register> Leftover1;
      SmallVector<Register> Leftover2;

      // Split the elements into v16i8 and v8i8
      LLT MainTy = LLT::fixed_vector(16, 8);
      LLT LeftoverTy1, LeftoverTy2;
      if ((!extractParts(Ext1SrcReg, MRI.getType(Ext1SrcReg), MainTy,
                         LeftoverTy1, Ext1UnmergeReg, Leftover1, Builder,
                         MRI)) ||
          (!extractParts(Ext2SrcReg, MRI.getType(Ext2SrcReg), MainTy,
                         LeftoverTy2, Ext2UnmergeReg, Leftover2, Builder,
                         MRI))) {
        llvm_unreachable("Unable to split this vector properly");
      }

      // Pad the leftover v8i8 vector with a register of 0s of type v8i8
      Register v8Zeroes = Builder.buildConstant(LLT::fixed_vector(8, 8), 0)
                              ->getOperand(0)
                              .getReg();

      Ext1UnmergeReg.push_back(
          Builder
              .buildMergeLikeInstr(LLT::fixed_vector(16, 8),
                                   {Leftover1[0], v8Zeroes})
              .getReg(0));
      Ext2UnmergeReg.push_back(
          Builder
              .buildMergeLikeInstr(LLT::fixed_vector(16, 8),
                                   {Leftover2[0], v8Zeroes})
              .getReg(0));

    } else {
      // Unmerge the source vectors to v16i8
      unsigned SrcNumElts = SrcTy.getNumElements();
      extractParts(Ext1SrcReg, LLT::fixed_vector(16, 8), SrcNumElts / 16,
                   Ext1UnmergeReg, Builder, MRI);
      extractParts(Ext2SrcReg, LLT::fixed_vector(16, 8), SrcNumElts / 16,
                   Ext2UnmergeReg, Builder, MRI);
    }

    // Build the UDOT instructions
    SmallVector<Register, 2> DotReg;
    unsigned NumElements = 0;
    for (unsigned i = 0; i < Ext1UnmergeReg.size(); i++) {
      LLT ZeroesLLT;
      // Check if it is 16 or 8 elements. Set Zeroes to the corresponding size.
      if (MRI.getType(Ext1UnmergeReg[i]).getNumElements() == 16) {
        ZeroesLLT = LLT::fixed_vector(4, 32);
        NumElements += 4;
      } else {
        ZeroesLLT = LLT::fixed_vector(2, 32);
        NumElements += 2;
      }
      auto Zeroes = Builder.buildConstant(ZeroesLLT, 0)->getOperand(0).getReg();
      DotReg.push_back(
          Builder
              .buildInstr(DotOpcode, {MRI.getType(Zeroes)},
                          {Zeroes, Ext1UnmergeReg[i], Ext2UnmergeReg[i]})
              .getReg(0));
    }

    // Merge the output
    auto ConcatMI =
        Builder.buildConcatVectors(LLT::fixed_vector(NumElements, 32), DotReg);

    // Put it through a vector reduction
    Builder.buildVecReduceAdd(MI.getOperand(0).getReg(),
                              ConcatMI->getOperand(0).getReg());
  }

  // Erase the dead instructions
  MI.eraseFromParent();
}

// Matches {U/S}ADDV(ext(x)) => {U/S}ADDLV(x)
// Ensure that the type coming from the extend instruction is the right size
bool matchExtUaddvToUaddlv(MachineInstr &MI, MachineRegisterInfo &MRI,
                           std::pair<Register, bool> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
         "Expected G_VECREDUCE_ADD Opcode");

  // Check if the last instruction is an extend
  MachineInstr *ExtMI = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
  auto ExtOpc = ExtMI->getOpcode();

  if (ExtOpc == TargetOpcode::G_ZEXT)
    std::get<1>(MatchInfo) = 0;
  else if (ExtOpc == TargetOpcode::G_SEXT)
    std::get<1>(MatchInfo) = 1;
  else
    return false;

  // Check if the source register is a valid type
  Register ExtSrcReg = ExtMI->getOperand(1).getReg();
  LLT ExtSrcTy = MRI.getType(ExtSrcReg);
  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
  if ((DstTy.getScalarSizeInBits() == 16 &&
       ExtSrcTy.getNumElements() % 8 == 0 && ExtSrcTy.getNumElements() < 256) ||
      (DstTy.getScalarSizeInBits() == 32 &&
       ExtSrcTy.getNumElements() % 4 == 0) ||
      (DstTy.getScalarSizeInBits() == 64 &&
       ExtSrcTy.getNumElements() % 4 == 0)) {
    std::get<0>(MatchInfo) = ExtSrcReg;
    return true;
  }
  return false;
}

void applyExtUaddvToUaddlv(MachineInstr &MI, MachineRegisterInfo &MRI,
                           MachineIRBuilder &B, GISelChangeObserver &Observer,
                           std::pair<Register, bool> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
         "Expected G_VECREDUCE_ADD Opcode");

  unsigned Opc = std::get<1>(MatchInfo) ?
      AArch64::G_SADDLV : AArch64::G_UADDLV;
  Register SrcReg = std::get<0>(MatchInfo);
  Register DstReg = MI.getOperand(0).getReg();
  LLT SrcTy = MRI.getType(SrcReg);
  LLT DstTy = MRI.getType(DstReg);

  // If SrcTy has more elements than expected, split them into multiple
  // instructions and sum the results
  LLT MainTy;
  SmallVector<Register, 1> WorkingRegisters;
  unsigned SrcScalSize = SrcTy.getScalarSizeInBits();
  unsigned SrcNumElem = SrcTy.getNumElements();
  if ((SrcScalSize == 8 && SrcNumElem > 16) ||
      (SrcScalSize == 16 && SrcNumElem > 8) ||
      (SrcScalSize == 32 && SrcNumElem > 4)) {

    LLT LeftoverTy;
    SmallVector<Register, 4> LeftoverRegs;
    if (SrcScalSize == 8)
      MainTy = LLT::fixed_vector(16, 8);
    else if (SrcScalSize == 16)
      MainTy = LLT::fixed_vector(8, 16);
    else if (SrcScalSize == 32)
      MainTy = LLT::fixed_vector(4, 32);
    else
      llvm_unreachable("Source's Scalar Size not supported");

    // Extract the parts, put each extracted source through U/SADDLV, and put
    // the values inside a small vec
    extractParts(SrcReg, SrcTy, MainTy, LeftoverTy, WorkingRegisters,
                 LeftoverRegs, B, MRI);
    for (unsigned I = 0; I < LeftoverRegs.size(); I++) {
      WorkingRegisters.push_back(LeftoverRegs[I]);
    }
  } else {
    WorkingRegisters.push_back(SrcReg);
    MainTy = SrcTy;
  }

  unsigned MidScalarSize = MainTy.getScalarSizeInBits() * 2;
  LLT MidScalarLLT = LLT::scalar(MidScalarSize);
  Register zeroReg = B.buildConstant(LLT::scalar(64), 0).getReg(0);
  for (unsigned I = 0; I < WorkingRegisters.size(); I++) {
    // If the number of elements is too small to build an instruction, extend
    // its size before applying addlv
    LLT WorkingRegTy = MRI.getType(WorkingRegisters[I]);
    if ((WorkingRegTy.getScalarSizeInBits() == 8) &&
        (WorkingRegTy.getNumElements() == 4)) {
      WorkingRegisters[I] =
          B.buildInstr(std::get<1>(MatchInfo) ? TargetOpcode::G_SEXT
                                              : TargetOpcode::G_ZEXT,
                       {LLT::fixed_vector(4, 16)}, {WorkingRegisters[I]})
              .getReg(0);
    }

    // Generate the {U/S}ADDLV instruction, whose output is always double the
    // Src's scalar size
    LLT addlvTy = MidScalarSize <= 32 ? LLT::fixed_vector(4, 32)
                                      : LLT::fixed_vector(2, 64);
    Register addlvReg =
        B.buildInstr(Opc, {addlvTy}, {WorkingRegisters[I]}).getReg(0);

    // The output from {U/S}ADDLV gets placed in the lowest lane of a v4i32 or
    // v2i64 register.
    // i16, i32 results use v4i32 registers
    // i64 results use v2i64 registers
    // Therefore we have to extract/truncate the value to the right type
    if (MidScalarSize == 32 || MidScalarSize == 64) {
      WorkingRegisters[I] = B.buildInstr(AArch64::G_EXTRACT_VECTOR_ELT,
                                         {MidScalarLLT}, {addlvReg, zeroReg})
                                .getReg(0);
    } else {
      Register extractReg = B.buildInstr(AArch64::G_EXTRACT_VECTOR_ELT,
                                         {LLT::scalar(32)}, {addlvReg, zeroReg})
                                .getReg(0);
      WorkingRegisters[I] =
          B.buildTrunc({MidScalarLLT}, {extractReg}).getReg(0);
    }
  }

  Register outReg;
  if (WorkingRegisters.size() > 1) {
    outReg = B.buildAdd(MidScalarLLT, WorkingRegisters[0], WorkingRegisters[1])
                 .getReg(0);
    for (unsigned I = 2; I < WorkingRegisters.size(); I++) {
      outReg = B.buildAdd(MidScalarLLT, outReg, WorkingRegisters[I]).getReg(0);
    }
  } else {
    outReg = WorkingRegisters[0];
  }

  if (DstTy.getScalarSizeInBits() > MidScalarSize) {
    // Handle the scalar value if the DstTy's scalar size is more than double
    // the Src's scalar size
    B.buildInstr(std::get<1>(MatchInfo) ? TargetOpcode::G_SEXT
                                        : TargetOpcode::G_ZEXT,
                 {DstReg}, {outReg});
  } else {
    B.buildCopy(DstReg, outReg);
  }

  MI.eraseFromParent();
}

// Pushes ADD/SUB through extend instructions to decrease the number of extend
// instructions at the end by allowing selection of {s|u}addl sooner

// i32 add(i32 ext i8, i32 ext i8) => i32 ext(i16 add(i16 ext i8, i16 ext i8))
bool matchPushAddSubExt(MachineInstr &MI, MachineRegisterInfo &MRI,
                        Register DstReg, Register SrcReg1, Register SrcReg2) {
  assert((MI.getOpcode() == TargetOpcode::G_ADD ||
          MI.getOpcode() == TargetOpcode::G_SUB) &&
         "Expected a G_ADD or G_SUB instruction\n");

  // Deal with vector types only
  LLT DstTy = MRI.getType(DstReg);
  if (!DstTy.isVector())
    return false;

  // Return true if the G_{S|Z}EXT instruction is more than 2x the source size
  Register ExtDstReg = MI.getOperand(1).getReg();
  LLT Ext1SrcTy = MRI.getType(SrcReg1);
  LLT Ext2SrcTy = MRI.getType(SrcReg2);
  unsigned ExtDstScal = MRI.getType(ExtDstReg).getScalarSizeInBits();
  unsigned Ext1SrcScal = Ext1SrcTy.getScalarSizeInBits();
  if (((Ext1SrcScal == 8 && ExtDstScal == 32) ||
       ((Ext1SrcScal == 8 || Ext1SrcScal == 16) && ExtDstScal == 64)) &&
      Ext1SrcTy == Ext2SrcTy)
    return true;

  return false;
}

void applyPushAddSubExt(MachineInstr &MI, MachineRegisterInfo &MRI,
                        MachineIRBuilder &B, bool isSExt, Register DstReg,
                        Register SrcReg1, Register SrcReg2) {
  LLT SrcTy = MRI.getType(SrcReg1);
  LLT MidTy = SrcTy.changeElementSize(SrcTy.getScalarSizeInBits() * 2);
  unsigned Opc = isSExt ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
  Register Ext1Reg = B.buildInstr(Opc, {MidTy}, {SrcReg1}).getReg(0);
  Register Ext2Reg = B.buildInstr(Opc, {MidTy}, {SrcReg2}).getReg(0);
  Register AddReg =
      B.buildInstr(MI.getOpcode(), {MidTy}, {Ext1Reg, Ext2Reg}).getReg(0);

  // G_SUB has to sign-extend the result.
  // G_ADD needs to sext from sext and can sext or zext from zext, so the
  // original opcode is used.
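  // Illustrative sketch for the G_SUB case with zero-extended v4i8 sources:
  //   v4i32 sub(zext(a), zext(b))
  //     -> v4i32 sext(v4i16 sub(zext(a), zext(b)))
  // The narrow difference may be negative, hence the final sign extension.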
  if (MI.getOpcode() == TargetOpcode::G_ADD)
    B.buildInstr(Opc, {DstReg}, {AddReg});
  else
    B.buildSExt(DstReg, AddReg);

  MI.eraseFromParent();
}

bool tryToSimplifyUADDO(MachineInstr &MI, MachineIRBuilder &B,
                        const CombinerHelper &Helper,
                        GISelChangeObserver &Observer) {
  // Try to simplify G_UADDO with 8 or 16 bit operands to wide G_ADD and TBNZ
  // if the result is only used in the no-overflow case. It is restricted to
  // cases where we know that the high-bits of the operands are 0. If there's
  // an overflow, then the 9th or 17th bit must be set, which can be checked
  // using TBNZ.
  //
  // Change (for UADDOs on 8 and 16 bits):
  //
  //   %z0 = G_ASSERT_ZEXT _
  //   %op0 = G_TRUNC %z0
  //   %z1 = G_ASSERT_ZEXT _
  //   %op1 = G_TRUNC %z1
  //   %val, %cond = G_UADDO %op0, %op1
  //   G_BRCOND %cond, %error.bb
  //
  // error.bb:
  //   (no successors and no uses of %val)
  //
  // To:
  //
  //   %z0 = G_ASSERT_ZEXT _
  //   %z1 = G_ASSERT_ZEXT _
  //   %add = G_ADD %z0, %z1
  //   %val = G_TRUNC %add
  //   %bit = G_AND %add, 1 << scalar-size-in-bits(%op1)
  //   %cond = G_ICMP NE, %bit, 0
  //   G_BRCOND %cond, %error.bb

  auto &MRI = *B.getMRI();

  MachineOperand *DefOp0 = MRI.getOneDef(MI.getOperand(2).getReg());
  MachineOperand *DefOp1 = MRI.getOneDef(MI.getOperand(3).getReg());
  Register Op0Wide;
  Register Op1Wide;
  if (!mi_match(DefOp0->getParent(), MRI, m_GTrunc(m_Reg(Op0Wide))) ||
      !mi_match(DefOp1->getParent(), MRI, m_GTrunc(m_Reg(Op1Wide))))
    return false;
  LLT WideTy0 = MRI.getType(Op0Wide);
  LLT WideTy1 = MRI.getType(Op1Wide);
  Register ResVal = MI.getOperand(0).getReg();
  LLT OpTy = MRI.getType(ResVal);
  MachineInstr *Op0WideDef = MRI.getVRegDef(Op0Wide);
  MachineInstr *Op1WideDef = MRI.getVRegDef(Op1Wide);

  unsigned OpTySize = OpTy.getScalarSizeInBits();
  // First check that the G_TRUNC feeding the G_UADDO are no-ops, because the
  // inputs have been zero-extended.
  if (Op0WideDef->getOpcode() != TargetOpcode::G_ASSERT_ZEXT ||
      Op1WideDef->getOpcode() != TargetOpcode::G_ASSERT_ZEXT ||
      OpTySize != Op0WideDef->getOperand(2).getImm() ||
      OpTySize != Op1WideDef->getOperand(2).getImm())
    return false;

  // Only scalar UADDO with either 8 or 16 bit operands are handled.
  if (!WideTy0.isScalar() || !WideTy1.isScalar() || WideTy0 != WideTy1 ||
      OpTySize >= WideTy0.getScalarSizeInBits() ||
      (OpTySize != 8 && OpTySize != 16))
    return false;

  // The overflow-status result must be used by a branch only.
  Register ResStatus = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(ResStatus))
    return false;
  MachineInstr *CondUser = &*MRI.use_instr_nodbg_begin(ResStatus);
  if (CondUser->getOpcode() != TargetOpcode::G_BRCOND)
    return false;

  // Make sure the computed result is only used in the no-overflow blocks.
  MachineBasicBlock *CurrentMBB = MI.getParent();
  MachineBasicBlock *FailMBB = CondUser->getOperand(1).getMBB();
  if (!FailMBB->succ_empty() || CondUser->getParent() != CurrentMBB)
    return false;
  if (any_of(MRI.use_nodbg_instructions(ResVal),
             [&MI, FailMBB, CurrentMBB](MachineInstr &I) {
               return &MI != &I &&
                      (I.getParent() == FailMBB || I.getParent() == CurrentMBB);
             }))
    return false;

  // Remove G_ADDO.
  B.setInstrAndDebugLoc(*MI.getNextNode());
  MI.eraseFromParent();

  // Emit wide add.
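  // (Both wide operands come from G_ASSERT_ZEXT with only the low 8/16 bits
  // possibly set, so the sum fits in the wide type and an overflow of the
  // narrow add shows up exactly in bit scalar-size-in-bits(%op1).)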
  Register AddDst = MRI.cloneVirtualRegister(Op0Wide);
  B.buildInstr(TargetOpcode::G_ADD, {AddDst}, {Op0Wide, Op1Wide});

  // Emit check of the 9th or 17th bit and update users (the branch). This will
  // later be folded to TBNZ.
  Register CondBit = MRI.cloneVirtualRegister(Op0Wide);
  B.buildAnd(
      CondBit, AddDst,
      B.buildConstant(LLT::scalar(32), OpTySize == 8 ? 1 << 8 : 1 << 16));
  B.buildICmp(CmpInst::ICMP_NE, ResStatus, CondBit,
              B.buildConstant(LLT::scalar(32), 0));

  // Update ZExt users of the result value. Because all uses are in the
  // no-overflow case, we know that the top bits are 0 and we can ignore ZExts.
  B.buildZExtOrTrunc(ResVal, AddDst);
  for (MachineOperand &U : make_early_inc_range(MRI.use_operands(ResVal))) {
    Register WideReg;
    if (mi_match(U.getParent(), MRI, m_GZExt(m_Reg(WideReg)))) {
      auto OldR = U.getParent()->getOperand(0).getReg();
      Observer.erasingInstr(*U.getParent());
      U.getParent()->eraseFromParent();
      Helper.replaceRegWith(MRI, OldR, AddDst);
    }
  }

  return true;
}

class AArch64PreLegalizerCombinerImpl : public Combiner {
protected:
  const CombinerHelper Helper;
  const AArch64PreLegalizerCombinerImplRuleConfig &RuleConfig;
  const AArch64Subtarget &STI;

public:
  AArch64PreLegalizerCombinerImpl(
      MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
      GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
      const AArch64PreLegalizerCombinerImplRuleConfig &RuleConfig,
      const AArch64Subtarget &STI, MachineDominatorTree *MDT,
      const LegalizerInfo *LI);

  static const char *getName() { return "AArch64PreLegalizerCombiner"; }

  bool tryCombineAll(MachineInstr &I) const override;

  bool tryCombineAllImpl(MachineInstr &I) const;

private:
#define GET_GICOMBINER_CLASS_MEMBERS
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CLASS_MEMBERS
};

#define GET_GICOMBINER_IMPL
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_IMPL

AArch64PreLegalizerCombinerImpl::AArch64PreLegalizerCombinerImpl(
    MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
    GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
    const AArch64PreLegalizerCombinerImplRuleConfig &RuleConfig,
    const AArch64Subtarget &STI, MachineDominatorTree *MDT,
    const LegalizerInfo *LI)
    : Combiner(MF, CInfo, TPC, &KB, CSEInfo),
      Helper(Observer, B, /*IsPreLegalize*/ true, &KB, MDT, LI),
      RuleConfig(RuleConfig), STI(STI),
#define GET_GICOMBINER_CONSTRUCTOR_INITS
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CONSTRUCTOR_INITS
{
}

bool AArch64PreLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const {
  if (tryCombineAllImpl(MI))
    return true;

  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return Helper.tryCombineShuffleVector(MI);
  case TargetOpcode::G_UADDO:
    return tryToSimplifyUADDO(MI, B, Helper, Observer);
  case TargetOpcode::G_MEMCPY_INLINE:
    return Helper.tryEmitMemcpyInline(MI);
  case TargetOpcode::G_MEMCPY:
  case TargetOpcode::G_MEMMOVE:
  case TargetOpcode::G_MEMSET: {
    // If we're at -O0 set a maxlen of 32 to inline, otherwise let the other
    // heuristics decide.
    unsigned MaxLen = CInfo.EnableOpt ? 0 : 32;
    // Try to inline memcpy type calls if optimizations are enabled.
    if (Helper.tryCombineMemCpyFamily(MI, MaxLen))
      return true;
    if (Opc == TargetOpcode::G_MEMSET)
      return llvm::AArch64GISelUtils::tryEmitBZero(MI, B, CInfo.EnableMinSize);
    return false;
  }
  }

  return false;
}

// Pass boilerplate
// ================

class AArch64PreLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AArch64PreLegalizerCombiner();

  StringRef getPassName() const override {
    return "AArch64PreLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  AArch64PreLegalizerCombinerImplRuleConfig RuleConfig;
};
} // end anonymous namespace

void AArch64PreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  AU.addRequired<MachineDominatorTreeWrapperPass>();
  AU.addPreserved<MachineDominatorTreeWrapperPass>();
  AU.addRequired<GISelCSEAnalysisWrapperPass>();
  AU.addPreserved<GISelCSEAnalysisWrapperPass>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

AArch64PreLegalizerCombiner::AArch64PreLegalizerCombiner()
    : MachineFunctionPass(ID) {
  initializeAArch64PreLegalizerCombinerPass(*PassRegistry::getPassRegistry());

  if (!RuleConfig.parseCommandLineOption())
    report_fatal_error("Invalid rule identifier");
}

bool AArch64PreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto &TPC = getAnalysis<TargetPassConfig>();

  // Enable CSE.
  GISelCSEAnalysisWrapper &Wrapper =
      getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
  auto *CSEInfo = &Wrapper.get(TPC.getCSEConfig());

  const AArch64Subtarget &ST = MF.getSubtarget<AArch64Subtarget>();
  const auto *LI = ST.getLegalizerInfo();

  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F);
  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
  CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
                     /*LegalizerInfo*/ nullptr, EnableOpt, F.hasOptSize(),
                     F.hasMinSize());
  // Disable fixed-point iteration to reduce compile-time
  CInfo.MaxIterations = 1;
  CInfo.ObserverLvl = CombinerInfo::ObserverLevel::SinglePass;
  // This is the first Combiner, so the input IR might contain dead
  // instructions.
  CInfo.EnableFullDCE = true;
  AArch64PreLegalizerCombinerImpl Impl(MF, CInfo, &TPC, *KB, CSEInfo,
                                       RuleConfig, ST, MDT, LI);
  return Impl.combineMachineInstrs();
}

char AArch64PreLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AArch64PreLegalizerCombiner, DEBUG_TYPE,
                      "Combine AArch64 machine instrs before legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_DEPENDENCY(GISelCSEAnalysisWrapperPass)
INITIALIZE_PASS_END(AArch64PreLegalizerCombiner, DEBUG_TYPE,
                    "Combine AArch64 machine instrs before legalization", false,
                    false)

namespace llvm {
FunctionPass *createAArch64PreLegalizerCombiner() {
  return new AArch64PreLegalizerCombiner();
}
} // end namespace llvm