xref: /freebsd-src/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp (revision 0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583)
15ffd83dbSDimitry Andric //=== lib/CodeGen/GlobalISel/AArch64PreLegalizerCombiner.cpp --------------===//
25ffd83dbSDimitry Andric //
35ffd83dbSDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
45ffd83dbSDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
55ffd83dbSDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
65ffd83dbSDimitry Andric //
75ffd83dbSDimitry Andric //===----------------------------------------------------------------------===//
85ffd83dbSDimitry Andric //
95ffd83dbSDimitry Andric // This pass does combining of machine instructions at the generic MI level,
105ffd83dbSDimitry Andric // before the legalizer.
115ffd83dbSDimitry Andric //
125ffd83dbSDimitry Andric //===----------------------------------------------------------------------===//
135ffd83dbSDimitry Andric 
14fe6060f1SDimitry Andric #include "AArch64GlobalISelUtils.h"
155ffd83dbSDimitry Andric #include "AArch64TargetMachine.h"
1681ad6265SDimitry Andric #include "llvm/CodeGen/GlobalISel/CSEInfo.h"
175ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/Combiner.h"
185ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
195ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
2006c3fb27SDimitry Andric #include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
215ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
225ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
23fe6060f1SDimitry Andric #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
247a6dacacSDimitry Andric #include "llvm/CodeGen/GlobalISel/Utils.h"
255ffd83dbSDimitry Andric #include "llvm/CodeGen/MachineDominators.h"
26fe6060f1SDimitry Andric #include "llvm/CodeGen/MachineFunction.h"
275ffd83dbSDimitry Andric #include "llvm/CodeGen/MachineFunctionPass.h"
28fe6060f1SDimitry Andric #include "llvm/CodeGen/MachineRegisterInfo.h"
295ffd83dbSDimitry Andric #include "llvm/CodeGen/TargetPassConfig.h"
30fe6060f1SDimitry Andric #include "llvm/IR/Instructions.h"
315ffd83dbSDimitry Andric #include "llvm/Support/Debug.h"
325ffd83dbSDimitry Andric 
3306c3fb27SDimitry Andric #define GET_GICOMBINER_DEPS
3406c3fb27SDimitry Andric #include "AArch64GenPreLegalizeGICombiner.inc"
3506c3fb27SDimitry Andric #undef GET_GICOMBINER_DEPS
3606c3fb27SDimitry Andric 
375ffd83dbSDimitry Andric #define DEBUG_TYPE "aarch64-prelegalizer-combiner"
385ffd83dbSDimitry Andric 
395ffd83dbSDimitry Andric using namespace llvm;
405ffd83dbSDimitry Andric using namespace MIPatternMatch;
415ffd83dbSDimitry Andric 
4206c3fb27SDimitry Andric namespace {
4306c3fb27SDimitry Andric 
4406c3fb27SDimitry Andric #define GET_GICOMBINER_TYPES
4506c3fb27SDimitry Andric #include "AArch64GenPreLegalizeGICombiner.inc"
4606c3fb27SDimitry Andric #undef GET_GICOMBINER_TYPES
4706c3fb27SDimitry Andric 
485ffd83dbSDimitry Andric /// Return true if a G_FCONSTANT instruction is known to be better-represented
495ffd83dbSDimitry Andric /// as a G_CONSTANT.
5006c3fb27SDimitry Andric bool matchFConstantToConstant(MachineInstr &MI, MachineRegisterInfo &MRI) {
515ffd83dbSDimitry Andric   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT);
525ffd83dbSDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
535ffd83dbSDimitry Andric   const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
545ffd83dbSDimitry Andric   if (DstSize != 32 && DstSize != 64)
555ffd83dbSDimitry Andric     return false;
565ffd83dbSDimitry Andric 
575ffd83dbSDimitry Andric   // When we're storing a value, it doesn't matter what register bank it's on.
585ffd83dbSDimitry Andric   // Since not all floating point constants can be materialized using a fmov,
595ffd83dbSDimitry Andric   // it makes more sense to just use a GPR.
605ffd83dbSDimitry Andric   return all_of(MRI.use_nodbg_instructions(DstReg),
615ffd83dbSDimitry Andric                 [](const MachineInstr &Use) { return Use.mayStore(); });
625ffd83dbSDimitry Andric }
635ffd83dbSDimitry Andric 
645ffd83dbSDimitry Andric /// Change a G_FCONSTANT into a G_CONSTANT.
6506c3fb27SDimitry Andric void applyFConstantToConstant(MachineInstr &MI) {
665ffd83dbSDimitry Andric   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT);
675ffd83dbSDimitry Andric   MachineIRBuilder MIB(MI);
685ffd83dbSDimitry Andric   const APFloat &ImmValAPF = MI.getOperand(1).getFPImm()->getValueAPF();
695ffd83dbSDimitry Andric   MIB.buildConstant(MI.getOperand(0).getReg(), ImmValAPF.bitcastToAPInt());
705ffd83dbSDimitry Andric   MI.eraseFromParent();
715ffd83dbSDimitry Andric }
725ffd83dbSDimitry Andric 
73fe6060f1SDimitry Andric /// Try to match a G_ICMP of a G_TRUNC with zero, in which the truncated bits
74fe6060f1SDimitry Andric /// are sign bits. In this case, we can transform the G_ICMP to directly compare
75fe6060f1SDimitry Andric /// the wide value with a zero.
7606c3fb27SDimitry Andric bool matchICmpRedundantTrunc(MachineInstr &MI, MachineRegisterInfo &MRI,
77fe6060f1SDimitry Andric                              GISelKnownBits *KB, Register &MatchInfo) {
78fe6060f1SDimitry Andric   assert(MI.getOpcode() == TargetOpcode::G_ICMP && KB);
79fe6060f1SDimitry Andric 
80fe6060f1SDimitry Andric   auto Pred = (CmpInst::Predicate)MI.getOperand(1).getPredicate();
81fe6060f1SDimitry Andric   if (!ICmpInst::isEquality(Pred))
82fe6060f1SDimitry Andric     return false;
83fe6060f1SDimitry Andric 
84fe6060f1SDimitry Andric   Register LHS = MI.getOperand(2).getReg();
85fe6060f1SDimitry Andric   LLT LHSTy = MRI.getType(LHS);
86fe6060f1SDimitry Andric   if (!LHSTy.isScalar())
87fe6060f1SDimitry Andric     return false;
88fe6060f1SDimitry Andric 
89fe6060f1SDimitry Andric   Register RHS = MI.getOperand(3).getReg();
90fe6060f1SDimitry Andric   Register WideReg;
91fe6060f1SDimitry Andric 
92fe6060f1SDimitry Andric   if (!mi_match(LHS, MRI, m_GTrunc(m_Reg(WideReg))) ||
93fe6060f1SDimitry Andric       !mi_match(RHS, MRI, m_SpecificICst(0)))
94fe6060f1SDimitry Andric     return false;
95fe6060f1SDimitry Andric 
96fe6060f1SDimitry Andric   LLT WideTy = MRI.getType(WideReg);
97fe6060f1SDimitry Andric   if (KB->computeNumSignBits(WideReg) <=
98fe6060f1SDimitry Andric       WideTy.getSizeInBits() - LHSTy.getSizeInBits())
99fe6060f1SDimitry Andric     return false;
100fe6060f1SDimitry Andric 
101fe6060f1SDimitry Andric   MatchInfo = WideReg;
102fe6060f1SDimitry Andric   return true;
103fe6060f1SDimitry Andric }
104fe6060f1SDimitry Andric 
10506c3fb27SDimitry Andric void applyICmpRedundantTrunc(MachineInstr &MI, MachineRegisterInfo &MRI,
106fe6060f1SDimitry Andric                              MachineIRBuilder &Builder,
10706c3fb27SDimitry Andric                              GISelChangeObserver &Observer, Register &WideReg) {
108fe6060f1SDimitry Andric   assert(MI.getOpcode() == TargetOpcode::G_ICMP);
109fe6060f1SDimitry Andric 
110fe6060f1SDimitry Andric   LLT WideTy = MRI.getType(WideReg);
111fe6060f1SDimitry Andric   // We're going to directly use the wide register as the LHS, and then use an
112fe6060f1SDimitry Andric   // equivalent size zero for RHS.
113fe6060f1SDimitry Andric   Builder.setInstrAndDebugLoc(MI);
114fe6060f1SDimitry Andric   auto WideZero = Builder.buildConstant(WideTy, 0);
115fe6060f1SDimitry Andric   Observer.changingInstr(MI);
116fe6060f1SDimitry Andric   MI.getOperand(2).setReg(WideReg);
117fe6060f1SDimitry Andric   MI.getOperand(3).setReg(WideZero.getReg(0));
118fe6060f1SDimitry Andric   Observer.changedInstr(MI);
119fe6060f1SDimitry Andric }
120fe6060f1SDimitry Andric 
121fe6060f1SDimitry Andric /// \returns true if it is possible to fold a constant into a G_GLOBAL_VALUE.
122fe6060f1SDimitry Andric ///
123fe6060f1SDimitry Andric /// e.g.
124fe6060f1SDimitry Andric ///
125fe6060f1SDimitry Andric /// %g = G_GLOBAL_VALUE @x -> %g = G_GLOBAL_VALUE @x + cst
12606c3fb27SDimitry Andric bool matchFoldGlobalOffset(MachineInstr &MI, MachineRegisterInfo &MRI,
127fe6060f1SDimitry Andric                            std::pair<uint64_t, uint64_t> &MatchInfo) {
128fe6060f1SDimitry Andric   assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE);
129fe6060f1SDimitry Andric   MachineFunction &MF = *MI.getMF();
130fe6060f1SDimitry Andric   auto &GlobalOp = MI.getOperand(1);
131fe6060f1SDimitry Andric   auto *GV = GlobalOp.getGlobal();
132fe6060f1SDimitry Andric   if (GV->isThreadLocal())
133fe6060f1SDimitry Andric     return false;
134fe6060f1SDimitry Andric 
135fe6060f1SDimitry Andric   // Don't allow anything that could represent offsets etc.
136fe6060f1SDimitry Andric   if (MF.getSubtarget<AArch64Subtarget>().ClassifyGlobalReference(
137fe6060f1SDimitry Andric           GV, MF.getTarget()) != AArch64II::MO_NO_FLAG)
138fe6060f1SDimitry Andric     return false;
139fe6060f1SDimitry Andric 
140fe6060f1SDimitry Andric   // Look for a G_GLOBAL_VALUE only used by G_PTR_ADDs against constants:
141fe6060f1SDimitry Andric   //
142fe6060f1SDimitry Andric   //  %g = G_GLOBAL_VALUE @x
143fe6060f1SDimitry Andric   //  %ptr1 = G_PTR_ADD %g, cst1
144fe6060f1SDimitry Andric   //  %ptr2 = G_PTR_ADD %g, cst2
145fe6060f1SDimitry Andric   //  ...
146fe6060f1SDimitry Andric   //  %ptrN = G_PTR_ADD %g, cstN
147fe6060f1SDimitry Andric   //
148fe6060f1SDimitry Andric   // Identify the *smallest* constant. We want to be able to form this:
149fe6060f1SDimitry Andric   //
150fe6060f1SDimitry Andric   //  %offset_g = G_GLOBAL_VALUE @x + min_cst
151fe6060f1SDimitry Andric   //  %g = G_PTR_ADD %offset_g, -min_cst
152fe6060f1SDimitry Andric   //  %ptr1 = G_PTR_ADD %g, cst1
153fe6060f1SDimitry Andric   //  ...
154fe6060f1SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
155fe6060f1SDimitry Andric   uint64_t MinOffset = -1ull;
156fe6060f1SDimitry Andric   for (auto &UseInstr : MRI.use_nodbg_instructions(Dst)) {
157fe6060f1SDimitry Andric     if (UseInstr.getOpcode() != TargetOpcode::G_PTR_ADD)
158fe6060f1SDimitry Andric       return false;
159349cc55cSDimitry Andric     auto Cst = getIConstantVRegValWithLookThrough(
160349cc55cSDimitry Andric         UseInstr.getOperand(2).getReg(), MRI);
161fe6060f1SDimitry Andric     if (!Cst)
162fe6060f1SDimitry Andric       return false;
163fe6060f1SDimitry Andric     MinOffset = std::min(MinOffset, Cst->Value.getZExtValue());
164fe6060f1SDimitry Andric   }
165fe6060f1SDimitry Andric 
166fe6060f1SDimitry Andric   // Require that the new offset is larger than the existing one to avoid
167fe6060f1SDimitry Andric   // infinite loops.
168fe6060f1SDimitry Andric   uint64_t CurrOffset = GlobalOp.getOffset();
169fe6060f1SDimitry Andric   uint64_t NewOffset = MinOffset + CurrOffset;
170fe6060f1SDimitry Andric   if (NewOffset <= CurrOffset)
171fe6060f1SDimitry Andric     return false;
172fe6060f1SDimitry Andric 
173fe6060f1SDimitry Andric   // Check whether folding this offset is legal. It must not go out of bounds of
174fe6060f1SDimitry Andric   // the referenced object to avoid violating the code model, and must be
1753a9a9c0cSDimitry Andric   // smaller than 2^20 because this is the largest offset expressible in all
1763a9a9c0cSDimitry Andric   // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
1773a9a9c0cSDimitry Andric   // stores an immediate signed 21 bit offset.)
178fe6060f1SDimitry Andric   //
179fe6060f1SDimitry Andric   // This check also prevents us from folding negative offsets, which will end
180fe6060f1SDimitry Andric   // up being treated in the same way as large positive ones. They could also
181fe6060f1SDimitry Andric   // cause code model violations, and aren't really common enough to matter.
1823a9a9c0cSDimitry Andric   if (NewOffset >= (1 << 20))
183fe6060f1SDimitry Andric     return false;
184fe6060f1SDimitry Andric 
185fe6060f1SDimitry Andric   Type *T = GV->getValueType();
186fe6060f1SDimitry Andric   if (!T->isSized() ||
187*0fca6ea1SDimitry Andric       NewOffset > GV->getDataLayout().getTypeAllocSize(T))
188fe6060f1SDimitry Andric     return false;
189fe6060f1SDimitry Andric   MatchInfo = std::make_pair(NewOffset, MinOffset);
190fe6060f1SDimitry Andric   return true;
191fe6060f1SDimitry Andric }
192fe6060f1SDimitry Andric 
19306c3fb27SDimitry Andric void applyFoldGlobalOffset(MachineInstr &MI, MachineRegisterInfo &MRI,
19406c3fb27SDimitry Andric                            MachineIRBuilder &B, GISelChangeObserver &Observer,
195fe6060f1SDimitry Andric                            std::pair<uint64_t, uint64_t> &MatchInfo) {
196fe6060f1SDimitry Andric   // Change:
197fe6060f1SDimitry Andric   //
198fe6060f1SDimitry Andric   //  %g = G_GLOBAL_VALUE @x
199fe6060f1SDimitry Andric   //  %ptr1 = G_PTR_ADD %g, cst1
200fe6060f1SDimitry Andric   //  %ptr2 = G_PTR_ADD %g, cst2
201fe6060f1SDimitry Andric   //  ...
202fe6060f1SDimitry Andric   //  %ptrN = G_PTR_ADD %g, cstN
203fe6060f1SDimitry Andric   //
204fe6060f1SDimitry Andric   // To:
205fe6060f1SDimitry Andric   //
206fe6060f1SDimitry Andric   //  %offset_g = G_GLOBAL_VALUE @x + min_cst
207fe6060f1SDimitry Andric   //  %g = G_PTR_ADD %offset_g, -min_cst
208fe6060f1SDimitry Andric   //  %ptr1 = G_PTR_ADD %g, cst1
209fe6060f1SDimitry Andric   //  ...
210fe6060f1SDimitry Andric   //  %ptrN = G_PTR_ADD %g, cstN
211fe6060f1SDimitry Andric   //
212fe6060f1SDimitry Andric   // Then, the original G_PTR_ADDs should be folded later on so that they look
213fe6060f1SDimitry Andric   // like this:
214fe6060f1SDimitry Andric   //
215fe6060f1SDimitry Andric   //  %ptrN = G_PTR_ADD %offset_g, cstN - min_cst
216fe6060f1SDimitry Andric   uint64_t Offset, MinOffset;
217fe6060f1SDimitry Andric   std::tie(Offset, MinOffset) = MatchInfo;
2185f757f3fSDimitry Andric   B.setInstrAndDebugLoc(*std::next(MI.getIterator()));
219fe6060f1SDimitry Andric   Observer.changingInstr(MI);
220fe6060f1SDimitry Andric   auto &GlobalOp = MI.getOperand(1);
221fe6060f1SDimitry Andric   auto *GV = GlobalOp.getGlobal();
222fe6060f1SDimitry Andric   GlobalOp.ChangeToGA(GV, Offset, GlobalOp.getTargetFlags());
223fe6060f1SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
224fe6060f1SDimitry Andric   Register NewGVDst = MRI.cloneVirtualRegister(Dst);
225fe6060f1SDimitry Andric   MI.getOperand(0).setReg(NewGVDst);
226fe6060f1SDimitry Andric   Observer.changedInstr(MI);
227fe6060f1SDimitry Andric   B.buildPtrAdd(
228fe6060f1SDimitry Andric       Dst, NewGVDst,
229fe6060f1SDimitry Andric       B.buildConstant(LLT::scalar(64), -static_cast<int64_t>(MinOffset)));
230fe6060f1SDimitry Andric }
231fe6060f1SDimitry Andric 
2325f757f3fSDimitry Andric // Combines vecreduce_add(mul(ext(x), ext(y))) -> vecreduce_add(udot(x, y))
2335f757f3fSDimitry Andric // Or vecreduce_add(ext(x)) -> vecreduce_add(udot(x, 1))
2345f757f3fSDimitry Andric // Similar to performVecReduceAddCombine in SelectionDAG
2355f757f3fSDimitry Andric bool matchExtAddvToUdotAddv(MachineInstr &MI, MachineRegisterInfo &MRI,
2365f757f3fSDimitry Andric                             const AArch64Subtarget &STI,
2375f757f3fSDimitry Andric                             std::tuple<Register, Register, bool> &MatchInfo) {
2385f757f3fSDimitry Andric   assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
2395f757f3fSDimitry Andric          "Expected a G_VECREDUCE_ADD instruction");
2405f757f3fSDimitry Andric   assert(STI.hasDotProd() && "Target should have Dot Product feature");
2415f757f3fSDimitry Andric 
2425f757f3fSDimitry Andric   MachineInstr *I1 = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
2435f757f3fSDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
2445f757f3fSDimitry Andric   Register MidReg = I1->getOperand(0).getReg();
2455f757f3fSDimitry Andric   LLT DstTy = MRI.getType(DstReg);
2465f757f3fSDimitry Andric   LLT MidTy = MRI.getType(MidReg);
2475f757f3fSDimitry Andric   if (DstTy.getScalarSizeInBits() != 32 || MidTy.getScalarSizeInBits() != 32)
2485f757f3fSDimitry Andric     return false;
2495f757f3fSDimitry Andric 
2505f757f3fSDimitry Andric   LLT SrcTy;
2515f757f3fSDimitry Andric   auto I1Opc = I1->getOpcode();
2525f757f3fSDimitry Andric   if (I1Opc == TargetOpcode::G_MUL) {
2535f757f3fSDimitry Andric     // If result of this has more than 1 use, then there is no point in creating
2545f757f3fSDimitry Andric     // udot instruction
2555f757f3fSDimitry Andric     if (!MRI.hasOneNonDBGUse(MidReg))
2565f757f3fSDimitry Andric       return false;
2575f757f3fSDimitry Andric 
2585f757f3fSDimitry Andric     MachineInstr *ExtMI1 =
2595f757f3fSDimitry Andric         getDefIgnoringCopies(I1->getOperand(1).getReg(), MRI);
2605f757f3fSDimitry Andric     MachineInstr *ExtMI2 =
2615f757f3fSDimitry Andric         getDefIgnoringCopies(I1->getOperand(2).getReg(), MRI);
2625f757f3fSDimitry Andric     LLT Ext1DstTy = MRI.getType(ExtMI1->getOperand(0).getReg());
2635f757f3fSDimitry Andric     LLT Ext2DstTy = MRI.getType(ExtMI2->getOperand(0).getReg());
2645f757f3fSDimitry Andric 
2655f757f3fSDimitry Andric     if (ExtMI1->getOpcode() != ExtMI2->getOpcode() || Ext1DstTy != Ext2DstTy)
2665f757f3fSDimitry Andric       return false;
2675f757f3fSDimitry Andric     I1Opc = ExtMI1->getOpcode();
2685f757f3fSDimitry Andric     SrcTy = MRI.getType(ExtMI1->getOperand(1).getReg());
2695f757f3fSDimitry Andric     std::get<0>(MatchInfo) = ExtMI1->getOperand(1).getReg();
2705f757f3fSDimitry Andric     std::get<1>(MatchInfo) = ExtMI2->getOperand(1).getReg();
2715f757f3fSDimitry Andric   } else {
2725f757f3fSDimitry Andric     SrcTy = MRI.getType(I1->getOperand(1).getReg());
2735f757f3fSDimitry Andric     std::get<0>(MatchInfo) = I1->getOperand(1).getReg();
2745f757f3fSDimitry Andric     std::get<1>(MatchInfo) = 0;
2755f757f3fSDimitry Andric   }
2765f757f3fSDimitry Andric 
2775f757f3fSDimitry Andric   if (I1Opc == TargetOpcode::G_ZEXT)
2785f757f3fSDimitry Andric     std::get<2>(MatchInfo) = 0;
2795f757f3fSDimitry Andric   else if (I1Opc == TargetOpcode::G_SEXT)
2805f757f3fSDimitry Andric     std::get<2>(MatchInfo) = 1;
2815f757f3fSDimitry Andric   else
2825f757f3fSDimitry Andric     return false;
2835f757f3fSDimitry Andric 
2845f757f3fSDimitry Andric   if (SrcTy.getScalarSizeInBits() != 8 || SrcTy.getNumElements() % 8 != 0)
2855f757f3fSDimitry Andric     return false;
2865f757f3fSDimitry Andric 
2875f757f3fSDimitry Andric   return true;
2885f757f3fSDimitry Andric }
2895f757f3fSDimitry Andric 
2905f757f3fSDimitry Andric void applyExtAddvToUdotAddv(MachineInstr &MI, MachineRegisterInfo &MRI,
2915f757f3fSDimitry Andric                             MachineIRBuilder &Builder,
2925f757f3fSDimitry Andric                             GISelChangeObserver &Observer,
2935f757f3fSDimitry Andric                             const AArch64Subtarget &STI,
2945f757f3fSDimitry Andric                             std::tuple<Register, Register, bool> &MatchInfo) {
2955f757f3fSDimitry Andric   assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
2965f757f3fSDimitry Andric          "Expected a G_VECREDUCE_ADD instruction");
2975f757f3fSDimitry Andric   assert(STI.hasDotProd() && "Target should have Dot Product feature");
2985f757f3fSDimitry Andric 
2995f757f3fSDimitry Andric   // Initialise the variables
3005f757f3fSDimitry Andric   unsigned DotOpcode =
3015f757f3fSDimitry Andric       std::get<2>(MatchInfo) ? AArch64::G_SDOT : AArch64::G_UDOT;
3025f757f3fSDimitry Andric   Register Ext1SrcReg = std::get<0>(MatchInfo);
3035f757f3fSDimitry Andric 
3045f757f3fSDimitry Andric   // If there is one source register, create a vector of 0s as the second
3055f757f3fSDimitry Andric   // source register
3065f757f3fSDimitry Andric   Register Ext2SrcReg;
3075f757f3fSDimitry Andric   if (std::get<1>(MatchInfo) == 0)
3085f757f3fSDimitry Andric     Ext2SrcReg = Builder.buildConstant(MRI.getType(Ext1SrcReg), 1)
3095f757f3fSDimitry Andric                      ->getOperand(0)
3105f757f3fSDimitry Andric                      .getReg();
3115f757f3fSDimitry Andric   else
3125f757f3fSDimitry Andric     Ext2SrcReg = std::get<1>(MatchInfo);
3135f757f3fSDimitry Andric 
3145f757f3fSDimitry Andric   // Find out how many DOT instructions are needed
3155f757f3fSDimitry Andric   LLT SrcTy = MRI.getType(Ext1SrcReg);
3165f757f3fSDimitry Andric   LLT MidTy;
3175f757f3fSDimitry Andric   unsigned NumOfDotMI;
3185f757f3fSDimitry Andric   if (SrcTy.getNumElements() % 16 == 0) {
3195f757f3fSDimitry Andric     NumOfDotMI = SrcTy.getNumElements() / 16;
3205f757f3fSDimitry Andric     MidTy = LLT::fixed_vector(4, 32);
3215f757f3fSDimitry Andric   } else if (SrcTy.getNumElements() % 8 == 0) {
3225f757f3fSDimitry Andric     NumOfDotMI = SrcTy.getNumElements() / 8;
3235f757f3fSDimitry Andric     MidTy = LLT::fixed_vector(2, 32);
3245f757f3fSDimitry Andric   } else {
3255f757f3fSDimitry Andric     llvm_unreachable("Source type number of elements is not multiple of 8");
3265f757f3fSDimitry Andric   }
3275f757f3fSDimitry Andric 
3285f757f3fSDimitry Andric   // Handle case where one DOT instruction is needed
3295f757f3fSDimitry Andric   if (NumOfDotMI == 1) {
3305f757f3fSDimitry Andric     auto Zeroes = Builder.buildConstant(MidTy, 0)->getOperand(0).getReg();
3315f757f3fSDimitry Andric     auto Dot = Builder.buildInstr(DotOpcode, {MidTy},
3325f757f3fSDimitry Andric                                   {Zeroes, Ext1SrcReg, Ext2SrcReg});
3335f757f3fSDimitry Andric     Builder.buildVecReduceAdd(MI.getOperand(0), Dot->getOperand(0));
3345f757f3fSDimitry Andric   } else {
3355f757f3fSDimitry Andric     // If not pad the last v8 element with 0s to a v16
3365f757f3fSDimitry Andric     SmallVector<Register, 4> Ext1UnmergeReg;
3375f757f3fSDimitry Andric     SmallVector<Register, 4> Ext2UnmergeReg;
3385f757f3fSDimitry Andric     if (SrcTy.getNumElements() % 16 != 0) {
3397a6dacacSDimitry Andric       SmallVector<Register> Leftover1;
3407a6dacacSDimitry Andric       SmallVector<Register> Leftover2;
3415f757f3fSDimitry Andric 
3427a6dacacSDimitry Andric       // Split the elements into v16i8 and v8i8
3437a6dacacSDimitry Andric       LLT MainTy = LLT::fixed_vector(16, 8);
3447a6dacacSDimitry Andric       LLT LeftoverTy1, LeftoverTy2;
3457a6dacacSDimitry Andric       if ((!extractParts(Ext1SrcReg, MRI.getType(Ext1SrcReg), MainTy,
3467a6dacacSDimitry Andric                          LeftoverTy1, Ext1UnmergeReg, Leftover1, Builder,
3477a6dacacSDimitry Andric                          MRI)) ||
3487a6dacacSDimitry Andric           (!extractParts(Ext2SrcReg, MRI.getType(Ext2SrcReg), MainTy,
3497a6dacacSDimitry Andric                          LeftoverTy2, Ext2UnmergeReg, Leftover2, Builder,
3507a6dacacSDimitry Andric                          MRI))) {
3517a6dacacSDimitry Andric         llvm_unreachable("Unable to split this vector properly");
3525f757f3fSDimitry Andric       }
3535f757f3fSDimitry Andric 
3547a6dacacSDimitry Andric       // Pad the leftover v8i8 vector with register of 0s of type v8i8
3557a6dacacSDimitry Andric       Register v8Zeroes = Builder.buildConstant(LLT::fixed_vector(8, 8), 0)
3567a6dacacSDimitry Andric                               ->getOperand(0)
3577a6dacacSDimitry Andric                               .getReg();
3585f757f3fSDimitry Andric 
3595f757f3fSDimitry Andric       Ext1UnmergeReg.push_back(
3605f757f3fSDimitry Andric           Builder
3617a6dacacSDimitry Andric               .buildMergeLikeInstr(LLT::fixed_vector(16, 8),
3627a6dacacSDimitry Andric                                    {Leftover1[0], v8Zeroes})
3635f757f3fSDimitry Andric               .getReg(0));
3645f757f3fSDimitry Andric       Ext2UnmergeReg.push_back(
3655f757f3fSDimitry Andric           Builder
3667a6dacacSDimitry Andric               .buildMergeLikeInstr(LLT::fixed_vector(16, 8),
3677a6dacacSDimitry Andric                                    {Leftover2[0], v8Zeroes})
3685f757f3fSDimitry Andric               .getReg(0));
3697a6dacacSDimitry Andric 
3705f757f3fSDimitry Andric     } else {
3715f757f3fSDimitry Andric       // Unmerge the source vectors to v16i8
3727a6dacacSDimitry Andric       unsigned SrcNumElts = SrcTy.getNumElements();
3737a6dacacSDimitry Andric       extractParts(Ext1SrcReg, LLT::fixed_vector(16, 8), SrcNumElts / 16,
3747a6dacacSDimitry Andric                    Ext1UnmergeReg, Builder, MRI);
3757a6dacacSDimitry Andric       extractParts(Ext2SrcReg, LLT::fixed_vector(16, 8), SrcNumElts / 16,
3767a6dacacSDimitry Andric                    Ext2UnmergeReg, Builder, MRI);
3775f757f3fSDimitry Andric     }
3785f757f3fSDimitry Andric 
3795f757f3fSDimitry Andric     // Build the UDOT instructions
3805f757f3fSDimitry Andric     SmallVector<Register, 2> DotReg;
3815f757f3fSDimitry Andric     unsigned NumElements = 0;
3825f757f3fSDimitry Andric     for (unsigned i = 0; i < Ext1UnmergeReg.size(); i++) {
3835f757f3fSDimitry Andric       LLT ZeroesLLT;
3845f757f3fSDimitry Andric       // Check if it is 16 or 8 elements. Set Zeroes to the according size
3855f757f3fSDimitry Andric       if (MRI.getType(Ext1UnmergeReg[i]).getNumElements() == 16) {
3865f757f3fSDimitry Andric         ZeroesLLT = LLT::fixed_vector(4, 32);
3875f757f3fSDimitry Andric         NumElements += 4;
3885f757f3fSDimitry Andric       } else {
3895f757f3fSDimitry Andric         ZeroesLLT = LLT::fixed_vector(2, 32);
3905f757f3fSDimitry Andric         NumElements += 2;
3915f757f3fSDimitry Andric       }
3925f757f3fSDimitry Andric       auto Zeroes = Builder.buildConstant(ZeroesLLT, 0)->getOperand(0).getReg();
3935f757f3fSDimitry Andric       DotReg.push_back(
3945f757f3fSDimitry Andric           Builder
3955f757f3fSDimitry Andric               .buildInstr(DotOpcode, {MRI.getType(Zeroes)},
3965f757f3fSDimitry Andric                           {Zeroes, Ext1UnmergeReg[i], Ext2UnmergeReg[i]})
3975f757f3fSDimitry Andric               .getReg(0));
3985f757f3fSDimitry Andric     }
3995f757f3fSDimitry Andric 
4005f757f3fSDimitry Andric     // Merge the output
4015f757f3fSDimitry Andric     auto ConcatMI =
4025f757f3fSDimitry Andric         Builder.buildConcatVectors(LLT::fixed_vector(NumElements, 32), DotReg);
4035f757f3fSDimitry Andric 
4045f757f3fSDimitry Andric     // Put it through a vector reduction
4055f757f3fSDimitry Andric     Builder.buildVecReduceAdd(MI.getOperand(0).getReg(),
4065f757f3fSDimitry Andric                               ConcatMI->getOperand(0).getReg());
4075f757f3fSDimitry Andric   }
4085f757f3fSDimitry Andric 
4095f757f3fSDimitry Andric   // Erase the dead instructions
4105f757f3fSDimitry Andric   MI.eraseFromParent();
4115f757f3fSDimitry Andric }
4125f757f3fSDimitry Andric 
4137a6dacacSDimitry Andric // Matches {U/S}ADDV(ext(x)) => {U/S}ADDLV(x)
4147a6dacacSDimitry Andric // Ensure that the type coming from the extend instruction is the right size
4157a6dacacSDimitry Andric bool matchExtUaddvToUaddlv(MachineInstr &MI, MachineRegisterInfo &MRI,
4167a6dacacSDimitry Andric                            std::pair<Register, bool> &MatchInfo) {
4177a6dacacSDimitry Andric   assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
4187a6dacacSDimitry Andric          "Expected G_VECREDUCE_ADD Opcode");
4197a6dacacSDimitry Andric 
4207a6dacacSDimitry Andric   // Check if the last instruction is an extend
4217a6dacacSDimitry Andric   MachineInstr *ExtMI = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
4227a6dacacSDimitry Andric   auto ExtOpc = ExtMI->getOpcode();
4237a6dacacSDimitry Andric 
4247a6dacacSDimitry Andric   if (ExtOpc == TargetOpcode::G_ZEXT)
4257a6dacacSDimitry Andric     std::get<1>(MatchInfo) = 0;
4267a6dacacSDimitry Andric   else if (ExtOpc == TargetOpcode::G_SEXT)
4277a6dacacSDimitry Andric     std::get<1>(MatchInfo) = 1;
4287a6dacacSDimitry Andric   else
4297a6dacacSDimitry Andric     return false;
4307a6dacacSDimitry Andric 
4317a6dacacSDimitry Andric   // Check if the source register is a valid type
4327a6dacacSDimitry Andric   Register ExtSrcReg = ExtMI->getOperand(1).getReg();
4337a6dacacSDimitry Andric   LLT ExtSrcTy = MRI.getType(ExtSrcReg);
4347a6dacacSDimitry Andric   LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
4357a6dacacSDimitry Andric   if ((DstTy.getScalarSizeInBits() == 16 &&
4367a6dacacSDimitry Andric        ExtSrcTy.getNumElements() % 8 == 0 && ExtSrcTy.getNumElements() < 256) ||
4377a6dacacSDimitry Andric       (DstTy.getScalarSizeInBits() == 32 &&
4387a6dacacSDimitry Andric        ExtSrcTy.getNumElements() % 4 == 0) ||
4397a6dacacSDimitry Andric       (DstTy.getScalarSizeInBits() == 64 &&
4407a6dacacSDimitry Andric        ExtSrcTy.getNumElements() % 4 == 0)) {
4417a6dacacSDimitry Andric     std::get<0>(MatchInfo) = ExtSrcReg;
4427a6dacacSDimitry Andric     return true;
4437a6dacacSDimitry Andric   }
4447a6dacacSDimitry Andric   return false;
4457a6dacacSDimitry Andric }
4467a6dacacSDimitry Andric 
4477a6dacacSDimitry Andric void applyExtUaddvToUaddlv(MachineInstr &MI, MachineRegisterInfo &MRI,
4487a6dacacSDimitry Andric                            MachineIRBuilder &B, GISelChangeObserver &Observer,
4497a6dacacSDimitry Andric                            std::pair<Register, bool> &MatchInfo) {
4507a6dacacSDimitry Andric   assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
4517a6dacacSDimitry Andric          "Expected G_VECREDUCE_ADD Opcode");
4527a6dacacSDimitry Andric 
4537a6dacacSDimitry Andric   unsigned Opc = std::get<1>(MatchInfo) ? AArch64::G_SADDLV : AArch64::G_UADDLV;
4547a6dacacSDimitry Andric   Register SrcReg = std::get<0>(MatchInfo);
4557a6dacacSDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
4567a6dacacSDimitry Andric   LLT SrcTy = MRI.getType(SrcReg);
4577a6dacacSDimitry Andric   LLT DstTy = MRI.getType(DstReg);
4587a6dacacSDimitry Andric 
4597a6dacacSDimitry Andric   // If SrcTy has more elements than expected, split them into multiple
4607a6dacacSDimitry Andric   // insructions and sum the results
4617a6dacacSDimitry Andric   LLT MainTy;
4627a6dacacSDimitry Andric   SmallVector<Register, 1> WorkingRegisters;
4637a6dacacSDimitry Andric   unsigned SrcScalSize = SrcTy.getScalarSizeInBits();
4647a6dacacSDimitry Andric   unsigned SrcNumElem = SrcTy.getNumElements();
4657a6dacacSDimitry Andric   if ((SrcScalSize == 8 && SrcNumElem > 16) ||
4667a6dacacSDimitry Andric       (SrcScalSize == 16 && SrcNumElem > 8) ||
4677a6dacacSDimitry Andric       (SrcScalSize == 32 && SrcNumElem > 4)) {
4687a6dacacSDimitry Andric 
4697a6dacacSDimitry Andric     LLT LeftoverTy;
4707a6dacacSDimitry Andric     SmallVector<Register, 4> LeftoverRegs;
4717a6dacacSDimitry Andric     if (SrcScalSize == 8)
4727a6dacacSDimitry Andric       MainTy = LLT::fixed_vector(16, 8);
4737a6dacacSDimitry Andric     else if (SrcScalSize == 16)
4747a6dacacSDimitry Andric       MainTy = LLT::fixed_vector(8, 16);
4757a6dacacSDimitry Andric     else if (SrcScalSize == 32)
4767a6dacacSDimitry Andric       MainTy = LLT::fixed_vector(4, 32);
4777a6dacacSDimitry Andric     else
4787a6dacacSDimitry Andric       llvm_unreachable("Source's Scalar Size not supported");
4797a6dacacSDimitry Andric 
4807a6dacacSDimitry Andric     // Extract the parts and put each extracted sources through U/SADDLV and put
4817a6dacacSDimitry Andric     // the values inside a small vec
4827a6dacacSDimitry Andric     extractParts(SrcReg, SrcTy, MainTy, LeftoverTy, WorkingRegisters,
4837a6dacacSDimitry Andric                  LeftoverRegs, B, MRI);
4847a6dacacSDimitry Andric     for (unsigned I = 0; I < LeftoverRegs.size(); I++) {
4857a6dacacSDimitry Andric       WorkingRegisters.push_back(LeftoverRegs[I]);
4867a6dacacSDimitry Andric     }
4877a6dacacSDimitry Andric   } else {
4887a6dacacSDimitry Andric     WorkingRegisters.push_back(SrcReg);
4897a6dacacSDimitry Andric     MainTy = SrcTy;
4907a6dacacSDimitry Andric   }
4917a6dacacSDimitry Andric 
4927a6dacacSDimitry Andric   unsigned MidScalarSize = MainTy.getScalarSizeInBits() * 2;
4937a6dacacSDimitry Andric   LLT MidScalarLLT = LLT::scalar(MidScalarSize);
4947a6dacacSDimitry Andric   Register zeroReg = B.buildConstant(LLT::scalar(64), 0).getReg(0);
4957a6dacacSDimitry Andric   for (unsigned I = 0; I < WorkingRegisters.size(); I++) {
4967a6dacacSDimitry Andric     // If the number of elements is too small to build an instruction, extend
4977a6dacacSDimitry Andric     // its size before applying addlv
4987a6dacacSDimitry Andric     LLT WorkingRegTy = MRI.getType(WorkingRegisters[I]);
4997a6dacacSDimitry Andric     if ((WorkingRegTy.getScalarSizeInBits() == 8) &&
5007a6dacacSDimitry Andric         (WorkingRegTy.getNumElements() == 4)) {
5017a6dacacSDimitry Andric       WorkingRegisters[I] =
5027a6dacacSDimitry Andric           B.buildInstr(std::get<1>(MatchInfo) ? TargetOpcode::G_SEXT
5037a6dacacSDimitry Andric                                               : TargetOpcode::G_ZEXT,
5047a6dacacSDimitry Andric                        {LLT::fixed_vector(4, 16)}, {WorkingRegisters[I]})
5057a6dacacSDimitry Andric               .getReg(0);
5067a6dacacSDimitry Andric     }
5077a6dacacSDimitry Andric 
5087a6dacacSDimitry Andric     // Generate the {U/S}ADDLV instruction, whose output is always double of the
5097a6dacacSDimitry Andric     // Src's Scalar size
5107a6dacacSDimitry Andric     LLT addlvTy = MidScalarSize <= 32 ? LLT::fixed_vector(4, 32)
5117a6dacacSDimitry Andric                                       : LLT::fixed_vector(2, 64);
5127a6dacacSDimitry Andric     Register addlvReg =
5137a6dacacSDimitry Andric         B.buildInstr(Opc, {addlvTy}, {WorkingRegisters[I]}).getReg(0);
5147a6dacacSDimitry Andric 
5157a6dacacSDimitry Andric     // The output from {U/S}ADDLV gets placed in the lowest lane of a v4i32 or
5167a6dacacSDimitry Andric     // v2i64 register.
5177a6dacacSDimitry Andric     //     i16, i32 results uses v4i32 registers
5187a6dacacSDimitry Andric     //     i64      results uses v2i64 registers
5197a6dacacSDimitry Andric     // Therefore we have to extract/truncate the the value to the right type
5207a6dacacSDimitry Andric     if (MidScalarSize == 32 || MidScalarSize == 64) {
5217a6dacacSDimitry Andric       WorkingRegisters[I] = B.buildInstr(AArch64::G_EXTRACT_VECTOR_ELT,
5227a6dacacSDimitry Andric                                          {MidScalarLLT}, {addlvReg, zeroReg})
5237a6dacacSDimitry Andric                                 .getReg(0);
5247a6dacacSDimitry Andric     } else {
5257a6dacacSDimitry Andric       Register extractReg = B.buildInstr(AArch64::G_EXTRACT_VECTOR_ELT,
5267a6dacacSDimitry Andric                                          {LLT::scalar(32)}, {addlvReg, zeroReg})
5277a6dacacSDimitry Andric                                 .getReg(0);
5287a6dacacSDimitry Andric       WorkingRegisters[I] =
5297a6dacacSDimitry Andric           B.buildTrunc({MidScalarLLT}, {extractReg}).getReg(0);
5307a6dacacSDimitry Andric     }
5317a6dacacSDimitry Andric   }
5327a6dacacSDimitry Andric 
5337a6dacacSDimitry Andric   Register outReg;
5347a6dacacSDimitry Andric   if (WorkingRegisters.size() > 1) {
5357a6dacacSDimitry Andric     outReg = B.buildAdd(MidScalarLLT, WorkingRegisters[0], WorkingRegisters[1])
5367a6dacacSDimitry Andric                  .getReg(0);
5377a6dacacSDimitry Andric     for (unsigned I = 2; I < WorkingRegisters.size(); I++) {
5387a6dacacSDimitry Andric       outReg = B.buildAdd(MidScalarLLT, outReg, WorkingRegisters[I]).getReg(0);
5397a6dacacSDimitry Andric     }
5407a6dacacSDimitry Andric   } else {
5417a6dacacSDimitry Andric     outReg = WorkingRegisters[0];
5427a6dacacSDimitry Andric   }
5437a6dacacSDimitry Andric 
5447a6dacacSDimitry Andric   if (DstTy.getScalarSizeInBits() > MidScalarSize) {
5457a6dacacSDimitry Andric     // Handle the scalar value if the DstTy's Scalar Size is more than double
5467a6dacacSDimitry Andric     // Src's ScalarType
5477a6dacacSDimitry Andric     B.buildInstr(std::get<1>(MatchInfo) ? TargetOpcode::G_SEXT
5487a6dacacSDimitry Andric                                         : TargetOpcode::G_ZEXT,
5497a6dacacSDimitry Andric                  {DstReg}, {outReg});
5507a6dacacSDimitry Andric   } else {
5517a6dacacSDimitry Andric     B.buildCopy(DstReg, outReg);
5527a6dacacSDimitry Andric   }
5537a6dacacSDimitry Andric 
5547a6dacacSDimitry Andric   MI.eraseFromParent();
5557a6dacacSDimitry Andric }
5567a6dacacSDimitry Andric 
557*0fca6ea1SDimitry Andric // Pushes ADD/SUB through extend instructions to decrease the number of extend
558*0fca6ea1SDimitry Andric // instruction at the end by allowing selection of {s|u}addl sooner
559*0fca6ea1SDimitry Andric 
560*0fca6ea1SDimitry Andric // i32 add(i32 ext i8, i32 ext i8) => i32 ext(i16 add(i16 ext i8, i16 ext i8))
561*0fca6ea1SDimitry Andric bool matchPushAddSubExt(MachineInstr &MI, MachineRegisterInfo &MRI,
562*0fca6ea1SDimitry Andric                         Register DstReg, Register SrcReg1, Register SrcReg2) {
563*0fca6ea1SDimitry Andric   assert((MI.getOpcode() == TargetOpcode::G_ADD ||
564*0fca6ea1SDimitry Andric           MI.getOpcode() == TargetOpcode::G_SUB) &&
565*0fca6ea1SDimitry Andric          "Expected a G_ADD or G_SUB instruction\n");
566*0fca6ea1SDimitry Andric 
567*0fca6ea1SDimitry Andric   // Deal with vector types only
568*0fca6ea1SDimitry Andric   LLT DstTy = MRI.getType(DstReg);
569*0fca6ea1SDimitry Andric   if (!DstTy.isVector())
570*0fca6ea1SDimitry Andric     return false;
571*0fca6ea1SDimitry Andric 
572*0fca6ea1SDimitry Andric   // Return true if G_{S|Z}EXT instruction is more than 2* source
573*0fca6ea1SDimitry Andric   Register ExtDstReg = MI.getOperand(1).getReg();
574*0fca6ea1SDimitry Andric   LLT Ext1SrcTy = MRI.getType(SrcReg1);
575*0fca6ea1SDimitry Andric   LLT Ext2SrcTy = MRI.getType(SrcReg2);
576*0fca6ea1SDimitry Andric   unsigned ExtDstScal = MRI.getType(ExtDstReg).getScalarSizeInBits();
577*0fca6ea1SDimitry Andric   unsigned Ext1SrcScal = Ext1SrcTy.getScalarSizeInBits();
578*0fca6ea1SDimitry Andric   if (((Ext1SrcScal == 8 && ExtDstScal == 32) ||
579*0fca6ea1SDimitry Andric        ((Ext1SrcScal == 8 || Ext1SrcScal == 16) && ExtDstScal == 64)) &&
580*0fca6ea1SDimitry Andric       Ext1SrcTy == Ext2SrcTy)
581*0fca6ea1SDimitry Andric     return true;
582*0fca6ea1SDimitry Andric 
583*0fca6ea1SDimitry Andric   return false;
584*0fca6ea1SDimitry Andric }
585*0fca6ea1SDimitry Andric 
586*0fca6ea1SDimitry Andric void applyPushAddSubExt(MachineInstr &MI, MachineRegisterInfo &MRI,
587*0fca6ea1SDimitry Andric                         MachineIRBuilder &B, bool isSExt, Register DstReg,
588*0fca6ea1SDimitry Andric                         Register SrcReg1, Register SrcReg2) {
589*0fca6ea1SDimitry Andric   LLT SrcTy = MRI.getType(SrcReg1);
590*0fca6ea1SDimitry Andric   LLT MidTy = SrcTy.changeElementSize(SrcTy.getScalarSizeInBits() * 2);
591*0fca6ea1SDimitry Andric   unsigned Opc = isSExt ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
592*0fca6ea1SDimitry Andric   Register Ext1Reg = B.buildInstr(Opc, {MidTy}, {SrcReg1}).getReg(0);
593*0fca6ea1SDimitry Andric   Register Ext2Reg = B.buildInstr(Opc, {MidTy}, {SrcReg2}).getReg(0);
594*0fca6ea1SDimitry Andric   Register AddReg =
595*0fca6ea1SDimitry Andric       B.buildInstr(MI.getOpcode(), {MidTy}, {Ext1Reg, Ext2Reg}).getReg(0);
596*0fca6ea1SDimitry Andric 
597*0fca6ea1SDimitry Andric   // G_SUB has to sign-extend the result.
598*0fca6ea1SDimitry Andric   // G_ADD needs to sext from sext and can sext or zext from zext, so the
599*0fca6ea1SDimitry Andric   // original opcode is used.
600*0fca6ea1SDimitry Andric   if (MI.getOpcode() == TargetOpcode::G_ADD)
601*0fca6ea1SDimitry Andric     B.buildInstr(Opc, {DstReg}, {AddReg});
602*0fca6ea1SDimitry Andric   else
603*0fca6ea1SDimitry Andric     B.buildSExt(DstReg, AddReg);
604*0fca6ea1SDimitry Andric 
605*0fca6ea1SDimitry Andric   MI.eraseFromParent();
606*0fca6ea1SDimitry Andric }
607*0fca6ea1SDimitry Andric 
60806c3fb27SDimitry Andric bool tryToSimplifyUADDO(MachineInstr &MI, MachineIRBuilder &B,
60906c3fb27SDimitry Andric                         CombinerHelper &Helper, GISelChangeObserver &Observer) {
610349cc55cSDimitry Andric   // Try simplify G_UADDO with 8 or 16 bit operands to wide G_ADD and TBNZ if
611349cc55cSDimitry Andric   // result is only used in the no-overflow case. It is restricted to cases
612349cc55cSDimitry Andric   // where we know that the high-bits of the operands are 0. If there's an
6135f757f3fSDimitry Andric   // overflow, then the 9th or 17th bit must be set, which can be checked
614349cc55cSDimitry Andric   // using TBNZ.
615349cc55cSDimitry Andric   //
616349cc55cSDimitry Andric   // Change (for UADDOs on 8 and 16 bits):
617349cc55cSDimitry Andric   //
618349cc55cSDimitry Andric   //   %z0 = G_ASSERT_ZEXT _
619349cc55cSDimitry Andric   //   %op0 = G_TRUNC %z0
620349cc55cSDimitry Andric   //   %z1 = G_ASSERT_ZEXT _
621349cc55cSDimitry Andric   //   %op1 = G_TRUNC %z1
622349cc55cSDimitry Andric   //   %val, %cond = G_UADDO %op0, %op1
623349cc55cSDimitry Andric   //   G_BRCOND %cond, %error.bb
624349cc55cSDimitry Andric   //
625349cc55cSDimitry Andric   // error.bb:
626349cc55cSDimitry Andric   //   (no successors and no uses of %val)
627349cc55cSDimitry Andric   //
628349cc55cSDimitry Andric   // To:
629349cc55cSDimitry Andric   //
630349cc55cSDimitry Andric   //   %z0 = G_ASSERT_ZEXT _
631349cc55cSDimitry Andric   //   %z1 = G_ASSERT_ZEXT _
632349cc55cSDimitry Andric   //   %add = G_ADD %z0, %z1
633349cc55cSDimitry Andric   //   %val = G_TRUNC %add
634349cc55cSDimitry Andric   //   %bit = G_AND %add, 1 << scalar-size-in-bits(%op1)
635349cc55cSDimitry Andric   //   %cond = G_ICMP NE, %bit, 0
636349cc55cSDimitry Andric   //   G_BRCOND %cond, %error.bb
637349cc55cSDimitry Andric 
638349cc55cSDimitry Andric   auto &MRI = *B.getMRI();
639349cc55cSDimitry Andric 
640349cc55cSDimitry Andric   MachineOperand *DefOp0 = MRI.getOneDef(MI.getOperand(2).getReg());
641349cc55cSDimitry Andric   MachineOperand *DefOp1 = MRI.getOneDef(MI.getOperand(3).getReg());
642349cc55cSDimitry Andric   Register Op0Wide;
643349cc55cSDimitry Andric   Register Op1Wide;
644349cc55cSDimitry Andric   if (!mi_match(DefOp0->getParent(), MRI, m_GTrunc(m_Reg(Op0Wide))) ||
645349cc55cSDimitry Andric       !mi_match(DefOp1->getParent(), MRI, m_GTrunc(m_Reg(Op1Wide))))
646349cc55cSDimitry Andric     return false;
647349cc55cSDimitry Andric   LLT WideTy0 = MRI.getType(Op0Wide);
648349cc55cSDimitry Andric   LLT WideTy1 = MRI.getType(Op1Wide);
649349cc55cSDimitry Andric   Register ResVal = MI.getOperand(0).getReg();
650349cc55cSDimitry Andric   LLT OpTy = MRI.getType(ResVal);
651349cc55cSDimitry Andric   MachineInstr *Op0WideDef = MRI.getVRegDef(Op0Wide);
652349cc55cSDimitry Andric   MachineInstr *Op1WideDef = MRI.getVRegDef(Op1Wide);
653349cc55cSDimitry Andric 
654349cc55cSDimitry Andric   unsigned OpTySize = OpTy.getScalarSizeInBits();
655349cc55cSDimitry Andric   // First check that the G_TRUNC feeding the G_UADDO are no-ops, because the
656349cc55cSDimitry Andric   // inputs have been zero-extended.
657349cc55cSDimitry Andric   if (Op0WideDef->getOpcode() != TargetOpcode::G_ASSERT_ZEXT ||
658349cc55cSDimitry Andric       Op1WideDef->getOpcode() != TargetOpcode::G_ASSERT_ZEXT ||
659349cc55cSDimitry Andric       OpTySize != Op0WideDef->getOperand(2).getImm() ||
660349cc55cSDimitry Andric       OpTySize != Op1WideDef->getOperand(2).getImm())
661349cc55cSDimitry Andric     return false;
662349cc55cSDimitry Andric 
663349cc55cSDimitry Andric   // Only scalar UADDO with either 8 or 16 bit operands are handled.
664349cc55cSDimitry Andric   if (!WideTy0.isScalar() || !WideTy1.isScalar() || WideTy0 != WideTy1 ||
665349cc55cSDimitry Andric       OpTySize >= WideTy0.getScalarSizeInBits() ||
666349cc55cSDimitry Andric       (OpTySize != 8 && OpTySize != 16))
667349cc55cSDimitry Andric     return false;
668349cc55cSDimitry Andric 
669349cc55cSDimitry Andric   // The overflow-status result must be used by a branch only.
670349cc55cSDimitry Andric   Register ResStatus = MI.getOperand(1).getReg();
671349cc55cSDimitry Andric   if (!MRI.hasOneNonDBGUse(ResStatus))
672349cc55cSDimitry Andric     return false;
673349cc55cSDimitry Andric   MachineInstr *CondUser = &*MRI.use_instr_nodbg_begin(ResStatus);
674349cc55cSDimitry Andric   if (CondUser->getOpcode() != TargetOpcode::G_BRCOND)
675349cc55cSDimitry Andric     return false;
676349cc55cSDimitry Andric 
677349cc55cSDimitry Andric   // Make sure the computed result is only used in the no-overflow blocks.
678349cc55cSDimitry Andric   MachineBasicBlock *CurrentMBB = MI.getParent();
679349cc55cSDimitry Andric   MachineBasicBlock *FailMBB = CondUser->getOperand(1).getMBB();
680349cc55cSDimitry Andric   if (!FailMBB->succ_empty() || CondUser->getParent() != CurrentMBB)
681349cc55cSDimitry Andric     return false;
682349cc55cSDimitry Andric   if (any_of(MRI.use_nodbg_instructions(ResVal),
683349cc55cSDimitry Andric              [&MI, FailMBB, CurrentMBB](MachineInstr &I) {
684349cc55cSDimitry Andric                return &MI != &I &&
685349cc55cSDimitry Andric                       (I.getParent() == FailMBB || I.getParent() == CurrentMBB);
686349cc55cSDimitry Andric              }))
687349cc55cSDimitry Andric     return false;
688349cc55cSDimitry Andric 
689349cc55cSDimitry Andric   // Remove G_ADDO.
690349cc55cSDimitry Andric   B.setInstrAndDebugLoc(*MI.getNextNode());
691349cc55cSDimitry Andric   MI.eraseFromParent();
692349cc55cSDimitry Andric 
693349cc55cSDimitry Andric   // Emit wide add.
694349cc55cSDimitry Andric   Register AddDst = MRI.cloneVirtualRegister(Op0Wide);
695349cc55cSDimitry Andric   B.buildInstr(TargetOpcode::G_ADD, {AddDst}, {Op0Wide, Op1Wide});
696349cc55cSDimitry Andric 
697349cc55cSDimitry Andric   // Emit check of the 9th or 17th bit and update users (the branch). This will
698349cc55cSDimitry Andric   // later be folded to TBNZ.
699349cc55cSDimitry Andric   Register CondBit = MRI.cloneVirtualRegister(Op0Wide);
700349cc55cSDimitry Andric   B.buildAnd(
701349cc55cSDimitry Andric       CondBit, AddDst,
702349cc55cSDimitry Andric       B.buildConstant(LLT::scalar(32), OpTySize == 8 ? 1 << 8 : 1 << 16));
703349cc55cSDimitry Andric   B.buildICmp(CmpInst::ICMP_NE, ResStatus, CondBit,
704349cc55cSDimitry Andric               B.buildConstant(LLT::scalar(32), 0));
705349cc55cSDimitry Andric 
706349cc55cSDimitry Andric   // Update ZEXts users of the result value. Because all uses are in the
707349cc55cSDimitry Andric   // no-overflow case, we know that the top bits are 0 and we can ignore ZExts.
708349cc55cSDimitry Andric   B.buildZExtOrTrunc(ResVal, AddDst);
709349cc55cSDimitry Andric   for (MachineOperand &U : make_early_inc_range(MRI.use_operands(ResVal))) {
710349cc55cSDimitry Andric     Register WideReg;
711349cc55cSDimitry Andric     if (mi_match(U.getParent(), MRI, m_GZExt(m_Reg(WideReg)))) {
712349cc55cSDimitry Andric       auto OldR = U.getParent()->getOperand(0).getReg();
713349cc55cSDimitry Andric       Observer.erasingInstr(*U.getParent());
714349cc55cSDimitry Andric       U.getParent()->eraseFromParent();
715349cc55cSDimitry Andric       Helper.replaceRegWith(MRI, OldR, AddDst);
716349cc55cSDimitry Andric     }
717349cc55cSDimitry Andric   }
718349cc55cSDimitry Andric 
719349cc55cSDimitry Andric   return true;
720349cc55cSDimitry Andric }
721349cc55cSDimitry Andric 
7225f757f3fSDimitry Andric class AArch64PreLegalizerCombinerImpl : public Combiner {
7235ffd83dbSDimitry Andric protected:
7245f757f3fSDimitry Andric   // TODO: Make CombinerHelper methods const.
7255f757f3fSDimitry Andric   mutable CombinerHelper Helper;
72606c3fb27SDimitry Andric   const AArch64PreLegalizerCombinerImplRuleConfig &RuleConfig;
72706c3fb27SDimitry Andric   const AArch64Subtarget &STI;
7285ffd83dbSDimitry Andric 
7295ffd83dbSDimitry Andric public:
73006c3fb27SDimitry Andric   AArch64PreLegalizerCombinerImpl(
7315f757f3fSDimitry Andric       MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
7325f757f3fSDimitry Andric       GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
73306c3fb27SDimitry Andric       const AArch64PreLegalizerCombinerImplRuleConfig &RuleConfig,
7345f757f3fSDimitry Andric       const AArch64Subtarget &STI, MachineDominatorTree *MDT,
7355f757f3fSDimitry Andric       const LegalizerInfo *LI);
73606c3fb27SDimitry Andric 
73706c3fb27SDimitry Andric   static const char *getName() { return "AArch6400PreLegalizerCombiner"; }
73806c3fb27SDimitry Andric 
7395f757f3fSDimitry Andric   bool tryCombineAll(MachineInstr &I) const override;
7405f757f3fSDimitry Andric 
7415f757f3fSDimitry Andric   bool tryCombineAllImpl(MachineInstr &I) const;
74206c3fb27SDimitry Andric 
74306c3fb27SDimitry Andric private:
74406c3fb27SDimitry Andric #define GET_GICOMBINER_CLASS_MEMBERS
74506c3fb27SDimitry Andric #include "AArch64GenPreLegalizeGICombiner.inc"
74606c3fb27SDimitry Andric #undef GET_GICOMBINER_CLASS_MEMBERS
7475ffd83dbSDimitry Andric };
7485ffd83dbSDimitry Andric 
74906c3fb27SDimitry Andric #define GET_GICOMBINER_IMPL
7505ffd83dbSDimitry Andric #include "AArch64GenPreLegalizeGICombiner.inc"
75106c3fb27SDimitry Andric #undef GET_GICOMBINER_IMPL
7525ffd83dbSDimitry Andric 
// Builds the combiner implementation for MF. The base Combiner receives the
// function, combiner info, pass config, known-bits analysis and CSE info; the
// CombinerHelper is set up for pre-legalization use with the known-bits
// analysis, dominator tree and legalizer info. RuleConfig and STI are stored
// by reference.
AArch64PreLegalizerCombinerImpl::AArch64PreLegalizerCombinerImpl(
    MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
    GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
    const AArch64PreLegalizerCombinerImplRuleConfig &RuleConfig,
    const AArch64Subtarget &STI, MachineDominatorTree *MDT,
    const LegalizerInfo *LI)
    : Combiner(MF, CInfo, TPC, &KB, CSEInfo),
      Helper(Observer, B, /*IsPreLegalize*/ true, &KB, MDT, LI),
      RuleConfig(RuleConfig), STI(STI),
      // The remaining member initializers come from the generated include.
#define GET_GICOMBINER_CONSTRUCTOR_INITS
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CONSTRUCTOR_INITS
{
}
7675ffd83dbSDimitry Andric 
7685f757f3fSDimitry Andric bool AArch64PreLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const {
7695f757f3fSDimitry Andric   if (tryCombineAllImpl(MI))
7705ffd83dbSDimitry Andric     return true;
7715ffd83dbSDimitry Andric 
772fe6060f1SDimitry Andric   unsigned Opc = MI.getOpcode();
773fe6060f1SDimitry Andric   switch (Opc) {
7745ffd83dbSDimitry Andric   case TargetOpcode::G_SHUFFLE_VECTOR:
7755ffd83dbSDimitry Andric     return Helper.tryCombineShuffleVector(MI);
776349cc55cSDimitry Andric   case TargetOpcode::G_UADDO:
777349cc55cSDimitry Andric     return tryToSimplifyUADDO(MI, B, Helper, Observer);
778fe6060f1SDimitry Andric   case TargetOpcode::G_MEMCPY_INLINE:
779fe6060f1SDimitry Andric     return Helper.tryEmitMemcpyInline(MI);
780e8d8bef9SDimitry Andric   case TargetOpcode::G_MEMCPY:
781e8d8bef9SDimitry Andric   case TargetOpcode::G_MEMMOVE:
782e8d8bef9SDimitry Andric   case TargetOpcode::G_MEMSET: {
783e8d8bef9SDimitry Andric     // If we're at -O0 set a maxlen of 32 to inline, otherwise let the other
784e8d8bef9SDimitry Andric     // heuristics decide.
7855f757f3fSDimitry Andric     unsigned MaxLen = CInfo.EnableOpt ? 0 : 32;
786e8d8bef9SDimitry Andric     // Try to inline memcpy type calls if optimizations are enabled.
787fe6060f1SDimitry Andric     if (Helper.tryCombineMemCpyFamily(MI, MaxLen))
788fe6060f1SDimitry Andric       return true;
789fe6060f1SDimitry Andric     if (Opc == TargetOpcode::G_MEMSET)
7905f757f3fSDimitry Andric       return llvm::AArch64GISelUtils::tryEmitBZero(MI, B, CInfo.EnableMinSize);
791fe6060f1SDimitry Andric     return false;
792e8d8bef9SDimitry Andric   }
7935ffd83dbSDimitry Andric   }
7945ffd83dbSDimitry Andric 
7955ffd83dbSDimitry Andric   return false;
7965ffd83dbSDimitry Andric }
7975ffd83dbSDimitry Andric 
7985ffd83dbSDimitry Andric // Pass boilerplate
7995ffd83dbSDimitry Andric // ================
8005ffd83dbSDimitry Andric 
8015ffd83dbSDimitry Andric class AArch64PreLegalizerCombiner : public MachineFunctionPass {
8025ffd83dbSDimitry Andric public:
8035ffd83dbSDimitry Andric   static char ID;
8045ffd83dbSDimitry Andric 
805fe6060f1SDimitry Andric   AArch64PreLegalizerCombiner();
8065ffd83dbSDimitry Andric 
80706c3fb27SDimitry Andric   StringRef getPassName() const override {
80806c3fb27SDimitry Andric     return "AArch64PreLegalizerCombiner";
80906c3fb27SDimitry Andric   }
8105ffd83dbSDimitry Andric 
8115ffd83dbSDimitry Andric   bool runOnMachineFunction(MachineFunction &MF) override;
8125ffd83dbSDimitry Andric 
8135ffd83dbSDimitry Andric   void getAnalysisUsage(AnalysisUsage &AU) const override;
8145f757f3fSDimitry Andric 
8155f757f3fSDimitry Andric private:
8165f757f3fSDimitry Andric   AArch64PreLegalizerCombinerImplRuleConfig RuleConfig;
8175ffd83dbSDimitry Andric };
8185ffd83dbSDimitry Andric } // end anonymous namespace
8195ffd83dbSDimitry Andric 
// Registers the analyses this pass depends on. The pass requires the target
// pass config, known-bits analysis, machine dominator tree and CSE analysis;
// it preserves the CFG, known bits, dominator tree and CSE info.
void AArch64PreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  // Adds whatever the SelectionDAG fallback path needs to AU.
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  AU.addRequired<MachineDominatorTreeWrapperPass>();
  AU.addPreserved<MachineDominatorTreeWrapperPass>();
  AU.addRequired<GISelCSEAnalysisWrapperPass>();
  AU.addPreserved<GISelCSEAnalysisWrapperPass>();
  // Let the base class add its own requirements.
  MachineFunctionPass::getAnalysisUsage(AU);
}
8325ffd83dbSDimitry Andric 
833fe6060f1SDimitry Andric AArch64PreLegalizerCombiner::AArch64PreLegalizerCombiner()
834fe6060f1SDimitry Andric     : MachineFunctionPass(ID) {
8355ffd83dbSDimitry Andric   initializeAArch64PreLegalizerCombinerPass(*PassRegistry::getPassRegistry());
8365f757f3fSDimitry Andric 
8375f757f3fSDimitry Andric   if (!RuleConfig.parseCommandLineOption())
8385f757f3fSDimitry Andric     report_fatal_error("Invalid rule identifier");
8395ffd83dbSDimitry Andric }
8405ffd83dbSDimitry Andric 
8415ffd83dbSDimitry Andric bool AArch64PreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
8425ffd83dbSDimitry Andric   if (MF.getProperties().hasProperty(
8435ffd83dbSDimitry Andric           MachineFunctionProperties::Property::FailedISel))
8445ffd83dbSDimitry Andric     return false;
845fe6060f1SDimitry Andric   auto &TPC = getAnalysis<TargetPassConfig>();
846fe6060f1SDimitry Andric 
847fe6060f1SDimitry Andric   // Enable CSE.
848fe6060f1SDimitry Andric   GISelCSEAnalysisWrapper &Wrapper =
849fe6060f1SDimitry Andric       getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
850fe6060f1SDimitry Andric   auto *CSEInfo = &Wrapper.get(TPC.getCSEConfig());
851fe6060f1SDimitry Andric 
8525f757f3fSDimitry Andric   const AArch64Subtarget &ST = MF.getSubtarget<AArch64Subtarget>();
8535f757f3fSDimitry Andric   const auto *LI = ST.getLegalizerInfo();
8545f757f3fSDimitry Andric 
8555ffd83dbSDimitry Andric   const Function &F = MF.getFunction();
8565ffd83dbSDimitry Andric   bool EnableOpt =
8575f757f3fSDimitry Andric       MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F);
8585ffd83dbSDimitry Andric   GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
859*0fca6ea1SDimitry Andric   MachineDominatorTree *MDT =
860*0fca6ea1SDimitry Andric       &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
8615f757f3fSDimitry Andric   CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
8625f757f3fSDimitry Andric                      /*LegalizerInfo*/ nullptr, EnableOpt, F.hasOptSize(),
8635f757f3fSDimitry Andric                      F.hasMinSize());
8645f757f3fSDimitry Andric   AArch64PreLegalizerCombinerImpl Impl(MF, CInfo, &TPC, *KB, CSEInfo,
8655f757f3fSDimitry Andric                                        RuleConfig, ST, MDT, LI);
8665f757f3fSDimitry Andric   return Impl.combineMachineInstrs();
8675ffd83dbSDimitry Andric }
8685ffd83dbSDimitry Andric 
8695ffd83dbSDimitry Andric char AArch64PreLegalizerCombiner::ID = 0;
8705ffd83dbSDimitry Andric INITIALIZE_PASS_BEGIN(AArch64PreLegalizerCombiner, DEBUG_TYPE,
8715ffd83dbSDimitry Andric                       "Combine AArch64 machine instrs before legalization",
8725ffd83dbSDimitry Andric                       false, false)
8735ffd83dbSDimitry Andric INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
8745ffd83dbSDimitry Andric INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
875fe6060f1SDimitry Andric INITIALIZE_PASS_DEPENDENCY(GISelCSEAnalysisWrapperPass)
8765ffd83dbSDimitry Andric INITIALIZE_PASS_END(AArch64PreLegalizerCombiner, DEBUG_TYPE,
8775ffd83dbSDimitry Andric                     "Combine AArch64 machine instrs before legalization", false,
8785ffd83dbSDimitry Andric                     false)
8795ffd83dbSDimitry Andric 
8805ffd83dbSDimitry Andric namespace llvm {
881fe6060f1SDimitry Andric FunctionPass *createAArch64PreLegalizerCombiner() {
882fe6060f1SDimitry Andric   return new AArch64PreLegalizerCombiner();
8835ffd83dbSDimitry Andric }
8845ffd83dbSDimitry Andric } // end namespace llvm
885