xref: /freebsd-src/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp (revision 5f757f3ff9144b609b3c433dfd370cc6bdc191ad)
10b57cec5SDimitry Andric //
20b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
30b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
40b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
50b57cec5SDimitry Andric //
60b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
70b57cec5SDimitry Andric //
80b57cec5SDimitry Andric // This file contains a pass that performs optimization on SIMD instructions
90b57cec5SDimitry Andric // with high latency by splitting them into more efficient series of
100b57cec5SDimitry Andric // instructions.
110b57cec5SDimitry Andric //
120b57cec5SDimitry Andric // 1. Rewrite certain SIMD instructions with vector element due to their
130b57cec5SDimitry Andric // inefficiency on some targets.
140b57cec5SDimitry Andric //
150b57cec5SDimitry Andric // For example:
160b57cec5SDimitry Andric //    fmla v0.4s, v1.4s, v2.s[1]
170b57cec5SDimitry Andric //
180b57cec5SDimitry Andric // Is rewritten into:
190b57cec5SDimitry Andric //    dup v3.4s, v2.s[1]
200b57cec5SDimitry Andric //    fmla v0.4s, v1.4s, v3.4s
210b57cec5SDimitry Andric //
220b57cec5SDimitry Andric // 2. Rewrite interleaved memory access instructions due to their
230b57cec5SDimitry Andric // inefficiency on some targets.
240b57cec5SDimitry Andric //
250b57cec5SDimitry Andric // For example:
260b57cec5SDimitry Andric //    st2 {v0.4s, v1.4s}, addr
270b57cec5SDimitry Andric //
280b57cec5SDimitry Andric // Is rewritten into:
290b57cec5SDimitry Andric //    zip1 v2.4s, v0.4s, v1.4s
300b57cec5SDimitry Andric //    zip2 v3.4s, v0.4s, v1.4s
310b57cec5SDimitry Andric //    stp  q2, q3,  addr
320b57cec5SDimitry Andric //
330b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
340b57cec5SDimitry Andric 
350b57cec5SDimitry Andric #include "AArch64InstrInfo.h"
360b57cec5SDimitry Andric #include "llvm/ADT/SmallVector.h"
370b57cec5SDimitry Andric #include "llvm/ADT/Statistic.h"
380b57cec5SDimitry Andric #include "llvm/ADT/StringRef.h"
390b57cec5SDimitry Andric #include "llvm/CodeGen/MachineBasicBlock.h"
400b57cec5SDimitry Andric #include "llvm/CodeGen/MachineFunction.h"
410b57cec5SDimitry Andric #include "llvm/CodeGen/MachineFunctionPass.h"
420b57cec5SDimitry Andric #include "llvm/CodeGen/MachineInstr.h"
430b57cec5SDimitry Andric #include "llvm/CodeGen/MachineInstrBuilder.h"
440b57cec5SDimitry Andric #include "llvm/CodeGen/MachineOperand.h"
450b57cec5SDimitry Andric #include "llvm/CodeGen/MachineRegisterInfo.h"
460b57cec5SDimitry Andric #include "llvm/CodeGen/TargetInstrInfo.h"
470b57cec5SDimitry Andric #include "llvm/CodeGen/TargetSchedule.h"
480b57cec5SDimitry Andric #include "llvm/CodeGen/TargetSubtargetInfo.h"
490b57cec5SDimitry Andric #include "llvm/MC/MCInstrDesc.h"
500b57cec5SDimitry Andric #include "llvm/MC/MCSchedule.h"
510b57cec5SDimitry Andric #include "llvm/Pass.h"
520b57cec5SDimitry Andric #include <unordered_map>
53*5f757f3fSDimitry Andric #include <map>
540b57cec5SDimitry Andric 
550b57cec5SDimitry Andric using namespace llvm;
560b57cec5SDimitry Andric 
570b57cec5SDimitry Andric #define DEBUG_TYPE "aarch64-simdinstr-opt"
580b57cec5SDimitry Andric 
590b57cec5SDimitry Andric STATISTIC(NumModifiedInstr,
600b57cec5SDimitry Andric           "Number of SIMD instructions modified");
610b57cec5SDimitry Andric 
620b57cec5SDimitry Andric #define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME                                     \
630b57cec5SDimitry Andric   "AArch64 SIMD instructions optimization pass"
640b57cec5SDimitry Andric 
650b57cec5SDimitry Andric namespace {
660b57cec5SDimitry Andric 
670b57cec5SDimitry Andric struct AArch64SIMDInstrOpt : public MachineFunctionPass {
680b57cec5SDimitry Andric   static char ID;
690b57cec5SDimitry Andric 
700b57cec5SDimitry Andric   const TargetInstrInfo *TII;
710b57cec5SDimitry Andric   MachineRegisterInfo *MRI;
720b57cec5SDimitry Andric   TargetSchedModel SchedModel;
730b57cec5SDimitry Andric 
740b57cec5SDimitry Andric   // The two maps below are used to cache decisions instead of recomputing:
750b57cec5SDimitry Andric   // This is used to cache instruction replacement decisions within function
760b57cec5SDimitry Andric   // units and across function units.
770b57cec5SDimitry Andric   std::map<std::pair<unsigned, std::string>, bool> SIMDInstrTable;
780b57cec5SDimitry Andric   // This is used to cache the decision of whether to leave the interleaved
790b57cec5SDimitry Andric   // store instructions replacement pass early or not for a particular target.
800b57cec5SDimitry Andric   std::unordered_map<std::string, bool> InterlEarlyExit;
810b57cec5SDimitry Andric 
820b57cec5SDimitry Andric   typedef enum {
830b57cec5SDimitry Andric     VectorElem,
840b57cec5SDimitry Andric     Interleave
850b57cec5SDimitry Andric   } Subpass;
860b57cec5SDimitry Andric 
870b57cec5SDimitry Andric   // Instruction represented by OrigOpc is replaced by instructions in ReplOpc.
880b57cec5SDimitry Andric   struct InstReplInfo {
890b57cec5SDimitry Andric     unsigned OrigOpc;
900b57cec5SDimitry Andric 		std::vector<unsigned> ReplOpc;
910b57cec5SDimitry Andric     const TargetRegisterClass RC;
920b57cec5SDimitry Andric   };
930b57cec5SDimitry Andric 
940b57cec5SDimitry Andric #define RuleST2(OpcOrg, OpcR0, OpcR1, OpcR2, RC) \
950b57cec5SDimitry Andric   {OpcOrg, {OpcR0, OpcR1, OpcR2}, RC}
960b57cec5SDimitry Andric #define RuleST4(OpcOrg, OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, \
970b57cec5SDimitry Andric                 OpcR7, OpcR8, OpcR9, RC) \
980b57cec5SDimitry Andric   {OpcOrg, \
990b57cec5SDimitry Andric    {OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, OpcR7, OpcR8, OpcR9}, RC}
1000b57cec5SDimitry Andric 
1010b57cec5SDimitry Andric   // The Instruction Replacement Table:
1020b57cec5SDimitry Andric   std::vector<InstReplInfo> IRT = {
1030b57cec5SDimitry Andric     // ST2 instructions
1040b57cec5SDimitry Andric     RuleST2(AArch64::ST2Twov2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
1050b57cec5SDimitry Andric           AArch64::STPQi, AArch64::FPR128RegClass),
1060b57cec5SDimitry Andric     RuleST2(AArch64::ST2Twov4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
1070b57cec5SDimitry Andric           AArch64::STPQi, AArch64::FPR128RegClass),
1080b57cec5SDimitry Andric     RuleST2(AArch64::ST2Twov2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
1090b57cec5SDimitry Andric           AArch64::STPDi, AArch64::FPR64RegClass),
1100b57cec5SDimitry Andric     RuleST2(AArch64::ST2Twov8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
1110b57cec5SDimitry Andric           AArch64::STPQi, AArch64::FPR128RegClass),
1120b57cec5SDimitry Andric     RuleST2(AArch64::ST2Twov4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
1130b57cec5SDimitry Andric           AArch64::STPDi, AArch64::FPR64RegClass),
1140b57cec5SDimitry Andric     RuleST2(AArch64::ST2Twov16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
1150b57cec5SDimitry Andric           AArch64::STPQi, AArch64::FPR128RegClass),
1160b57cec5SDimitry Andric     RuleST2(AArch64::ST2Twov8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
1170b57cec5SDimitry Andric           AArch64::STPDi, AArch64::FPR64RegClass),
1180b57cec5SDimitry Andric     // ST4 instructions
1190b57cec5SDimitry Andric     RuleST4(AArch64::ST4Fourv2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
1200b57cec5SDimitry Andric           AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, AArch64::ZIP1v2i64,
1210b57cec5SDimitry Andric           AArch64::ZIP2v2i64, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
1220b57cec5SDimitry Andric           AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
1230b57cec5SDimitry Andric     RuleST4(AArch64::ST4Fourv4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
1240b57cec5SDimitry Andric           AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, AArch64::ZIP1v4i32,
1250b57cec5SDimitry Andric           AArch64::ZIP2v4i32, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
1260b57cec5SDimitry Andric           AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
1270b57cec5SDimitry Andric     RuleST4(AArch64::ST4Fourv2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
1280b57cec5SDimitry Andric           AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, AArch64::ZIP1v2i32,
1290b57cec5SDimitry Andric           AArch64::ZIP2v2i32, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
1300b57cec5SDimitry Andric           AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
1310b57cec5SDimitry Andric     RuleST4(AArch64::ST4Fourv8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
1320b57cec5SDimitry Andric           AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, AArch64::ZIP1v8i16,
1330b57cec5SDimitry Andric           AArch64::ZIP2v8i16, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
1340b57cec5SDimitry Andric           AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
1350b57cec5SDimitry Andric     RuleST4(AArch64::ST4Fourv4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
1360b57cec5SDimitry Andric           AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, AArch64::ZIP1v4i16,
1370b57cec5SDimitry Andric           AArch64::ZIP2v4i16, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
1380b57cec5SDimitry Andric           AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
1390b57cec5SDimitry Andric     RuleST4(AArch64::ST4Fourv16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
1400b57cec5SDimitry Andric           AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, AArch64::ZIP1v16i8,
1410b57cec5SDimitry Andric           AArch64::ZIP2v16i8, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
1420b57cec5SDimitry Andric           AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
1430b57cec5SDimitry Andric     RuleST4(AArch64::ST4Fourv8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
1440b57cec5SDimitry Andric           AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, AArch64::ZIP1v8i8,
1450b57cec5SDimitry Andric           AArch64::ZIP2v8i8, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
1460b57cec5SDimitry Andric           AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass)
1470b57cec5SDimitry Andric   };
1480b57cec5SDimitry Andric 
1490b57cec5SDimitry Andric   // A costly instruction is replaced in this work by N efficient instructions
1500b57cec5SDimitry Andric   // The maximum of N is curently 10 and it is for ST4 case.
1510b57cec5SDimitry Andric   static const unsigned MaxNumRepl = 10;
1520b57cec5SDimitry Andric 
AArch64SIMDInstrOpt__anon600ea3290111::AArch64SIMDInstrOpt1530b57cec5SDimitry Andric   AArch64SIMDInstrOpt() : MachineFunctionPass(ID) {
1540b57cec5SDimitry Andric     initializeAArch64SIMDInstrOptPass(*PassRegistry::getPassRegistry());
1550b57cec5SDimitry Andric   }
1560b57cec5SDimitry Andric 
1570b57cec5SDimitry Andric   /// Based only on latency of instructions, determine if it is cost efficient
1580b57cec5SDimitry Andric   /// to replace the instruction InstDesc by the instructions stored in the
1590b57cec5SDimitry Andric   /// array InstDescRepl.
1600b57cec5SDimitry Andric   /// Return true if replacement is expected to be faster.
1610b57cec5SDimitry Andric   bool shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
1620b57cec5SDimitry Andric                          SmallVectorImpl<const MCInstrDesc*> &ReplInstrMCID);
1630b57cec5SDimitry Andric 
1640b57cec5SDimitry Andric   /// Determine if we need to exit the instruction replacement optimization
1650b57cec5SDimitry Andric   /// passes early. This makes sure that no compile time is spent in this pass
1660b57cec5SDimitry Andric   /// for targets with no need for any of these optimizations.
1670b57cec5SDimitry Andric   /// Return true if early exit of the pass is recommended.
1680b57cec5SDimitry Andric   bool shouldExitEarly(MachineFunction *MF, Subpass SP);
1690b57cec5SDimitry Andric 
1700b57cec5SDimitry Andric   /// Check whether an equivalent DUP instruction has already been
1710b57cec5SDimitry Andric   /// created or not.
1720b57cec5SDimitry Andric   /// Return true when the DUP instruction already exists. In this case,
1730b57cec5SDimitry Andric   /// DestReg will point to the destination of the already created DUP.
1740b57cec5SDimitry Andric   bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg,
1750b57cec5SDimitry Andric                 unsigned LaneNumber, unsigned *DestReg) const;
1760b57cec5SDimitry Andric 
1770b57cec5SDimitry Andric   /// Certain SIMD instructions with vector element operand are not efficient.
1780b57cec5SDimitry Andric   /// Rewrite them into SIMD instructions with vector operands. This rewrite
1790b57cec5SDimitry Andric   /// is driven by the latency of the instructions.
1800b57cec5SDimitry Andric   /// Return true if the SIMD instruction is modified.
1810b57cec5SDimitry Andric   bool optimizeVectElement(MachineInstr &MI);
1820b57cec5SDimitry Andric 
1830b57cec5SDimitry Andric   /// Process The REG_SEQUENCE instruction, and extract the source
1840b57cec5SDimitry Andric   /// operands of the ST2/4 instruction from it.
1850b57cec5SDimitry Andric   /// Example of such instructions.
1860b57cec5SDimitry Andric   ///    %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1;
1870b57cec5SDimitry Andric   /// Return true when the instruction is processed successfully.
1880b57cec5SDimitry Andric   bool processSeqRegInst(MachineInstr *DefiningMI, unsigned* StReg,
1890b57cec5SDimitry Andric                          unsigned* StRegKill, unsigned NumArg) const;
1900b57cec5SDimitry Andric 
1910b57cec5SDimitry Andric   /// Load/Store Interleaving instructions are not always beneficial.
1920b57cec5SDimitry Andric   /// Replace them by ZIP instructionand classical load/store.
1930b57cec5SDimitry Andric   /// Return true if the SIMD instruction is modified.
1940b57cec5SDimitry Andric   bool optimizeLdStInterleave(MachineInstr &MI);
1950b57cec5SDimitry Andric 
1960b57cec5SDimitry Andric   /// Return the number of useful source registers for this
1970b57cec5SDimitry Andric   /// instruction (2 for ST2 and 4 for ST4).
1980b57cec5SDimitry Andric   unsigned determineSrcReg(MachineInstr &MI) const;
1990b57cec5SDimitry Andric 
2000b57cec5SDimitry Andric   bool runOnMachineFunction(MachineFunction &Fn) override;
2010b57cec5SDimitry Andric 
getPassName__anon600ea3290111::AArch64SIMDInstrOpt2020b57cec5SDimitry Andric   StringRef getPassName() const override {
2030b57cec5SDimitry Andric     return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME;
2040b57cec5SDimitry Andric   }
2050b57cec5SDimitry Andric };
2060b57cec5SDimitry Andric 
2070b57cec5SDimitry Andric char AArch64SIMDInstrOpt::ID = 0;
2080b57cec5SDimitry Andric 
2090b57cec5SDimitry Andric } // end anonymous namespace
2100b57cec5SDimitry Andric 
// Pass initialization boilerplate for AArch64SIMDInstrOpt. It associates the
// pass class with the string identifier "aarch64-simdinstr-opt" and with the
// human-readable description AARCH64_VECTOR_BY_ELEMENT_OPT_NAME.
// NOTE(review): this macro presumably provides the
// initializeAArch64SIMDInstrOptPass() function called from the pass
// constructor, and the two trailing `false` arguments are presumably the
// "CFG-only" and "is analysis" flags -- confirm against the LLVM pass
// support headers.
2110b57cec5SDimitry Andric INITIALIZE_PASS(AArch64SIMDInstrOpt, "aarch64-simdinstr-opt",
2120b57cec5SDimitry Andric                 AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false)
2130b57cec5SDimitry Andric 
2140b57cec5SDimitry Andric /// Based only on latency of instructions, determine if it is cost efficient
2150b57cec5SDimitry Andric /// to replace the instruction InstDesc by the instructions stored in the
2160b57cec5SDimitry Andric /// array InstDescRepl.
2170b57cec5SDimitry Andric /// Return true if replacement is expected to be faster.
2180b57cec5SDimitry Andric bool AArch64SIMDInstrOpt::
shouldReplaceInst(MachineFunction * MF,const MCInstrDesc * InstDesc,SmallVectorImpl<const MCInstrDesc * > & InstDescRepl)2190b57cec5SDimitry Andric shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
2200b57cec5SDimitry Andric                   SmallVectorImpl<const MCInstrDesc*> &InstDescRepl) {
2210b57cec5SDimitry Andric   // Check if replacement decision is already available in the cached table.
2220b57cec5SDimitry Andric   // if so, return it.
2235ffd83dbSDimitry Andric   std::string Subtarget = std::string(SchedModel.getSubtargetInfo()->getCPU());
2240b57cec5SDimitry Andric   auto InstID = std::make_pair(InstDesc->getOpcode(), Subtarget);
225e8d8bef9SDimitry Andric   auto It = SIMDInstrTable.find(InstID);
226e8d8bef9SDimitry Andric   if (It != SIMDInstrTable.end())
227e8d8bef9SDimitry Andric     return It->second;
2280b57cec5SDimitry Andric 
2290b57cec5SDimitry Andric   unsigned SCIdx = InstDesc->getSchedClass();
2300b57cec5SDimitry Andric   const MCSchedClassDesc *SCDesc =
2310b57cec5SDimitry Andric     SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);
2320b57cec5SDimitry Andric 
2330b57cec5SDimitry Andric   // If a target does not define resources for the instructions
2340b57cec5SDimitry Andric   // of interest, then return false for no replacement.
2350b57cec5SDimitry Andric   const MCSchedClassDesc *SCDescRepl;
2360b57cec5SDimitry Andric   if (!SCDesc->isValid() || SCDesc->isVariant())
2370b57cec5SDimitry Andric   {
2380b57cec5SDimitry Andric     SIMDInstrTable[InstID] = false;
2390b57cec5SDimitry Andric     return false;
2400b57cec5SDimitry Andric   }
241bdd1243dSDimitry Andric   for (const auto *IDesc : InstDescRepl)
2420b57cec5SDimitry Andric   {
2430b57cec5SDimitry Andric     SCDescRepl = SchedModel.getMCSchedModel()->getSchedClassDesc(
2440b57cec5SDimitry Andric       IDesc->getSchedClass());
2450b57cec5SDimitry Andric     if (!SCDescRepl->isValid() || SCDescRepl->isVariant())
2460b57cec5SDimitry Andric     {
2470b57cec5SDimitry Andric       SIMDInstrTable[InstID] = false;
2480b57cec5SDimitry Andric       return false;
2490b57cec5SDimitry Andric     }
2500b57cec5SDimitry Andric   }
2510b57cec5SDimitry Andric 
2520b57cec5SDimitry Andric   // Replacement cost.
2530b57cec5SDimitry Andric   unsigned ReplCost = 0;
254bdd1243dSDimitry Andric   for (const auto *IDesc :InstDescRepl)
2550b57cec5SDimitry Andric     ReplCost += SchedModel.computeInstrLatency(IDesc->getOpcode());
2560b57cec5SDimitry Andric 
2570b57cec5SDimitry Andric   if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) > ReplCost)
2580b57cec5SDimitry Andric   {
2590b57cec5SDimitry Andric     SIMDInstrTable[InstID] = true;
2600b57cec5SDimitry Andric     return true;
2610b57cec5SDimitry Andric   }
2620b57cec5SDimitry Andric   else
2630b57cec5SDimitry Andric   {
2640b57cec5SDimitry Andric     SIMDInstrTable[InstID] = false;
2650b57cec5SDimitry Andric     return false;
2660b57cec5SDimitry Andric   }
2670b57cec5SDimitry Andric }
2680b57cec5SDimitry Andric 
2690b57cec5SDimitry Andric /// Determine if we need to exit this pass for a kind of instruction replacement
2700b57cec5SDimitry Andric /// early. This makes sure that no compile time is spent in this pass for
2710b57cec5SDimitry Andric /// targets with no need for any of these optimizations beyond performing this
2720b57cec5SDimitry Andric /// check.
2730b57cec5SDimitry Andric /// Return true if early exit of this pass for a kind of instruction
2740b57cec5SDimitry Andric /// replacement is recommended for a target.
shouldExitEarly(MachineFunction * MF,Subpass SP)2750b57cec5SDimitry Andric bool AArch64SIMDInstrOpt::shouldExitEarly(MachineFunction *MF, Subpass SP) {
2760b57cec5SDimitry Andric   const MCInstrDesc* OriginalMCID;
2770b57cec5SDimitry Andric   SmallVector<const MCInstrDesc*, MaxNumRepl> ReplInstrMCID;
2780b57cec5SDimitry Andric 
2790b57cec5SDimitry Andric   switch (SP) {
2800b57cec5SDimitry Andric   // For this optimization, check by comparing the latency of a representative
2810b57cec5SDimitry Andric   // instruction to that of the replacement instructions.
2820b57cec5SDimitry Andric   // TODO: check for all concerned instructions.
2830b57cec5SDimitry Andric   case VectorElem:
2840b57cec5SDimitry Andric     OriginalMCID = &TII->get(AArch64::FMLAv4i32_indexed);
2850b57cec5SDimitry Andric     ReplInstrMCID.push_back(&TII->get(AArch64::DUPv4i32lane));
2860b57cec5SDimitry Andric     ReplInstrMCID.push_back(&TII->get(AArch64::FMLAv4f32));
2870b57cec5SDimitry Andric     if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID))
2880b57cec5SDimitry Andric       return false;
2890b57cec5SDimitry Andric     break;
2900b57cec5SDimitry Andric 
2910b57cec5SDimitry Andric   // For this optimization, check for all concerned instructions.
2920b57cec5SDimitry Andric   case Interleave:
2935ffd83dbSDimitry Andric     std::string Subtarget =
2945ffd83dbSDimitry Andric         std::string(SchedModel.getSubtargetInfo()->getCPU());
295e8d8bef9SDimitry Andric     auto It = InterlEarlyExit.find(Subtarget);
296e8d8bef9SDimitry Andric     if (It != InterlEarlyExit.end())
297e8d8bef9SDimitry Andric       return It->second;
2980b57cec5SDimitry Andric 
2990b57cec5SDimitry Andric     for (auto &I : IRT) {
3000b57cec5SDimitry Andric       OriginalMCID = &TII->get(I.OrigOpc);
3010b57cec5SDimitry Andric       for (auto &Repl : I.ReplOpc)
3020b57cec5SDimitry Andric         ReplInstrMCID.push_back(&TII->get(Repl));
3030b57cec5SDimitry Andric       if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID)) {
3040b57cec5SDimitry Andric         InterlEarlyExit[Subtarget] = false;
3050b57cec5SDimitry Andric         return false;
3060b57cec5SDimitry Andric       }
3070b57cec5SDimitry Andric       ReplInstrMCID.clear();
3080b57cec5SDimitry Andric     }
3090b57cec5SDimitry Andric     InterlEarlyExit[Subtarget] = true;
3100b57cec5SDimitry Andric     break;
3110b57cec5SDimitry Andric   }
3120b57cec5SDimitry Andric 
3130b57cec5SDimitry Andric   return true;
3140b57cec5SDimitry Andric }
3150b57cec5SDimitry Andric 
3160b57cec5SDimitry Andric /// Check whether an equivalent DUP instruction has already been
3170b57cec5SDimitry Andric /// created or not.
3180b57cec5SDimitry Andric /// Return true when the DUP instruction already exists. In this case,
3190b57cec5SDimitry Andric /// DestReg will point to the destination of the already created DUP.
reuseDUP(MachineInstr & MI,unsigned DupOpcode,unsigned SrcReg,unsigned LaneNumber,unsigned * DestReg) const3200b57cec5SDimitry Andric bool AArch64SIMDInstrOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode,
3210b57cec5SDimitry Andric                                          unsigned SrcReg, unsigned LaneNumber,
3220b57cec5SDimitry Andric                                          unsigned *DestReg) const {
3230b57cec5SDimitry Andric   for (MachineBasicBlock::iterator MII = MI, MIE = MI.getParent()->begin();
3240b57cec5SDimitry Andric        MII != MIE;) {
3250b57cec5SDimitry Andric     MII--;
3260b57cec5SDimitry Andric     MachineInstr *CurrentMI = &*MII;
3270b57cec5SDimitry Andric 
3280b57cec5SDimitry Andric     if (CurrentMI->getOpcode() == DupOpcode &&
3290b57cec5SDimitry Andric         CurrentMI->getNumOperands() == 3 &&
3300b57cec5SDimitry Andric         CurrentMI->getOperand(1).getReg() == SrcReg &&
3310b57cec5SDimitry Andric         CurrentMI->getOperand(2).getImm() == LaneNumber) {
3320b57cec5SDimitry Andric       *DestReg = CurrentMI->getOperand(0).getReg();
3330b57cec5SDimitry Andric       return true;
3340b57cec5SDimitry Andric     }
3350b57cec5SDimitry Andric   }
3360b57cec5SDimitry Andric 
3370b57cec5SDimitry Andric   return false;
3380b57cec5SDimitry Andric }
3390b57cec5SDimitry Andric 
3400b57cec5SDimitry Andric /// Certain SIMD instructions with vector element operand are not efficient.
3410b57cec5SDimitry Andric /// Rewrite them into SIMD instructions with vector operands. This rewrite
3420b57cec5SDimitry Andric /// is driven by the latency of the instructions.
3430b57cec5SDimitry Andric /// The instruction of concerns are for the time being FMLA, FMLS, FMUL,
3440b57cec5SDimitry Andric /// and FMULX and hence they are hardcoded.
3450b57cec5SDimitry Andric ///
3460b57cec5SDimitry Andric /// For example:
3470b57cec5SDimitry Andric ///    fmla v0.4s, v1.4s, v2.s[1]
3480b57cec5SDimitry Andric ///
3490b57cec5SDimitry Andric /// Is rewritten into
3500b57cec5SDimitry Andric ///    dup  v3.4s, v2.s[1]      // DUP not necessary if redundant
3510b57cec5SDimitry Andric ///    fmla v0.4s, v1.4s, v3.4s
3520b57cec5SDimitry Andric ///
3530b57cec5SDimitry Andric /// Return true if the SIMD instruction is modified.
optimizeVectElement(MachineInstr & MI)3540b57cec5SDimitry Andric bool AArch64SIMDInstrOpt::optimizeVectElement(MachineInstr &MI) {
3550b57cec5SDimitry Andric   const MCInstrDesc *MulMCID, *DupMCID;
3560b57cec5SDimitry Andric   const TargetRegisterClass *RC = &AArch64::FPR128RegClass;
3570b57cec5SDimitry Andric 
3580b57cec5SDimitry Andric   switch (MI.getOpcode()) {
3590b57cec5SDimitry Andric   default:
3600b57cec5SDimitry Andric     return false;
3610b57cec5SDimitry Andric 
3620b57cec5SDimitry Andric   // 4X32 instructions
3630b57cec5SDimitry Andric   case AArch64::FMLAv4i32_indexed:
3640b57cec5SDimitry Andric     DupMCID = &TII->get(AArch64::DUPv4i32lane);
3650b57cec5SDimitry Andric     MulMCID = &TII->get(AArch64::FMLAv4f32);
3660b57cec5SDimitry Andric     break;
3670b57cec5SDimitry Andric   case AArch64::FMLSv4i32_indexed:
3680b57cec5SDimitry Andric     DupMCID = &TII->get(AArch64::DUPv4i32lane);
3690b57cec5SDimitry Andric     MulMCID = &TII->get(AArch64::FMLSv4f32);
3700b57cec5SDimitry Andric     break;
3710b57cec5SDimitry Andric   case AArch64::FMULXv4i32_indexed:
3720b57cec5SDimitry Andric     DupMCID = &TII->get(AArch64::DUPv4i32lane);
3730b57cec5SDimitry Andric     MulMCID = &TII->get(AArch64::FMULXv4f32);
3740b57cec5SDimitry Andric     break;
3750b57cec5SDimitry Andric   case AArch64::FMULv4i32_indexed:
3760b57cec5SDimitry Andric     DupMCID = &TII->get(AArch64::DUPv4i32lane);
3770b57cec5SDimitry Andric     MulMCID = &TII->get(AArch64::FMULv4f32);
3780b57cec5SDimitry Andric     break;
3790b57cec5SDimitry Andric 
3800b57cec5SDimitry Andric   // 2X64 instructions
3810b57cec5SDimitry Andric   case AArch64::FMLAv2i64_indexed:
3820b57cec5SDimitry Andric     DupMCID = &TII->get(AArch64::DUPv2i64lane);
3830b57cec5SDimitry Andric     MulMCID = &TII->get(AArch64::FMLAv2f64);
3840b57cec5SDimitry Andric     break;
3850b57cec5SDimitry Andric   case AArch64::FMLSv2i64_indexed:
3860b57cec5SDimitry Andric     DupMCID = &TII->get(AArch64::DUPv2i64lane);
3870b57cec5SDimitry Andric     MulMCID = &TII->get(AArch64::FMLSv2f64);
3880b57cec5SDimitry Andric     break;
3890b57cec5SDimitry Andric   case AArch64::FMULXv2i64_indexed:
3900b57cec5SDimitry Andric     DupMCID = &TII->get(AArch64::DUPv2i64lane);
3910b57cec5SDimitry Andric     MulMCID = &TII->get(AArch64::FMULXv2f64);
3920b57cec5SDimitry Andric     break;
3930b57cec5SDimitry Andric   case AArch64::FMULv2i64_indexed:
3940b57cec5SDimitry Andric     DupMCID = &TII->get(AArch64::DUPv2i64lane);
3950b57cec5SDimitry Andric     MulMCID = &TII->get(AArch64::FMULv2f64);
3960b57cec5SDimitry Andric     break;
3970b57cec5SDimitry Andric 
3980b57cec5SDimitry Andric   // 2X32 instructions
3990b57cec5SDimitry Andric   case AArch64::FMLAv2i32_indexed:
4000b57cec5SDimitry Andric     RC = &AArch64::FPR64RegClass;
4010b57cec5SDimitry Andric     DupMCID = &TII->get(AArch64::DUPv2i32lane);
4020b57cec5SDimitry Andric     MulMCID = &TII->get(AArch64::FMLAv2f32);
4030b57cec5SDimitry Andric     break;
4040b57cec5SDimitry Andric   case AArch64::FMLSv2i32_indexed:
4050b57cec5SDimitry Andric     RC = &AArch64::FPR64RegClass;
4060b57cec5SDimitry Andric     DupMCID = &TII->get(AArch64::DUPv2i32lane);
4070b57cec5SDimitry Andric     MulMCID = &TII->get(AArch64::FMLSv2f32);
4080b57cec5SDimitry Andric     break;
4090b57cec5SDimitry Andric   case AArch64::FMULXv2i32_indexed:
4100b57cec5SDimitry Andric     RC = &AArch64::FPR64RegClass;
4110b57cec5SDimitry Andric     DupMCID = &TII->get(AArch64::DUPv2i32lane);
4120b57cec5SDimitry Andric     MulMCID = &TII->get(AArch64::FMULXv2f32);
4130b57cec5SDimitry Andric     break;
4140b57cec5SDimitry Andric   case AArch64::FMULv2i32_indexed:
4150b57cec5SDimitry Andric     RC = &AArch64::FPR64RegClass;
4160b57cec5SDimitry Andric     DupMCID = &TII->get(AArch64::DUPv2i32lane);
4170b57cec5SDimitry Andric     MulMCID = &TII->get(AArch64::FMULv2f32);
4180b57cec5SDimitry Andric     break;
4190b57cec5SDimitry Andric   }
4200b57cec5SDimitry Andric 
4210b57cec5SDimitry Andric   SmallVector<const MCInstrDesc*, 2> ReplInstrMCID;
4220b57cec5SDimitry Andric   ReplInstrMCID.push_back(DupMCID);
4230b57cec5SDimitry Andric   ReplInstrMCID.push_back(MulMCID);
4240b57cec5SDimitry Andric   if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
4250b57cec5SDimitry Andric                          ReplInstrMCID))
4260b57cec5SDimitry Andric     return false;
4270b57cec5SDimitry Andric 
4280b57cec5SDimitry Andric   const DebugLoc &DL = MI.getDebugLoc();
4290b57cec5SDimitry Andric   MachineBasicBlock &MBB = *MI.getParent();
4300b57cec5SDimitry Andric   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4310b57cec5SDimitry Andric 
4320b57cec5SDimitry Andric   // Get the operands of the current SIMD arithmetic instruction.
4338bcb0991SDimitry Andric   Register MulDest = MI.getOperand(0).getReg();
4348bcb0991SDimitry Andric   Register SrcReg0 = MI.getOperand(1).getReg();
4350b57cec5SDimitry Andric   unsigned Src0IsKill = getKillRegState(MI.getOperand(1).isKill());
4368bcb0991SDimitry Andric   Register SrcReg1 = MI.getOperand(2).getReg();
4370b57cec5SDimitry Andric   unsigned Src1IsKill = getKillRegState(MI.getOperand(2).isKill());
4380b57cec5SDimitry Andric   unsigned DupDest;
4390b57cec5SDimitry Andric 
4400b57cec5SDimitry Andric   // Instructions of interest have either 4 or 5 operands.
4410b57cec5SDimitry Andric   if (MI.getNumOperands() == 5) {
4428bcb0991SDimitry Andric     Register SrcReg2 = MI.getOperand(3).getReg();
4430b57cec5SDimitry Andric     unsigned Src2IsKill = getKillRegState(MI.getOperand(3).isKill());
4440b57cec5SDimitry Andric     unsigned LaneNumber = MI.getOperand(4).getImm();
4450b57cec5SDimitry Andric     // Create a new DUP instruction. Note that if an equivalent DUP instruction
4460b57cec5SDimitry Andric     // has already been created before, then use that one instead of creating
4470b57cec5SDimitry Andric     // a new one.
4480b57cec5SDimitry Andric     if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg2, LaneNumber, &DupDest)) {
4490b57cec5SDimitry Andric       DupDest = MRI.createVirtualRegister(RC);
4500b57cec5SDimitry Andric       BuildMI(MBB, MI, DL, *DupMCID, DupDest)
4510b57cec5SDimitry Andric           .addReg(SrcReg2, Src2IsKill)
4520b57cec5SDimitry Andric           .addImm(LaneNumber);
4530b57cec5SDimitry Andric     }
4540b57cec5SDimitry Andric     BuildMI(MBB, MI, DL, *MulMCID, MulDest)
4550b57cec5SDimitry Andric         .addReg(SrcReg0, Src0IsKill)
4560b57cec5SDimitry Andric         .addReg(SrcReg1, Src1IsKill)
4570b57cec5SDimitry Andric         .addReg(DupDest, Src2IsKill);
4580b57cec5SDimitry Andric   } else if (MI.getNumOperands() == 4) {
4590b57cec5SDimitry Andric     unsigned LaneNumber = MI.getOperand(3).getImm();
4600b57cec5SDimitry Andric     if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) {
4610b57cec5SDimitry Andric       DupDest = MRI.createVirtualRegister(RC);
4620b57cec5SDimitry Andric       BuildMI(MBB, MI, DL, *DupMCID, DupDest)
4630b57cec5SDimitry Andric           .addReg(SrcReg1, Src1IsKill)
4640b57cec5SDimitry Andric           .addImm(LaneNumber);
4650b57cec5SDimitry Andric     }
4660b57cec5SDimitry Andric     BuildMI(MBB, MI, DL, *MulMCID, MulDest)
4670b57cec5SDimitry Andric         .addReg(SrcReg0, Src0IsKill)
4680b57cec5SDimitry Andric         .addReg(DupDest, Src1IsKill);
4690b57cec5SDimitry Andric   } else {
4700b57cec5SDimitry Andric     return false;
4710b57cec5SDimitry Andric   }
4720b57cec5SDimitry Andric 
4730b57cec5SDimitry Andric   ++NumModifiedInstr;
4740b57cec5SDimitry Andric   return true;
4750b57cec5SDimitry Andric }
4760b57cec5SDimitry Andric 
4770b57cec5SDimitry Andric /// Load/Store Interleaving instructions are not always beneficial.
4780b57cec5SDimitry Andric /// Replace them by ZIP instructions and classical load/store.
4790b57cec5SDimitry Andric ///
4800b57cec5SDimitry Andric /// For example:
4810b57cec5SDimitry Andric ///    st2 {v0.4s, v1.4s}, addr
4820b57cec5SDimitry Andric ///
4830b57cec5SDimitry Andric /// Is rewritten into:
4840b57cec5SDimitry Andric ///    zip1 v2.4s, v0.4s, v1.4s
4850b57cec5SDimitry Andric ///    zip2 v3.4s, v0.4s, v1.4s
4860b57cec5SDimitry Andric ///    stp  q2, q3, addr
4870b57cec5SDimitry Andric //
4880b57cec5SDimitry Andric /// For example:
4890b57cec5SDimitry Andric ///    st4 {v0.4s, v1.4s, v2.4s, v3.4s}, addr
4900b57cec5SDimitry Andric ///
4910b57cec5SDimitry Andric /// Is rewritten into:
4920b57cec5SDimitry Andric ///    zip1 v4.4s, v0.4s, v2.4s
4930b57cec5SDimitry Andric ///    zip2 v5.4s, v0.4s, v2.4s
4940b57cec5SDimitry Andric ///    zip1 v6.4s, v1.4s, v3.4s
4950b57cec5SDimitry Andric ///    zip2 v7.4s, v1.4s, v3.4s
4960b57cec5SDimitry Andric ///    zip1 v8.4s, v4.4s, v6.4s
4970b57cec5SDimitry Andric ///    zip2 v9.4s, v4.4s, v6.4s
4980b57cec5SDimitry Andric ///    zip1 v10.4s, v5.4s, v7.4s
4990b57cec5SDimitry Andric ///    zip2 v11.4s, v5.4s, v7.4s
5000b57cec5SDimitry Andric ///    stp  q8, q9, addr
5010b57cec5SDimitry Andric ///    stp  q10, q11, addr+32
5020b57cec5SDimitry Andric ///
5030b57cec5SDimitry Andric /// Currently only instructions related to ST2 and ST4 are considered.
5040b57cec5SDimitry Andric /// Other may be added later.
5050b57cec5SDimitry Andric /// Return true if the SIMD instruction is modified.
optimizeLdStInterleave(MachineInstr & MI)5060b57cec5SDimitry Andric bool AArch64SIMDInstrOpt::optimizeLdStInterleave(MachineInstr &MI) {
5070b57cec5SDimitry Andric 
5080b57cec5SDimitry Andric   unsigned SeqReg, AddrReg;
5090b57cec5SDimitry Andric   unsigned StReg[4], StRegKill[4];
5100b57cec5SDimitry Andric   MachineInstr *DefiningMI;
5110b57cec5SDimitry Andric   const DebugLoc &DL = MI.getDebugLoc();
5120b57cec5SDimitry Andric   MachineBasicBlock &MBB = *MI.getParent();
5130b57cec5SDimitry Andric   SmallVector<unsigned, MaxNumRepl> ZipDest;
5140b57cec5SDimitry Andric   SmallVector<const MCInstrDesc*, MaxNumRepl> ReplInstrMCID;
5150b57cec5SDimitry Andric 
5160b57cec5SDimitry Andric   // If current instruction matches any of the rewriting rules, then
5170b57cec5SDimitry Andric   // gather information about parameters of the new instructions.
5180b57cec5SDimitry Andric   bool Match = false;
5190b57cec5SDimitry Andric   for (auto &I : IRT) {
5200b57cec5SDimitry Andric     if (MI.getOpcode() == I.OrigOpc) {
5210b57cec5SDimitry Andric       SeqReg  = MI.getOperand(0).getReg();
5220b57cec5SDimitry Andric       AddrReg = MI.getOperand(1).getReg();
5230b57cec5SDimitry Andric       DefiningMI = MRI->getUniqueVRegDef(SeqReg);
5240b57cec5SDimitry Andric       unsigned NumReg = determineSrcReg(MI);
5250b57cec5SDimitry Andric       if (!processSeqRegInst(DefiningMI, StReg, StRegKill, NumReg))
5260b57cec5SDimitry Andric         return false;
5270b57cec5SDimitry Andric 
5280b57cec5SDimitry Andric       for (auto &Repl : I.ReplOpc) {
5290b57cec5SDimitry Andric         ReplInstrMCID.push_back(&TII->get(Repl));
5300b57cec5SDimitry Andric         // Generate destination registers but only for non-store instruction.
5310b57cec5SDimitry Andric         if (Repl != AArch64::STPQi && Repl != AArch64::STPDi)
5320b57cec5SDimitry Andric           ZipDest.push_back(MRI->createVirtualRegister(&I.RC));
5330b57cec5SDimitry Andric       }
5340b57cec5SDimitry Andric       Match = true;
5350b57cec5SDimitry Andric       break;
5360b57cec5SDimitry Andric     }
5370b57cec5SDimitry Andric   }
5380b57cec5SDimitry Andric 
5390b57cec5SDimitry Andric   if (!Match)
5400b57cec5SDimitry Andric     return false;
5410b57cec5SDimitry Andric 
5420b57cec5SDimitry Andric   // Determine if it is profitable to replace MI by the series of instructions
5430b57cec5SDimitry Andric   // represented in ReplInstrMCID.
5440b57cec5SDimitry Andric   if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
5450b57cec5SDimitry Andric                          ReplInstrMCID))
5460b57cec5SDimitry Andric     return false;
5470b57cec5SDimitry Andric 
5480b57cec5SDimitry Andric   // Generate the replacement instructions composed of ZIP1, ZIP2, and STP (at
5490b57cec5SDimitry Andric   // this point, the code generation is hardcoded and does not rely on the IRT
5500b57cec5SDimitry Andric   // table used above given that code generation for ST2 replacement is somewhat
5510b57cec5SDimitry Andric   // different than for ST4 replacement. We could have added more info into the
5520b57cec5SDimitry Andric   // table related to how we build new instructions but we may be adding more
5530b57cec5SDimitry Andric   // complexity with that).
5540b57cec5SDimitry Andric   switch (MI.getOpcode()) {
5550b57cec5SDimitry Andric   default:
5560b57cec5SDimitry Andric     return false;
5570b57cec5SDimitry Andric 
5580b57cec5SDimitry Andric   case AArch64::ST2Twov16b:
5590b57cec5SDimitry Andric   case AArch64::ST2Twov8b:
5600b57cec5SDimitry Andric   case AArch64::ST2Twov8h:
5610b57cec5SDimitry Andric   case AArch64::ST2Twov4h:
5620b57cec5SDimitry Andric   case AArch64::ST2Twov4s:
5630b57cec5SDimitry Andric   case AArch64::ST2Twov2s:
5640b57cec5SDimitry Andric   case AArch64::ST2Twov2d:
5650b57cec5SDimitry Andric     // ZIP instructions
5660b57cec5SDimitry Andric     BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
5670b57cec5SDimitry Andric         .addReg(StReg[0])
5680b57cec5SDimitry Andric         .addReg(StReg[1]);
5690b57cec5SDimitry Andric     BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
5700b57cec5SDimitry Andric         .addReg(StReg[0], StRegKill[0])
5710b57cec5SDimitry Andric         .addReg(StReg[1], StRegKill[1]);
5720b57cec5SDimitry Andric     // STP instructions
5730b57cec5SDimitry Andric     BuildMI(MBB, MI, DL, *ReplInstrMCID[2])
5740b57cec5SDimitry Andric         .addReg(ZipDest[0])
5750b57cec5SDimitry Andric         .addReg(ZipDest[1])
5760b57cec5SDimitry Andric         .addReg(AddrReg)
5770b57cec5SDimitry Andric         .addImm(0);
5780b57cec5SDimitry Andric     break;
5790b57cec5SDimitry Andric 
5800b57cec5SDimitry Andric   case AArch64::ST4Fourv16b:
5810b57cec5SDimitry Andric   case AArch64::ST4Fourv8b:
5820b57cec5SDimitry Andric   case AArch64::ST4Fourv8h:
5830b57cec5SDimitry Andric   case AArch64::ST4Fourv4h:
5840b57cec5SDimitry Andric   case AArch64::ST4Fourv4s:
5850b57cec5SDimitry Andric   case AArch64::ST4Fourv2s:
5860b57cec5SDimitry Andric   case AArch64::ST4Fourv2d:
5870b57cec5SDimitry Andric     // ZIP instructions
5880b57cec5SDimitry Andric     BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
5890b57cec5SDimitry Andric         .addReg(StReg[0])
5900b57cec5SDimitry Andric         .addReg(StReg[2]);
5910b57cec5SDimitry Andric     BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
5920b57cec5SDimitry Andric         .addReg(StReg[0], StRegKill[0])
5930b57cec5SDimitry Andric         .addReg(StReg[2], StRegKill[2]);
5940b57cec5SDimitry Andric     BuildMI(MBB, MI, DL, *ReplInstrMCID[2], ZipDest[2])
5950b57cec5SDimitry Andric         .addReg(StReg[1])
5960b57cec5SDimitry Andric         .addReg(StReg[3]);
5970b57cec5SDimitry Andric     BuildMI(MBB, MI, DL, *ReplInstrMCID[3], ZipDest[3])
5980b57cec5SDimitry Andric         .addReg(StReg[1], StRegKill[1])
5990b57cec5SDimitry Andric         .addReg(StReg[3], StRegKill[3]);
6000b57cec5SDimitry Andric     BuildMI(MBB, MI, DL, *ReplInstrMCID[4], ZipDest[4])
6010b57cec5SDimitry Andric         .addReg(ZipDest[0])
6020b57cec5SDimitry Andric         .addReg(ZipDest[2]);
6030b57cec5SDimitry Andric     BuildMI(MBB, MI, DL, *ReplInstrMCID[5], ZipDest[5])
6040b57cec5SDimitry Andric         .addReg(ZipDest[0])
6050b57cec5SDimitry Andric         .addReg(ZipDest[2]);
6060b57cec5SDimitry Andric     BuildMI(MBB, MI, DL, *ReplInstrMCID[6], ZipDest[6])
6070b57cec5SDimitry Andric         .addReg(ZipDest[1])
6080b57cec5SDimitry Andric         .addReg(ZipDest[3]);
6090b57cec5SDimitry Andric     BuildMI(MBB, MI, DL, *ReplInstrMCID[7], ZipDest[7])
6100b57cec5SDimitry Andric         .addReg(ZipDest[1])
6110b57cec5SDimitry Andric         .addReg(ZipDest[3]);
6120b57cec5SDimitry Andric     // stp instructions
6130b57cec5SDimitry Andric     BuildMI(MBB, MI, DL, *ReplInstrMCID[8])
6140b57cec5SDimitry Andric         .addReg(ZipDest[4])
6150b57cec5SDimitry Andric         .addReg(ZipDest[5])
6160b57cec5SDimitry Andric         .addReg(AddrReg)
6170b57cec5SDimitry Andric         .addImm(0);
6180b57cec5SDimitry Andric     BuildMI(MBB, MI, DL, *ReplInstrMCID[9])
6190b57cec5SDimitry Andric         .addReg(ZipDest[6])
6200b57cec5SDimitry Andric         .addReg(ZipDest[7])
6210b57cec5SDimitry Andric         .addReg(AddrReg)
6220b57cec5SDimitry Andric         .addImm(2);
6230b57cec5SDimitry Andric     break;
6240b57cec5SDimitry Andric   }
6250b57cec5SDimitry Andric 
6260b57cec5SDimitry Andric   ++NumModifiedInstr;
6270b57cec5SDimitry Andric   return true;
6280b57cec5SDimitry Andric }
6290b57cec5SDimitry Andric 
6300b57cec5SDimitry Andric /// Process The REG_SEQUENCE instruction, and extract the source
6310b57cec5SDimitry Andric /// operands of the ST2/4 instruction from it.
6320b57cec5SDimitry Andric /// Example of such instruction.
6330b57cec5SDimitry Andric ///    %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1;
6340b57cec5SDimitry Andric /// Return true when the instruction is processed successfully.
processSeqRegInst(MachineInstr * DefiningMI,unsigned * StReg,unsigned * StRegKill,unsigned NumArg) const6350b57cec5SDimitry Andric bool AArch64SIMDInstrOpt::processSeqRegInst(MachineInstr *DefiningMI,
6360b57cec5SDimitry Andric      unsigned* StReg, unsigned* StRegKill, unsigned NumArg) const {
63704eeddc0SDimitry Andric   assert(DefiningMI != nullptr);
6380b57cec5SDimitry Andric   if (DefiningMI->getOpcode() != AArch64::REG_SEQUENCE)
6390b57cec5SDimitry Andric     return false;
6400b57cec5SDimitry Andric 
6410b57cec5SDimitry Andric   for (unsigned i=0; i<NumArg; i++) {
6420b57cec5SDimitry Andric     StReg[i]     = DefiningMI->getOperand(2*i+1).getReg();
6430b57cec5SDimitry Andric     StRegKill[i] = getKillRegState(DefiningMI->getOperand(2*i+1).isKill());
6440b57cec5SDimitry Andric 
645349cc55cSDimitry Andric     // Validation check for the other arguments.
6460b57cec5SDimitry Andric     if (DefiningMI->getOperand(2*i+2).isImm()) {
6470b57cec5SDimitry Andric       switch (DefiningMI->getOperand(2*i+2).getImm()) {
6480b57cec5SDimitry Andric       default:
6490b57cec5SDimitry Andric         return false;
6500b57cec5SDimitry Andric 
6510b57cec5SDimitry Andric       case AArch64::dsub0:
6520b57cec5SDimitry Andric       case AArch64::dsub1:
6530b57cec5SDimitry Andric       case AArch64::dsub2:
6540b57cec5SDimitry Andric       case AArch64::dsub3:
6550b57cec5SDimitry Andric       case AArch64::qsub0:
6560b57cec5SDimitry Andric       case AArch64::qsub1:
6570b57cec5SDimitry Andric       case AArch64::qsub2:
6580b57cec5SDimitry Andric       case AArch64::qsub3:
6590b57cec5SDimitry Andric         break;
6600b57cec5SDimitry Andric       }
6610b57cec5SDimitry Andric     }
6620b57cec5SDimitry Andric     else
6630b57cec5SDimitry Andric       return false;
6640b57cec5SDimitry Andric   }
6650b57cec5SDimitry Andric   return true;
6660b57cec5SDimitry Andric }
6670b57cec5SDimitry Andric 
6680b57cec5SDimitry Andric /// Return the number of useful source registers for this instruction
6690b57cec5SDimitry Andric /// (2 for ST2 and 4 for ST4).
determineSrcReg(MachineInstr & MI) const6700b57cec5SDimitry Andric unsigned AArch64SIMDInstrOpt::determineSrcReg(MachineInstr &MI) const {
6710b57cec5SDimitry Andric   switch (MI.getOpcode()) {
6720b57cec5SDimitry Andric   default:
6730b57cec5SDimitry Andric     llvm_unreachable("Unsupported instruction for this pass");
6740b57cec5SDimitry Andric 
6750b57cec5SDimitry Andric   case AArch64::ST2Twov16b:
6760b57cec5SDimitry Andric   case AArch64::ST2Twov8b:
6770b57cec5SDimitry Andric   case AArch64::ST2Twov8h:
6780b57cec5SDimitry Andric   case AArch64::ST2Twov4h:
6790b57cec5SDimitry Andric   case AArch64::ST2Twov4s:
6800b57cec5SDimitry Andric   case AArch64::ST2Twov2s:
6810b57cec5SDimitry Andric   case AArch64::ST2Twov2d:
6820b57cec5SDimitry Andric     return 2;
6830b57cec5SDimitry Andric 
6840b57cec5SDimitry Andric   case AArch64::ST4Fourv16b:
6850b57cec5SDimitry Andric   case AArch64::ST4Fourv8b:
6860b57cec5SDimitry Andric   case AArch64::ST4Fourv8h:
6870b57cec5SDimitry Andric   case AArch64::ST4Fourv4h:
6880b57cec5SDimitry Andric   case AArch64::ST4Fourv4s:
6890b57cec5SDimitry Andric   case AArch64::ST4Fourv2s:
6900b57cec5SDimitry Andric   case AArch64::ST4Fourv2d:
6910b57cec5SDimitry Andric     return 4;
6920b57cec5SDimitry Andric   }
6930b57cec5SDimitry Andric }
6940b57cec5SDimitry Andric 
runOnMachineFunction(MachineFunction & MF)6950b57cec5SDimitry Andric bool AArch64SIMDInstrOpt::runOnMachineFunction(MachineFunction &MF) {
6960b57cec5SDimitry Andric   if (skipFunction(MF.getFunction()))
6970b57cec5SDimitry Andric     return false;
6980b57cec5SDimitry Andric 
6990b57cec5SDimitry Andric   TII = MF.getSubtarget().getInstrInfo();
7000b57cec5SDimitry Andric   MRI = &MF.getRegInfo();
7010b57cec5SDimitry Andric   const TargetSubtargetInfo &ST = MF.getSubtarget();
7020b57cec5SDimitry Andric   const AArch64InstrInfo *AAII =
7030b57cec5SDimitry Andric       static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
7040b57cec5SDimitry Andric   if (!AAII)
7050b57cec5SDimitry Andric     return false;
7060b57cec5SDimitry Andric   SchedModel.init(&ST);
7070b57cec5SDimitry Andric   if (!SchedModel.hasInstrSchedModel())
7080b57cec5SDimitry Andric     return false;
7090b57cec5SDimitry Andric 
7100b57cec5SDimitry Andric   bool Changed = false;
7110b57cec5SDimitry Andric   for (auto OptimizationKind : {VectorElem, Interleave}) {
7120b57cec5SDimitry Andric     if (!shouldExitEarly(&MF, OptimizationKind)) {
7130b57cec5SDimitry Andric       SmallVector<MachineInstr *, 8> RemoveMIs;
7140b57cec5SDimitry Andric       for (MachineBasicBlock &MBB : MF) {
715349cc55cSDimitry Andric         for (MachineInstr &MI : MBB) {
7160b57cec5SDimitry Andric           bool InstRewrite;
7170b57cec5SDimitry Andric           if (OptimizationKind == VectorElem)
7180b57cec5SDimitry Andric             InstRewrite = optimizeVectElement(MI) ;
7190b57cec5SDimitry Andric           else
7200b57cec5SDimitry Andric             InstRewrite = optimizeLdStInterleave(MI);
7210b57cec5SDimitry Andric           if (InstRewrite) {
7220b57cec5SDimitry Andric             // Add MI to the list of instructions to be removed given that it
7230b57cec5SDimitry Andric             // has been replaced.
7240b57cec5SDimitry Andric             RemoveMIs.push_back(&MI);
7250b57cec5SDimitry Andric             Changed = true;
7260b57cec5SDimitry Andric           }
7270b57cec5SDimitry Andric         }
7280b57cec5SDimitry Andric       }
7290b57cec5SDimitry Andric       for (MachineInstr *MI : RemoveMIs)
7300b57cec5SDimitry Andric         MI->eraseFromParent();
7310b57cec5SDimitry Andric     }
7320b57cec5SDimitry Andric   }
7330b57cec5SDimitry Andric 
7340b57cec5SDimitry Andric   return Changed;
7350b57cec5SDimitry Andric }
7360b57cec5SDimitry Andric 
7370b57cec5SDimitry Andric /// Returns an instance of the high cost ASIMD instruction replacement
7380b57cec5SDimitry Andric /// optimization pass.
createAArch64SIMDInstrOptPass()7390b57cec5SDimitry Andric FunctionPass *llvm::createAArch64SIMDInstrOptPass() {
7400b57cec5SDimitry Andric   return new AArch64SIMDInstrOpt();
7410b57cec5SDimitry Andric }
742