//===- ARMLatencyMutations.cpp - ARM Latency Mutations --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This file contains the ARM-specific DAG scheduling mutations which
/// change inter-instruction latencies.
//
//===----------------------------------------------------------------------===//

#include "ARMLatencyMutations.h"
#include "ARMSubtarget.h"
#include "Thumb2InstrInfo.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/ScheduleDAGMutation.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include <algorithm>
#include <array>
#include <initializer_list>
#include <memory>

namespace llvm {

namespace {

// Precompute information about opcodes to speed up the pass.

class InstructionInformation {
protected:
  struct IInfo {
    bool HasBRegAddr : 1;      // B-side of addr gen is a register
    bool HasBRegAddrShift : 1; // B-side of addr gen has a shift
    bool IsDivide : 1;         // Some form of integer divide
    bool IsInlineShiftALU : 1; // Inline shift+ALU
    bool IsMultiply : 1;       // Some form of integer multiply
    bool IsMVEIntMAC : 1;      // MVE 8/16/32-bit integer MAC operation
    bool IsNonSubwordLoad : 1; // Load which is a word or larger
    bool IsShift : 1;          // Shift operation
    bool IsRev : 1;            // REV operation
    bool ProducesQP : 1;       // Produces a vector register result
    bool ProducesDP : 1;       // Produces a double-precision register result
    bool ProducesSP : 1;       // Produces a single-precision register result
    bool ConsumesQP : 1;       // Consumes a vector register result
    bool ConsumesDP : 1;       // Consumes a double-precision register result
    bool ConsumesSP : 1;       // Consumes a single-precision register result
    unsigned MVEIntMACMatched; // Matched operand type (for MVE)
    unsigned AddressOpMask;    // Mask indicating which operands go into AGU
    IInfo()
        : HasBRegAddr(false), HasBRegAddrShift(false), IsDivide(false),
          IsInlineShiftALU(false), IsMultiply(false), IsMVEIntMAC(false),
          IsNonSubwordLoad(false), IsShift(false), IsRev(false),
          ProducesQP(false), ProducesDP(false), ProducesSP(false),
          ConsumesQP(false), ConsumesDP(false), ConsumesSP(false),
          MVEIntMACMatched(0), AddressOpMask(0) {}
  };
  typedef std::array<IInfo, ARM::INSTRUCTION_LIST_END> IInfoArray;
  IInfoArray Info;

public:
  // Always available information
  unsigned getAddressOpMask(unsigned Op) { return Info[Op].AddressOpMask; }
  bool hasBRegAddr(unsigned Op) { return Info[Op].HasBRegAddr; }
  bool hasBRegAddrShift(unsigned Op) { return Info[Op].HasBRegAddrShift; }
  bool isDivide(unsigned Op) { return Info[Op].IsDivide; }
  bool isInlineShiftALU(unsigned Op) { return Info[Op].IsInlineShiftALU; }
  bool isMultiply(unsigned Op) { return Info[Op].IsMultiply; }
  bool isMVEIntMAC(unsigned Op) { return Info[Op].IsMVEIntMAC; }
  bool isNonSubwordLoad(unsigned Op) { return Info[Op].IsNonSubwordLoad; }
  bool isRev(unsigned Op) { return Info[Op].IsRev; }
  bool isShift(unsigned Op) { return Info[Op].IsShift; }

  // Information available only if markDPProducersConsumers is called.
  bool producesQP(unsigned Op) { return Info[Op].ProducesQP; }
  bool producesDP(unsigned Op) { return Info[Op].ProducesDP; }
  bool producesSP(unsigned Op) { return Info[Op].ProducesSP; }
  bool consumesQP(unsigned Op) { return Info[Op].ConsumesQP; }
  bool consumesDP(unsigned Op) { return Info[Op].ConsumesDP; }
  bool consumesSP(unsigned Op) { return Info[Op].ConsumesSP; }

  bool isMVEIntMACMatched(unsigned SrcOp, unsigned DstOp) {
    return SrcOp == DstOp || Info[DstOp].MVEIntMACMatched == SrcOp;
  }

  InstructionInformation(const ARMBaseInstrInfo *TII);

protected:
  void markDPProducersConsumers(const ARMBaseInstrInfo *TII);
};

InstructionInformation::InstructionInformation(const ARMBaseInstrInfo *TII) {
  using namespace ARM;

  std::initializer_list<unsigned> hasBRegAddrList = {
      t2LDRs, t2LDRBs, t2LDRHs, t2STRs, t2STRBs, t2STRHs,
      tLDRr,  tLDRBr,  tLDRHr,  tSTRr,  tSTRBr,  tSTRHr,
  };
  for (auto op : hasBRegAddrList) {
    Info[op].HasBRegAddr = true;
  }

  std::initializer_list<unsigned> hasBRegAddrShiftList = {
      t2LDRs, t2LDRBs, t2LDRHs, t2STRs, t2STRBs, t2STRHs,
  };
  for (auto op : hasBRegAddrShiftList) {
    Info[op].HasBRegAddrShift = true;
  }

  Info[t2SDIV].IsDivide = Info[t2UDIV].IsDivide = true;

  std::initializer_list<unsigned> isInlineShiftALUList = {
      t2ADCrs,  t2ADDSrs, t2ADDrs,  t2BICrs, t2EORrs,
      t2ORNrs,  t2RSBSrs, t2RSBrs,  t2SBCrs, t2SUBrs,
      t2SUBSrs, t2CMPrs,  t2CMNzrs, t2TEQrs, t2TSTrs,
  };
  for (auto op : isInlineShiftALUList) {
    Info[op].IsInlineShiftALU = true;
  }

  std::initializer_list<unsigned> isMultiplyList = {
      t2MUL,    t2MLA,     t2MLS,     t2SMLABB, t2SMLABT,  t2SMLAD,   t2SMLADX,
      t2SMLAL,  t2SMLALBB, t2SMLALBT, t2SMLALD, t2SMLALDX, t2SMLALTB, t2SMLALTT,
      t2SMLATB, t2SMLATT,  t2SMLAWT,  t2SMLSD,  t2SMLSDX,  t2SMLSLD,  t2SMLSLDX,
      t2SMMLA,  t2SMMLAR,  t2SMMLS,   t2SMMLSR, t2SMMUL,   t2SMMULR,  t2SMUAD,
      t2SMUADX, t2SMULBB,  t2SMULBT,  t2SMULL,  t2SMULTB,  t2SMULTT,  t2SMULWT,
      t2SMUSD,  t2SMUSDX,  t2UMAAL,   t2UMLAL,  t2UMULL,   tMUL,
  };
  for (auto op : isMultiplyList) {
    Info[op].IsMultiply = true;
  }

  std::initializer_list<unsigned> isMVEIntMACList = {
      MVE_VMLAS_qr_i16,    MVE_VMLAS_qr_i32,    MVE_VMLAS_qr_i8,
      MVE_VMLA_qr_i16,     MVE_VMLA_qr_i32,     MVE_VMLA_qr_i8,
      MVE_VQDMLAH_qrs16,   MVE_VQDMLAH_qrs32,   MVE_VQDMLAH_qrs8,
      MVE_VQDMLASH_qrs16,  MVE_VQDMLASH_qrs32,  MVE_VQDMLASH_qrs8,
      MVE_VQRDMLAH_qrs16,  MVE_VQRDMLAH_qrs32,  MVE_VQRDMLAH_qrs8,
      MVE_VQRDMLASH_qrs16, MVE_VQRDMLASH_qrs32, MVE_VQRDMLASH_qrs8,
      MVE_VQDMLADHXs16,    MVE_VQDMLADHXs32,    MVE_VQDMLADHXs8,
      MVE_VQDMLADHs16,     MVE_VQDMLADHs32,     MVE_VQDMLADHs8,
      MVE_VQDMLSDHXs16,    MVE_VQDMLSDHXs32,    MVE_VQDMLSDHXs8,
      MVE_VQDMLSDHs16,     MVE_VQDMLSDHs32,     MVE_VQDMLSDHs8,
      MVE_VQRDMLADHXs16,   MVE_VQRDMLADHXs32,   MVE_VQRDMLADHXs8,
      MVE_VQRDMLADHs16,    MVE_VQRDMLADHs32,    MVE_VQRDMLADHs8,
      MVE_VQRDMLSDHXs16,   MVE_VQRDMLSDHXs32,   MVE_VQRDMLSDHXs8,
      MVE_VQRDMLSDHs16,    MVE_VQRDMLSDHs32,    MVE_VQRDMLSDHs8,
  };
  for (auto op : isMVEIntMACList) {
    Info[op].IsMVEIntMAC = true;
  }

  std::initializer_list<unsigned> isNonSubwordLoadList = {
      t2LDRi12, t2LDRi8,  t2LDR_POST,  t2LDR_PRE,  t2LDRpci,
      t2LDRs,   t2LDRDi8, t2LDRD_POST, t2LDRD_PRE, tLDRi,
      tLDRpci,  tLDRr,    tLDRspi,
  };
  for (auto op : isNonSubwordLoadList) {
    Info[op].IsNonSubwordLoad = true;
  }

  std::initializer_list<unsigned> isRevList = {
      t2REV, t2REV16, t2REVSH, t2RBIT, tREV, tREV16, tREVSH,
  };
  for (auto op : isRevList) {
    Info[op].IsRev = true;
  }

  std::initializer_list<unsigned> isShiftList = {
      t2ASRri, t2ASRrr, t2LSLri, t2LSLrr, t2LSRri, t2LSRrr, t2RORri, t2RORrr,
      tASRri,  tASRrr,  tLSLSri, tLSLri,  tLSLrr,  tLSRri,  tLSRrr,  tROR,
  };
  for (auto op : isShiftList) {
    Info[op].IsShift = true;
  }

  std::initializer_list<unsigned> Address1List = {
      t2LDRBi12,
      t2LDRBi8,
      t2LDRBpci,
      t2LDRBs,
      t2LDRHi12,
      t2LDRHi8,
      t2LDRHpci,
      t2LDRHs,
      t2LDRSBi12,
      t2LDRSBi8,
      t2LDRSBpci,
      t2LDRSBs,
      t2LDRSHi12,
      t2LDRSHi8,
      t2LDRSHpci,
      t2LDRSHs,
      t2LDRi12,
      t2LDRi8,
      t2LDRpci,
      t2LDRs,
      tLDRBi,
      tLDRBr,
      tLDRHi,
      tLDRHr,
      tLDRSB,
      tLDRSH,
      tLDRi,
      tLDRpci,
      tLDRr,
      tLDRspi,
      t2STRBi12,
      t2STRBi8,
      t2STRBs,
      t2STRHi12,
      t2STRHi8,
      t2STRHs,
      t2STRi12,
      t2STRi8,
      t2STRs,
      tSTRBi,
      tSTRBr,
      tSTRHi,
      tSTRHr,
      tSTRi,
      tSTRr,
      tSTRspi,
      VLDRD,
      VLDRH,
      VLDRS,
      VSTRD,
      VSTRH,
      VSTRS,
      MVE_VLD20_16,
      MVE_VLD20_32,
      MVE_VLD20_8,
      MVE_VLD21_16,
      MVE_VLD21_32,
      MVE_VLD21_8,
      MVE_VLD40_16,
      MVE_VLD40_32,
      MVE_VLD40_8,
      MVE_VLD41_16,
      MVE_VLD41_32,
      MVE_VLD41_8,
      MVE_VLD42_16,
      MVE_VLD42_32,
      MVE_VLD42_8,
      MVE_VLD43_16,
      MVE_VLD43_32,
      MVE_VLD43_8,
      MVE_VLDRBS16,
      MVE_VLDRBS16_rq,
      MVE_VLDRBS32,
      MVE_VLDRBS32_rq,
      MVE_VLDRBU16,
      MVE_VLDRBU16_rq,
      MVE_VLDRBU32,
      MVE_VLDRBU32_rq,
      MVE_VLDRBU8,
      MVE_VLDRBU8_rq,
      MVE_VLDRDU64_qi,
      MVE_VLDRDU64_rq,
      MVE_VLDRDU64_rq_u,
      MVE_VLDRHS32,
      MVE_VLDRHS32_rq,
      MVE_VLDRHS32_rq_u,
      MVE_VLDRHU16,
      MVE_VLDRHU16_rq,
      MVE_VLDRHU16_rq_u,
      MVE_VLDRHU32,
      MVE_VLDRHU32_rq,
      MVE_VLDRHU32_rq_u,
      MVE_VLDRWU32,
      MVE_VLDRWU32_qi,
      MVE_VLDRWU32_rq,
      MVE_VLDRWU32_rq_u,
      MVE_VST20_16,
      MVE_VST20_32,
      MVE_VST20_8,
      MVE_VST21_16,
      MVE_VST21_32,
      MVE_VST21_8,
      MVE_VST40_16,
      MVE_VST40_32,
      MVE_VST40_8,
      MVE_VST41_16,
      MVE_VST41_32,
      MVE_VST41_8,
      MVE_VST42_16,
      MVE_VST42_32,
      MVE_VST42_8,
      MVE_VST43_16,
      MVE_VST43_32,
      MVE_VST43_8,
      MVE_VSTRB16,
      MVE_VSTRB16_rq,
      MVE_VSTRB32,
      MVE_VSTRB32_rq,
      MVE_VSTRBU8,
      MVE_VSTRB8_rq,
      MVE_VSTRD64_qi,
      MVE_VSTRD64_rq,
      MVE_VSTRD64_rq_u,
      MVE_VSTRH32,
      MVE_VSTRH32_rq,
      MVE_VSTRH32_rq_u,
      MVE_VSTRHU16,
      MVE_VSTRH16_rq,
      MVE_VSTRH16_rq_u,
      MVE_VSTRWU32,
      MVE_VSTRW32_qi,
      MVE_VSTRW32_rq,
      MVE_VSTRW32_rq_u,
  };
  std::initializer_list<unsigned> Address2List = {
      t2LDRB_POST,
      t2LDRB_PRE,
      t2LDRDi8,
      t2LDRH_POST,
      t2LDRH_PRE,
      t2LDRSB_POST,
      t2LDRSB_PRE,
      t2LDRSH_POST,
      t2LDRSH_PRE,
      t2LDR_POST,
      t2LDR_PRE,
      t2STRB_POST,
      t2STRB_PRE,
      t2STRDi8,
      t2STRH_POST,
      t2STRH_PRE,
      t2STR_POST,
      t2STR_PRE,
      MVE_VLD20_16_wb,
      MVE_VLD20_32_wb,
      MVE_VLD20_8_wb,
      MVE_VLD21_16_wb,
      MVE_VLD21_32_wb,
      MVE_VLD21_8_wb,
      MVE_VLD40_16_wb,
      MVE_VLD40_32_wb,
      MVE_VLD40_8_wb,
      MVE_VLD41_16_wb,
      MVE_VLD41_32_wb,
      MVE_VLD41_8_wb,
      MVE_VLD42_16_wb,
      MVE_VLD42_32_wb,
      MVE_VLD42_8_wb,
      MVE_VLD43_16_wb,
      MVE_VLD43_32_wb,
      MVE_VLD43_8_wb,
      MVE_VLDRBS16_post,
      MVE_VLDRBS16_pre,
      MVE_VLDRBS32_post,
      MVE_VLDRBS32_pre,
      MVE_VLDRBU16_post,
      MVE_VLDRBU16_pre,
      MVE_VLDRBU32_post,
      MVE_VLDRBU32_pre,
      MVE_VLDRBU8_post,
      MVE_VLDRBU8_pre,
      MVE_VLDRDU64_qi_pre,
      MVE_VLDRHS32_post,
      MVE_VLDRHS32_pre,
      MVE_VLDRHU16_post,
      MVE_VLDRHU16_pre,
      MVE_VLDRHU32_post,
      MVE_VLDRHU32_pre,
      MVE_VLDRWU32_post,
      MVE_VLDRWU32_pre,
      MVE_VLDRWU32_qi_pre,
      MVE_VST20_16_wb,
      MVE_VST20_32_wb,
      MVE_VST20_8_wb,
      MVE_VST21_16_wb,
      MVE_VST21_32_wb,
      MVE_VST21_8_wb,
      MVE_VST40_16_wb,
      MVE_VST40_32_wb,
      MVE_VST40_8_wb,
      MVE_VST41_16_wb,
      MVE_VST41_32_wb,
      MVE_VST41_8_wb,
      MVE_VST42_16_wb,
      MVE_VST42_32_wb,
      MVE_VST42_8_wb,
      MVE_VST43_16_wb,
      MVE_VST43_32_wb,
      MVE_VST43_8_wb,
      MVE_VSTRB16_post,
      MVE_VSTRB16_pre,
      MVE_VSTRB32_post,
      MVE_VSTRB32_pre,
      MVE_VSTRBU8_post,
      MVE_VSTRBU8_pre,
      MVE_VSTRD64_qi_pre,
      MVE_VSTRH32_post,
      MVE_VSTRH32_pre,
      MVE_VSTRHU16_post,
      MVE_VSTRHU16_pre,
      MVE_VSTRWU32_post,
      MVE_VSTRWU32_pre,
      MVE_VSTRW32_qi_pre,
  };
  std::initializer_list<unsigned> Address3List = {
      t2LDRD_POST,
      t2LDRD_PRE,
      t2STRD_POST,
      t2STRD_PRE,
  };
  // Compute a mask of which operands are involved in address computation
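  // Bit N of a mask marks MachineInstr operand N as feeding address
  // generation: the plain forms address through operands 1-2 (0x6), forms
  // with an extra leading operand (a writeback def or a second data register)
  // through operands 2-3 (0xc), and the dual-register writeback forms through
  // operands 3-4 (0x18). Register-shifted forms additionally mark the
  // shift-amount operand via the 0x8 OR below.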
  for (auto &op : Address1List) {
    Info[op].AddressOpMask = 0x6;
  }
  for (auto &op : Address2List) {
    Info[op].AddressOpMask = 0xc;
  }
  for (auto &op : Address3List) {
    Info[op].AddressOpMask = 0x18;
  }
  for (auto &op : hasBRegAddrShiftList) {
    Info[op].AddressOpMask |= 0x8;
  }
}

void InstructionInformation::markDPProducersConsumers(
    const ARMBaseInstrInfo *TII) {
  // Learn about all instructions which have FP source/dest registers
  for (unsigned MI = 0; MI < ARM::INSTRUCTION_LIST_END; ++MI) {
    const MCInstrDesc &MID = TII->get(MI);
    auto Operands = MID.operands();
    for (unsigned OI = 0, OIE = MID.getNumOperands(); OI != OIE; ++OI) {
      bool MarkQP = false, MarkDP = false, MarkSP = false;
      switch (Operands[OI].RegClass) {
      case ARM::MQPRRegClassID:
      case ARM::DPRRegClassID:
      case ARM::DPR_8RegClassID:
      case ARM::DPR_VFP2RegClassID:
      case ARM::DPairRegClassID:
      case ARM::DPairSpcRegClassID:
      case ARM::DQuadRegClassID:
      case ARM::DQuadSpcRegClassID:
      case ARM::DTripleRegClassID:
      case ARM::DTripleSpcRegClassID:
        MarkDP = true;
        break;
      case ARM::QPRRegClassID:
      case ARM::QPR_8RegClassID:
      case ARM::QPR_VFP2RegClassID:
      case ARM::QQPRRegClassID:
      case ARM::QQQQPRRegClassID:
        MarkQP = true;
        break;
      case ARM::SPRRegClassID:
      case ARM::SPR_8RegClassID:
      case ARM::FPWithVPRRegClassID:
        MarkSP = true;
        break;
      default:
        break;
      }
      if (MarkQP) {
        if (OI < MID.getNumDefs())
          Info[MI].ProducesQP = true;
        else
          Info[MI].ConsumesQP = true;
      }
      if (MarkDP) {
        if (OI < MID.getNumDefs())
          Info[MI].ProducesDP = true;
        else
          Info[MI].ConsumesDP = true;
      }
      if (MarkSP) {
        if (OI < MID.getNumDefs())
          Info[MI].ProducesSP = true;
        else
          Info[MI].ConsumesSP = true;
      }
    }
  }
}

} // anonymous namespace

static bool hasImplicitCPSRUse(const MachineInstr *MI) {
  return MI->getDesc().hasImplicitUseOfPhysReg(ARM::CPSR);
}

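// Each dependence edge is recorded twice in the DAG: as a successor of the
// producing SUnit and as a predecessor of the consuming one. Update the
// latency on both copies and invalidate the cached height/depth values so
// that later queries recompute them.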
void ARMOverrideBypasses::setBidirLatencies(SUnit &SrcSU, SDep &SrcDep,
                                            unsigned latency) {
  SDep Reverse = SrcDep;
  Reverse.setSUnit(&SrcSU);
  for (SDep &PDep : SrcDep.getSUnit()->Preds) {
    if (PDep == Reverse) {
      PDep.setLatency(latency);
      SrcDep.getSUnit()->setDepthDirty();
      break;
    }
  }
  SrcDep.setLatency(latency);
  SrcSU.setHeightDirty();
}

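// ARM condition codes are encoded as complementary pairs differing only in
// the low bit (EQ/NE, HS/LO, GE/LT, ...), so comparing them with bit 0 masked
// off treats a predicate and its inverse as matching; only predicates testing
// unrelated conditions are considered mismatched.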
static bool mismatchedPred(ARMCC::CondCodes a, ARMCC::CondCodes b) {
  return (a & 0xe) != (b & 0xe);
}

// Set output dependences to zero latency for processors which can
// simultaneously issue to the same register.  Returns true if a change
// was made.
bool ARMOverrideBypasses::zeroOutputDependences(SUnit &ISU, SDep &Dep) {
  if (Dep.getKind() == SDep::Output) {
    setBidirLatencies(ISU, Dep, 0);
    return true;
  }
  return false;
}

// The graph doesn't look inside bundles to determine their
// scheduling boundaries and reports zero latency into and out of them
// (except for CPSR into the bundle, which has latency 1).
// Make some better scheduling assumptions:
// 1) CPSR uses have zero latency; other uses have incoming latency 1.
// 2) CPSR defs retain a latency of zero; others have a latency of 1.
//
// Returns 1 if a use change was made; 2 if a def change was made; 0 otherwise.
unsigned ARMOverrideBypasses::makeBundleAssumptions(SUnit &ISU, SDep &Dep) {

  SUnit &DepSU = *Dep.getSUnit();
  const MachineInstr *SrcMI = ISU.getInstr();
  unsigned SrcOpcode = SrcMI->getOpcode();
  const MachineInstr *DstMI = DepSU.getInstr();
  unsigned DstOpcode = DstMI->getOpcode();

  if (DstOpcode == ARM::BUNDLE && TII->isPredicated(*DstMI)) {
    setBidirLatencies(
        ISU, Dep,
        (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR) ? 0 : 1);
    return 1;
  }
  if (SrcOpcode == ARM::BUNDLE && TII->isPredicated(*SrcMI) &&
      Dep.isAssignedRegDep() && Dep.getReg() != ARM::CPSR) {
    setBidirLatencies(ISU, Dep, 1);
    return 2;
  }
  return 0;
}

// Determine whether there is a memory RAW hazard here and set up latency
// accordingly.
bool ARMOverrideBypasses::memoryRAWHazard(SUnit &ISU, SDep &Dep,
                                          unsigned latency) {
  if (!Dep.isNormalMemory())
    return false;
  auto &SrcInst = *ISU.getInstr();
  auto &DstInst = *Dep.getSUnit()->getInstr();
  if (!SrcInst.mayStore() || !DstInst.mayLoad())
    return false;

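  // Only the leading memory operand of each instruction is examined; a match
  // on the underlying IR value (or on the same fixed stack slot, for spills
  // and fills) with equal offsets is treated as a RAW hazard.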
  auto SrcMO = *SrcInst.memoperands().begin();
  auto DstMO = *DstInst.memoperands().begin();
  auto SrcVal = SrcMO->getValue();
  auto DstVal = DstMO->getValue();
  auto SrcPseudoVal = SrcMO->getPseudoValue();
  auto DstPseudoVal = DstMO->getPseudoValue();
  if (SrcVal && DstVal && AA->alias(SrcVal, DstVal) == AliasResult::MustAlias &&
      SrcMO->getOffset() == DstMO->getOffset()) {
    setBidirLatencies(ISU, Dep, latency);
    return true;
  } else if (SrcPseudoVal && DstPseudoVal &&
             SrcPseudoVal->kind() == DstPseudoVal->kind() &&
             SrcPseudoVal->kind() == PseudoSourceValue::FixedStack) {
    // Spills/fills
    auto FS0 = cast<FixedStackPseudoSourceValue>(SrcPseudoVal);
    auto FS1 = cast<FixedStackPseudoSourceValue>(DstPseudoVal);
    if (FS0 == FS1) {
      setBidirLatencies(ISU, Dep, latency);
      return true;
    }
  }
  return false;
}

namespace {

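// Shared per-opcode information used by the subtarget-specific overrides
// below; it is created lazily by whichever overrides object is constructed
// first.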
std::unique_ptr<InstructionInformation> II;

class CortexM7InstructionInformation : public InstructionInformation {
public:
  CortexM7InstructionInformation(const ARMBaseInstrInfo *TII)
      : InstructionInformation(TII) {}
};

class CortexM7Overrides : public ARMOverrideBypasses {
public:
  CortexM7Overrides(const ARMBaseInstrInfo *TII, AAResults *AA)
      : ARMOverrideBypasses(TII, AA) {
    if (!II)
      II.reset(new CortexM7InstructionInformation(TII));
  }

  void modifyBypasses(SUnit &) override;
};

void CortexM7Overrides::modifyBypasses(SUnit &ISU) {
  const MachineInstr *SrcMI = ISU.getInstr();
  unsigned SrcOpcode = SrcMI->getOpcode();
  bool isNSWload = II->isNonSubwordLoad(SrcOpcode);

  // Walk the successors looking for latency overrides that are needed
  for (SDep &Dep : ISU.Succs) {

    // Output dependences should have 0 latency, as the M7 is able to
    // schedule writers to the same register for simultaneous issue.
    if (zeroOutputDependences(ISU, Dep))
      continue;

    if (memoryRAWHazard(ISU, Dep, 4))
      continue;

    // Ignore dependencies other than data
    if (Dep.getKind() != SDep::Data)
      continue;

    SUnit &DepSU = *Dep.getSUnit();
    if (DepSU.isBoundaryNode())
      continue;

    if (makeBundleAssumptions(ISU, Dep) == 1)
      continue;

    const MachineInstr *DstMI = DepSU.getInstr();
    unsigned DstOpcode = DstMI->getOpcode();

    // Word loads into any multiply or divide instruction are considered
    // unable to bypass their scheduling stage. Didn't do this in the .td file
    // because we cannot easily create a read advance that is 0 from certain
    // writer classes and 1 from all the rest.
    // (The other way around would have been easy.)
    if (isNSWload && (II->isMultiply(DstOpcode) || II->isDivide(DstOpcode)))
      setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);

    // Word loads into the B operand of a load/store are considered unable to
    // bypass their scheduling stage. This cannot be done in the .td file
    // because we need to decide between -1 and -2 for the ReadAdvance.
    if (isNSWload && II->hasBRegAddr(DstOpcode) &&
        DstMI->getOperand(2).getReg() == Dep.getReg())
      setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);

    // Multiplies into any address generation cannot bypass from EX3.  This
    // cannot be done in the .td file because we need to decide between -1
    // and -2 for the ReadAdvance.
    if (II->isMultiply(SrcOpcode)) {
      unsigned OpMask = II->getAddressOpMask(DstOpcode) >> 1;
      for (unsigned i = 1; OpMask; ++i, OpMask >>= 1) {
        if ((OpMask & 1) && DstMI->getOperand(i).isReg() &&
            DstMI->getOperand(i).getReg() == Dep.getReg()) {
          setBidirLatencies(ISU, Dep, 4); // first legal bypass is EX4->EX1
          break;
        }
      }
    }

    // Mismatched conditional producers take longer on M7; they end up looking
    // like they were produced at EX3 and read at IS.
    if (TII->isPredicated(*SrcMI) && Dep.isAssignedRegDep() &&
        (SrcOpcode == ARM::BUNDLE ||
         mismatchedPred(TII->getPredicate(*SrcMI),
                        TII->getPredicate(*DstMI)))) {
      unsigned Lat = 1;
      // Operand A of shift+ALU is treated as an EX1 read instead of EX2.
      if (II->isInlineShiftALU(DstOpcode) && DstMI->getOperand(3).getImm() &&
          DstMI->getOperand(1).getReg() == Dep.getReg())
        Lat = 2;
      Lat = std::min(3u, Dep.getLatency() + Lat);
      setBidirLatencies(ISU, Dep, std::max(Dep.getLatency(), Lat));
    }

    // CC setter into conditional producer shouldn't have a latency of more
    // than 1 unless it's due to an implicit read. (All the "true" readers
    // of the condition code use an implicit read, and predicates use an
    // explicit one.)
    if (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR &&
        TII->isPredicated(*DstMI) && !hasImplicitCPSRUse(DstMI))
      setBidirLatencies(ISU, Dep, 1);

    // REV instructions cannot bypass directly into the EX1 shifter.  The
    // code is slightly inexact as it doesn't attempt to ensure that the bypass
    // is to the shifter operands.
    if (II->isRev(SrcOpcode)) {
      if (II->isInlineShiftALU(DstOpcode))
        setBidirLatencies(ISU, Dep, 2);
      else if (II->isShift(DstOpcode))
        setBidirLatencies(ISU, Dep, 1);
    }
  }
}

class M85InstructionInformation : public InstructionInformation {
public:
  M85InstructionInformation(const ARMBaseInstrInfo *t)
      : InstructionInformation(t) {
    markDPProducersConsumers(t);
  }
};

class M85Overrides : public ARMOverrideBypasses {
public:
  M85Overrides(const ARMBaseInstrInfo *t, AAResults *a)
      : ARMOverrideBypasses(t, a) {
    if (!II)
      II.reset(new M85InstructionInformation(t));
  }

  void modifyBypasses(SUnit &) override;

private:
  unsigned computeBypassStage(const MCSchedClassDesc *SCD);
  signed modifyMixedWidthFP(const MachineInstr *SrcMI,
                            const MachineInstr *DstMI, unsigned RegID,
                            const MCSchedClassDesc *SCD);
};

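// Infer the pipeline stage in which a result becomes available from the write
// latency recorded in the scheduling model: a latency of 4 maps to stage 2, a
// latency of 5 or more maps to stage 3, and smaller latencies are used as the
// stage directly.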
unsigned M85Overrides::computeBypassStage(const MCSchedClassDesc *SCDesc) {
  auto SM = DAG->getSchedModel();
  unsigned DefIdx = 0; // just look for the first output's timing
  if (DefIdx < SCDesc->NumWriteLatencyEntries) {
    // Lookup the definition's write latency in SubtargetInfo.
    const MCWriteLatencyEntry *WLEntry =
        SM->getSubtargetInfo()->getWriteLatencyEntry(SCDesc, DefIdx);
    unsigned Latency = WLEntry->Cycles >= 0 ? WLEntry->Cycles : 1000;
    if (Latency == 4)
      return 2;
    else if (Latency == 5)
      return 3;
    else if (Latency > 3)
      return 3;
    else
      return Latency;
  }
  return 2;
}

// Latency changes for bypassing between FP registers of different sizes:
//
// Note that mixed DP/SP are unlikely because of the semantics
// of C.  Mixed MVE/SP are quite common when MVE intrinsics are used.
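//
// The value returned is an adjustment to the dependence latency: positive
// values add cycles, negative values remove them, and zero leaves the latency
// unchanged (the caller clamps the adjusted latency at zero).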
signed M85Overrides::modifyMixedWidthFP(const MachineInstr *SrcMI,
                                        const MachineInstr *DstMI,
                                        unsigned RegID,
                                        const MCSchedClassDesc *SCD) {

  if (!II->producesSP(SrcMI->getOpcode()) &&
      !II->producesDP(SrcMI->getOpcode()) &&
      !II->producesQP(SrcMI->getOpcode()))
    return 0;

  if (Register::isVirtualRegister(RegID)) {
    if (II->producesSP(SrcMI->getOpcode()) &&
        II->consumesDP(DstMI->getOpcode())) {
      for (auto &OP : SrcMI->operands())
        if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&
            OP.getSubReg() == ARM::ssub_1)
          return 5 - computeBypassStage(SCD);
    } else if (II->producesSP(SrcMI->getOpcode()) &&
               II->consumesQP(DstMI->getOpcode())) {
      for (auto &OP : SrcMI->operands())
        if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&
            (OP.getSubReg() == ARM::ssub_1 || OP.getSubReg() == ARM::ssub_3))
          return 5 - computeBypassStage(SCD) -
                 ((OP.getSubReg() == ARM::ssub_2 ||
                   OP.getSubReg() == ARM::ssub_3)
                      ? 1
                      : 0);
    } else if (II->producesDP(SrcMI->getOpcode()) &&
               II->consumesQP(DstMI->getOpcode())) {
      for (auto &OP : SrcMI->operands())
        if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&
            OP.getSubReg() == ARM::ssub_1)
          return -1;
    } else if (II->producesDP(SrcMI->getOpcode()) &&
               II->consumesSP(DstMI->getOpcode())) {
      for (auto &OP : DstMI->operands())
        if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&
            OP.getSubReg() == ARM::ssub_1)
          return 5 - computeBypassStage(SCD);
    } else if (II->producesQP(SrcMI->getOpcode()) &&
               II->consumesSP(DstMI->getOpcode())) {
      for (auto &OP : DstMI->operands())
        if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&
            (OP.getSubReg() == ARM::ssub_1 || OP.getSubReg() == ARM::ssub_3))
          return 5 - computeBypassStage(SCD) +
                 ((OP.getSubReg() == ARM::ssub_2 ||
                   OP.getSubReg() == ARM::ssub_3)
                      ? 1
                      : 0);
    } else if (II->producesQP(SrcMI->getOpcode()) &&
               II->consumesDP(DstMI->getOpcode())) {
      for (auto &OP : DstMI->operands())
        if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&
            OP.getSubReg() == ARM::ssub_1)
          return 1;
    }
  } else if (Register::isPhysicalRegister(RegID)) {
    // Note that when the producer is narrower, not all of the producers
    // may be present in the scheduling graph; somewhere earlier in the
    // compiler, an implicit def/use of the aliased full register gets
    // added to the producer, and so only that producer is seen as *the*
    // single producer.  This behavior also has the unfortunate effect of
    // serializing the producers in the compiler's view of things.
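    // In the FP register file, S(2n) and S(2n+1) are the two halves of D(n),
    // and D(2n)/D(2n+1) are the halves of Q(n); the odd-register checks below
    // test whether the value lives in the upper half of the wider register.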
    if (II->producesSP(SrcMI->getOpcode()) &&
        II->consumesDP(DstMI->getOpcode())) {
      for (auto &OP : SrcMI->operands())
        if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::S1 &&
            OP.getReg() <= ARM::S31 && (OP.getReg() - ARM::S0) % 2 &&
            (OP.getReg() == RegID ||
             (OP.getReg() - ARM::S0) / 2 + ARM::D0 == RegID ||
             (OP.getReg() - ARM::S0) / 4 + ARM::Q0 == RegID))
          return 5 - computeBypassStage(SCD);
    } else if (II->producesSP(SrcMI->getOpcode()) &&
               II->consumesQP(DstMI->getOpcode())) {
      for (auto &OP : SrcMI->operands())
        if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::S1 &&
            OP.getReg() <= ARM::S31 && (OP.getReg() - ARM::S0) % 2 &&
            (OP.getReg() == RegID ||
             (OP.getReg() - ARM::S0) / 2 + ARM::D0 == RegID ||
             (OP.getReg() - ARM::S0) / 4 + ARM::Q0 == RegID))
          return 5 - computeBypassStage(SCD) -
                 (((OP.getReg() - ARM::S0) / 2) % 2 ? 1 : 0);
    } else if (II->producesDP(SrcMI->getOpcode()) &&
               II->consumesQP(DstMI->getOpcode())) {
      for (auto &OP : SrcMI->operands())
        if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::D0 &&
            OP.getReg() <= ARM::D15 && (OP.getReg() - ARM::D0) % 2 &&
            (OP.getReg() == RegID ||
             (OP.getReg() - ARM::D0) / 2 + ARM::Q0 == RegID))
          return -1;
    } else if (II->producesDP(SrcMI->getOpcode()) &&
               II->consumesSP(DstMI->getOpcode())) {
      if (RegID >= ARM::S1 && RegID <= ARM::S31 && (RegID - ARM::S0) % 2)
        return 5 - computeBypassStage(SCD);
    } else if (II->producesQP(SrcMI->getOpcode()) &&
               II->consumesSP(DstMI->getOpcode())) {
      if (RegID >= ARM::S1 && RegID <= ARM::S31 && (RegID - ARM::S0) % 2)
        return 5 - computeBypassStage(SCD) +
               (((RegID - ARM::S0) / 2) % 2 ? 1 : 0);
    } else if (II->producesQP(SrcMI->getOpcode()) &&
               II->consumesDP(DstMI->getOpcode())) {
      if (RegID >= ARM::D1 && RegID <= ARM::D15 && (RegID - ARM::D0) % 2)
        return 1;
    }
  }
  return 0;
}

void M85Overrides::modifyBypasses(SUnit &ISU) {
  const MachineInstr *SrcMI = ISU.getInstr();
  unsigned SrcOpcode = SrcMI->getOpcode();
  bool isNSWload = II->isNonSubwordLoad(SrcOpcode);

  // Walk the successors looking for latency overrides that are needed
  for (SDep &Dep : ISU.Succs) {

    // Output dependences should have 0 latency, as the Cortex-M85 is able to
    // schedule writers to the same register for simultaneous issue.
    if (zeroOutputDependences(ISU, Dep))
      continue;

    if (memoryRAWHazard(ISU, Dep, 3))
      continue;

    // Ignore dependencies other than data.
    if (Dep.getKind() != SDep::Data)
      continue;

    SUnit &DepSU = *Dep.getSUnit();
    if (DepSU.isBoundaryNode())
      continue;

    if (makeBundleAssumptions(ISU, Dep) == 1)
      continue;

    const MachineInstr *DstMI = DepSU.getInstr();
    unsigned DstOpcode = DstMI->getOpcode();

    // Word loads into the shifted B operand of a load/store cannot bypass
    // their scheduling stage. This cannot be done in the .td file because we
    // need to decide between -1 and -2 for the ReadAdvance.

    if (isNSWload && II->hasBRegAddrShift(DstOpcode) &&
        DstMI->getOperand(3).getImm() != 0 && // shift operand
        DstMI->getOperand(2).getReg() == Dep.getReg())
      setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);

    if (isNSWload && isMVEVectorInstruction(DstMI)) {
      setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);
    }

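    // Back-to-back MVE integer MACs of the same type can forward into the
    // destination (accumulator) register, so that dependence is given one
    // cycle less latency.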
    if (II->isMVEIntMAC(DstOpcode) &&
        II->isMVEIntMACMatched(SrcOpcode, DstOpcode) &&
        DstMI->getOperand(0).isReg() &&
        DstMI->getOperand(0).getReg() == Dep.getReg())
      setBidirLatencies(ISU, Dep, Dep.getLatency() - 1);

    // CC setter into conditional producer shouldn't have a latency of more
    // than 0 unless it's due to an implicit read.
    if (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR &&
        TII->isPredicated(*DstMI) && !hasImplicitCPSRUse(DstMI))
      setBidirLatencies(ISU, Dep, 0);

    if (signed ALat = modifyMixedWidthFP(SrcMI, DstMI, Dep.getReg(),
                                         DAG->getSchedClass(&ISU)))
      setBidirLatencies(ISU, Dep, std::max(0, signed(Dep.getLatency()) + ALat));

    if (II->isRev(SrcOpcode)) {
      if (II->isInlineShiftALU(DstOpcode))
        setBidirLatencies(ISU, Dep, 1);
      else if (II->isShift(DstOpcode))
        setBidirLatencies(ISU, Dep, 1);
    }
  }
}

// Add M55 specific overrides for latencies between instructions. Currently it:
//  - Adds an extra cycle latency between MVE VMLAV and scalar instructions.
class CortexM55Overrides : public ARMOverrideBypasses {
public:
  CortexM55Overrides(const ARMBaseInstrInfo *TII, AAResults *AA)
      : ARMOverrideBypasses(TII, AA) {}

  void modifyBypasses(SUnit &SU) override {
    MachineInstr *SrcMI = SU.getInstr();
    if (!(SrcMI->getDesc().TSFlags & ARMII::HorizontalReduction))
      return;

    for (SDep &Dep : SU.Succs) {
      if (Dep.getKind() != SDep::Data)
        continue;
      SUnit &DepSU = *Dep.getSUnit();
      if (DepSU.isBoundaryNode())
        continue;
      MachineInstr *DstMI = DepSU.getInstr();

      if (!isMVEVectorInstruction(DstMI) && !DstMI->mayStore())
        setBidirLatencies(SU, Dep, 3);
    }
  }
};

} // end anonymous namespace

void ARMOverrideBypasses::apply(ScheduleDAGInstrs *DAGInstrs) {
  DAG = DAGInstrs;
  for (SUnit &ISU : DAGInstrs->SUnits) {
    if (ISU.isBoundaryNode())
      continue;
    modifyBypasses(ISU);
  }
  if (DAGInstrs->ExitSU.getInstr())
    modifyBypasses(DAGInstrs->ExitSU);
}

std::unique_ptr<ScheduleDAGMutation>
createARMLatencyMutations(const ARMSubtarget &ST, AAResults *AA) {
  if (ST.isCortexM85())
    return std::make_unique<M85Overrides>(ST.getInstrInfo(), AA);
  else if (ST.isCortexM7())
    return std::make_unique<CortexM7Overrides>(ST.getInstrInfo(), AA);
  else if (ST.isCortexM55())
    return std::make_unique<CortexM55Overrides>(ST.getInstrInfo(), AA);

  return nullptr;
}

} // end namespace llvm