//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64 specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AArch64Subtarget.h"

#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64PBQPRegAlloc.h"
#include "AArch64TargetMachine.h"
#include "GISel/AArch64CallLowering.h"
#include "GISel/AArch64LegalizerInfo.h"
#include "GISel/AArch64RegisterBankInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;

#define DEBUG_TYPE "aarch64-subtarget"

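// Pull in the TableGen-generated parts of the subtarget: defining
// GET_SUBTARGETINFO_CTOR and GET_SUBTARGETINFO_TARGET_DESC before the include
// selects the generated constructor and the CPU/feature description tables.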
#define GET_SUBTARGETINFO_CTOR
#define GET_SUBTARGETINFO_TARGET_DESC
#include "AArch64GenSubtargetInfo.inc"

static cl::opt<bool>
EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if "
                     "converter pass"), cl::init(true), cl::Hidden);

// If OS supports TBI, use this flag to enable it.
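// (With TBI the hardware ignores bits 63:56 of a virtual address when forming
// memory accesses, leaving the top byte free for software tags.)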
static cl::opt<bool>
UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of "
                         "an address is ignored"), cl::init(false), cl::Hidden);

static cl::opt<bool>
    UseNonLazyBind("aarch64-enable-nonlazybind",
                   cl::desc("Call nonlazybind functions via direct GOT load"),
                   cl::init(false), cl::Hidden);

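// The two options below bound the SVE register size the backend may assume.
// For example, an invocation such as
//   llc -aarch64-sve-vector-bits-min=512 -aarch64-sve-vector-bits-max=512
// pins the assumed vector length to exactly 512 bits.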
static cl::opt<unsigned> SVEVectorBitsMax(
    "aarch64-sve-vector-bits-max",
    cl::desc("Assume SVE vector registers are at most this big, "
             "with zero meaning no maximum size is assumed."),
    cl::init(0), cl::Hidden);

static cl::opt<unsigned> SVEVectorBitsMin(
    "aarch64-sve-vector-bits-min",
    cl::desc("Assume SVE vector registers are at least this big, "
             "with zero meaning no minimum size is assumed."),
    cl::init(0), cl::Hidden);

static cl::opt<bool> UseAA("aarch64-use-aa", cl::init(true),
                           cl::desc("Enable the use of AA during codegen."));

AArch64Subtarget &
AArch64Subtarget::initializeSubtargetDependencies(StringRef FS,
                                                  StringRef CPUString) {
  // Determine default and user-specified characteristics

  if (CPUString.empty())
    CPUString = "generic";

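  // ParseSubtargetFeatures is generated by TableGen (pulled in via
  // AArch64GenSubtargetInfo.inc above); it applies the CPU's default features
  // and then the explicit feature string FS on top.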
  ParseSubtargetFeatures(CPUString, /*TuneCPU*/ CPUString, FS);
  initializeProperties();

  return *this;
}

void AArch64Subtarget::initializeProperties() {
  // Initialize CPU specific properties. We should add a tablegen feature for
  // this in the future so we can specify it together with the subtarget
  // features.
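  // Note that the *LogAlignment properties are log2 values, so e.g. a
  // PrefFunctionLogAlignment of 4 requests 16-byte function alignment.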
  switch (ARMProcFamily) {
  case Others:
    break;
  case Carmel:
    CacheLineSize = 64;
    break;
  case CortexA35:
    break;
  case CortexA53:
  case CortexA55:
    PrefFunctionLogAlignment = 4;
    break;
  case CortexA57:
    MaxInterleaveFactor = 4;
    PrefFunctionLogAlignment = 4;
    break;
  case CortexA65:
    PrefFunctionLogAlignment = 3;
    break;
  case CortexA72:
  case CortexA73:
  case CortexA75:
  case CortexA76:
  case CortexA77:
  case CortexA78:
  case CortexA78C:
  case CortexR82:
  case CortexX1:
    PrefFunctionLogAlignment = 4;
    break;
  case A64FX:
    CacheLineSize = 256;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    break;
  case AppleA7:
  case AppleA10:
  case AppleA11:
  case AppleA12:
  case AppleA13:
  case AppleA14:
    CacheLineSize = 64;
    PrefetchDistance = 280;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 3;
    break;
  case ExynosM3:
    MaxInterleaveFactor = 4;
    MaxJumpTableSize = 20;
    PrefFunctionLogAlignment = 5;
    PrefLoopLogAlignment = 4;
    break;
  case Falkor:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    CacheLineSize = 128;
    PrefetchDistance = 820;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 8;
    break;
  case Kryo:
    MaxInterleaveFactor = 4;
    VectorInsertExtractBaseCost = 2;
    CacheLineSize = 128;
    PrefetchDistance = 740;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 11;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case NeoverseE1:
    PrefFunctionLogAlignment = 3;
    break;
  case NeoverseN1:
  case NeoverseN2:
  case NeoverseV1:
    PrefFunctionLogAlignment = 4;
    break;
  case Saphira:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX2T99:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX:
  case ThunderXT88:
  case ThunderXT81:
  case ThunderXT83:
    CacheLineSize = 128;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case TSV110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    break;
  case ThunderX3T110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  }
}

AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
                                   const std::string &FS,
                                   const TargetMachine &TM, bool LittleEndian)
    : AArch64GenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS),
      ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
      CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
      IsLittle(LittleEndian),
      TargetTriple(TT), FrameLowering(),
      InstrInfo(initializeSubtargetDependencies(FS, CPU)), TSInfo(),
      TLInfo(TM, *this) {
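  // X18 is the platform register on several targets (e.g. Darwin and
  // Windows); reserve it there so the register allocator never touches it.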
  if (AArch64::isX18ReservedByDefault(TT))
    ReserveXRegister.set(18);

  CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AArch64LegalizerInfo(*this));

  auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo());

  // FIXME: At this point, we can't rely on Subtarget having RBI.
  // It's awkward to mix passing RBI and the Subtarget; should we pass
  // TII/TRI as well?
  InstSelector.reset(createAArch64InstructionSelector(
      *static_cast<const AArch64TargetMachine *>(&TM), *this, *RBI));

  RegBankInfo.reset(RBI);
}

const CallLowering *AArch64Subtarget::getCallLowering() const {
  return CallLoweringInfo.get();
}

const InlineAsmLowering *AArch64Subtarget::getInlineAsmLowering() const {
  return InlineAsmLoweringInfo.get();
}

InstructionSelector *AArch64Subtarget::getInstructionSelector() const {
  return InstSelector.get();
}

const LegalizerInfo *AArch64Subtarget::getLegalizerInfo() const {
  return Legalizer.get();
}

const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const {
  return RegBankInfo.get();
}

/// Find the target operand flags that describe how a global value should be
/// referenced for the current subtarget.
unsigned
AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
                                          const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, simply to get a single 8-byte
  // absolute relocation on all global addresses.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
    return AArch64II::MO_GOT;

  if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) {
    if (GV->hasDLLImportStorageClass())
      return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
    if (getTargetTriple().isOSWindows())
      return AArch64II::MO_GOT | AArch64II::MO_COFFSTUB;
    return AArch64II::MO_GOT;
  }

  // The small code model's direct accesses use ADRP, which cannot
  // necessarily produce the value 0 (if the code is above 4GB).
  // Same for the tiny code model, where we have a pc relative LDR.
  if ((useSmallAddressing() || TM.getCodeModel() == CodeModel::Tiny) &&
      GV->hasExternalWeakLinkage())
    return AArch64II::MO_GOT;

  // References to tagged globals are marked with MO_NC | MO_TAGGED to indicate
  // that their nominal addresses are tagged and outside of the code model. In
  // AArch64ExpandPseudo::expandMI we emit an additional instruction to set the
  // tag if necessary based on MO_TAGGED.
  if (AllowTaggedGlobals && !isa<FunctionType>(GV->getValueType()))
    return AArch64II::MO_NC | AArch64II::MO_TAGGED;

  return AArch64II::MO_NO_FLAG;
}

unsigned AArch64Subtarget::classifyGlobalFunctionReference(
    const GlobalValue *GV, const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, because we don't have the
  // relocations available to do anything else.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO() &&
      !GV->hasInternalLinkage())
    return AArch64II::MO_GOT;

  // NonLazyBind goes via GOT unless we know it's available locally.
  auto *F = dyn_cast<Function>(GV);
  if (UseNonLazyBind && F && F->hasFnAttribute(Attribute::NonLazyBind) &&
      !TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
    return AArch64II::MO_GOT;

  // Use ClassifyGlobalReference for setting MO_DLLIMPORT/MO_COFFSTUB.
  if (getTargetTriple().isOSWindows())
    return ClassifyGlobalReference(GV, TM);

  return AArch64II::MO_NO_FLAG;
}

void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                           unsigned NumRegionInstrs) const {
  // An LNT run (at least on Cyclone) showed reasonably significant gains from
  // bi-directional scheduling, e.g. on 253.perlbmk.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;
  // Enabling or disabling the latency heuristic is a close call: it seems to
  // help nearly no benchmark on out-of-order architectures, while on the other
  // hand it regresses register pressure on a few benchmarks.
  Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
}

bool AArch64Subtarget::enableEarlyIfConversion() const {
  return EnableEarlyIfConvert;
}

bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
  if (!UseAddressTopByteIgnored)
    return false;

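  // The only environment this code trusts to honor TBI in userspace is iOS 8
  // and later; be conservative everywhere else.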
  if (TargetTriple.isiOS()) {
    unsigned Major, Minor, Micro;
    TargetTriple.getiOSVersion(Major, Minor, Micro);
    return Major >= 8;
  }

  return false;
}

std::unique_ptr<PBQPRAConstraint>
AArch64Subtarget::getCustomPBQPConstraints() const {
  return balanceFPOps() ? std::make_unique<A57ChainingConstraint>() : nullptr;
}

void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
  // We usually compute the max call frame size after ISel. Do the computation
  // now if the .mir file didn't specify it. Note that this will probably give
  // you bogus values after PEI has eliminated the call frame setup/destroy
  // pseudo instructions; specify it explicitly if you need it to be correct.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  if (!MFI.isMaxCallFrameSizeComputed())
    MFI.computeMaxCallFrameSize(MF);
}

unsigned AArch64Subtarget::getMaxSVEVectorSizeInBits() const {
  assert(HasSVE && "Tried to get SVE vector length without SVE support!");
  assert(SVEVectorBitsMax % 128 == 0 &&
         "SVE requires vector length in multiples of 128!");
  assert((SVEVectorBitsMax >= SVEVectorBitsMin || SVEVectorBitsMax == 0) &&
         "Minimum SVE vector size should not be larger than its maximum!");
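  // Round down to a whole number of 128-bit granules. The std::max only
  // matters in builds without asserts, where it widens the reported maximum
  // to cover a requested minimum that is larger than it.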
  if (SVEVectorBitsMax == 0)
    return 0;
  return (std::max(SVEVectorBitsMin, SVEVectorBitsMax) / 128) * 128;
}

unsigned AArch64Subtarget::getMinSVEVectorSizeInBits() const {
  assert(HasSVE && "Tried to get SVE vector length without SVE support!");
  assert(SVEVectorBitsMin % 128 == 0 &&
         "SVE requires vector length in multiples of 128!");
  assert((SVEVectorBitsMax >= SVEVectorBitsMin || SVEVectorBitsMax == 0) &&
         "Minimum SVE vector size should not be larger than its maximum!");
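  // With no maximum configured, honor the requested minimum as-is; otherwise
  // clamp the minimum to the configured maximum. Both paths round down to a
  // 128-bit granule.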
  if (SVEVectorBitsMax == 0)
    return (SVEVectorBitsMin / 128) * 128;
  return (std::min(SVEVectorBitsMin, SVEVectorBitsMax) / 128) * 128;
}

bool AArch64Subtarget::useSVEForFixedLengthVectors() const {
  // Prefer NEON unless larger SVE registers are available.
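  // In practice this requires -aarch64-sve-vector-bits-min=256 or higher.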
  return hasSVE() && getMinSVEVectorSizeInBits() >= 256;
}

bool AArch64Subtarget::useAA() const { return UseAA; }