//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64 specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AArch64Subtarget.h"

#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64PBQPRegAlloc.h"
#include "AArch64TargetMachine.h"
#include "GISel/AArch64CallLowering.h"
#include "GISel/AArch64LegalizerInfo.h"
#include "GISel/AArch64RegisterBankInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;

#define DEBUG_TYPE "aarch64-subtarget"
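
// These macros pull the tablegen-generated subtarget constructor and the
// CPU/feature description tables out of AArch64GenSubtargetInfo.inc.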
#define GET_SUBTARGETINFO_CTOR
#define GET_SUBTARGETINFO_TARGET_DESC
#include "AArch64GenSubtargetInfo.inc"

static cl::opt<bool>
EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if "
                     "converter pass"), cl::init(true), cl::Hidden);

// If OS supports TBI, use this flag to enable it.
static cl::opt<bool>
UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of "
                         "an address is ignored"), cl::init(false), cl::Hidden);

static cl::opt<bool>
    UseNonLazyBind("aarch64-enable-nonlazybind",
                   cl::desc("Call nonlazybind functions via direct GOT load"),
                   cl::init(false), cl::Hidden);

static cl::opt<unsigned> SVEVectorBitsMax(
    "aarch64-sve-vector-bits-max",
    cl::desc("Assume SVE vector registers are at most this big, "
             "with zero meaning no maximum size is assumed."),
    cl::init(0), cl::Hidden);

static cl::opt<unsigned> SVEVectorBitsMin(
    "aarch64-sve-vector-bits-min",
    cl::desc("Assume SVE vector registers are at least this big, "
             "with zero meaning no minimum size is assumed."),
    cl::init(0), cl::Hidden);

static cl::opt<bool> UseAA("aarch64-use-aa", cl::init(true),
                           cl::desc("Enable the use of AA during codegen."));
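
// The flags above are ordinary llvm::cl options, so they can be set on the
// llc command line, for example (input file name is illustrative only):
//   llc -mtriple=aarch64 -aarch64-sve-vector-bits-min=512 input.ll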

AArch64Subtarget &
AArch64Subtarget::initializeSubtargetDependencies(StringRef FS,
                                                  StringRef CPUString) {
  // Determine default and user-specified characteristics.

  if (CPUString.empty())
    CPUString = "generic";

  ParseSubtargetFeatures(CPUString, /*TuneCPU*/ CPUString, FS);
  initializeProperties();

  return *this;
}
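
// Note: the *LogAlignment properties below are log2 values, so, for example,
// PrefFunctionLogAlignment = 4 requests 16-byte function alignment.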
void AArch64Subtarget::initializeProperties() {
  // Initialize CPU specific properties. We should add a tablegen feature for
  // this in the future so we can specify it together with the subtarget
  // features.
  switch (ARMProcFamily) {
  case Others:
    break;
  case Carmel:
    CacheLineSize = 64;
    break;
  case CortexA35:
    break;
  case CortexA53:
  case CortexA55:
    PrefFunctionLogAlignment = 4;
    break;
  case CortexA57:
    MaxInterleaveFactor = 4;
    PrefFunctionLogAlignment = 4;
    break;
  case CortexA65:
    PrefFunctionLogAlignment = 3;
    break;
  case CortexA72:
  case CortexA73:
  case CortexA75:
  case CortexA76:
  case CortexA77:
  case CortexA78:
  case CortexA78C:
  case CortexR82:
  case CortexX1:
    PrefFunctionLogAlignment = 4;
    break;
  case A64FX:
    CacheLineSize = 256;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    break;
  case AppleA7:
  case AppleA10:
  case AppleA11:
  case AppleA12:
  case AppleA13:
  case AppleA14:
    CacheLineSize = 64;
    PrefetchDistance = 280;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 3;
    break;
  case ExynosM3:
    MaxInterleaveFactor = 4;
    MaxJumpTableSize = 20;
    PrefFunctionLogAlignment = 5;
    PrefLoopLogAlignment = 4;
    break;
  case Falkor:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    CacheLineSize = 128;
    PrefetchDistance = 820;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 8;
    break;
  case Kryo:
    MaxInterleaveFactor = 4;
    VectorInsertExtractBaseCost = 2;
    CacheLineSize = 128;
    PrefetchDistance = 740;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 11;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case NeoverseE1:
    PrefFunctionLogAlignment = 3;
    break;
  case NeoverseN1:
  case NeoverseN2:
  case NeoverseV1:
    PrefFunctionLogAlignment = 4;
    break;
  case Saphira:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX2T99:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX:
  case ThunderXT88:
  case ThunderXT81:
  case ThunderXT83:
    CacheLineSize = 128;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case TSV110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    break;
  case ThunderX3T110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  }
}

AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
                                   const std::string &FS,
                                   const TargetMachine &TM, bool LittleEndian)
    : AArch64GenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS),
      ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
      CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
      IsLittle(LittleEndian), TargetTriple(TT), FrameLowering(),
      InstrInfo(initializeSubtargetDependencies(FS, CPU)), TSInfo(),
      TLInfo(TM, *this) {
  if (AArch64::isX18ReservedByDefault(TT))
    ReserveXRegister.set(18);

  CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AArch64LegalizerInfo(*this));

  auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo());

  // FIXME: At this point, we can't rely on Subtarget having RBI.
  // It's awkward to mix passing RBI and the Subtarget; should we pass
  // TII/TRI as well?
  InstSelector.reset(createAArch64InstructionSelector(
      *static_cast<const AArch64TargetMachine *>(&TM), *this, *RBI));

  RegBankInfo.reset(RBI);
}

const CallLowering *AArch64Subtarget::getCallLowering() const {
  return CallLoweringInfo.get();
}

const InlineAsmLowering *AArch64Subtarget::getInlineAsmLowering() const {
  return InlineAsmLoweringInfo.get();
}

InstructionSelector *AArch64Subtarget::getInstructionSelector() const {
  return InstSelector.get();
}

const LegalizerInfo *AArch64Subtarget::getLegalizerInfo() const {
  return Legalizer.get();
}

const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const {
  return RegBankInfo.get();
}

/// Find the target operand flags that describe how a global value should be
/// referenced for the current subtarget.
unsigned
AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
                                          const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, simply to get a single 8-byte
  // absolute relocation on all global addresses.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
    return AArch64II::MO_GOT;

  if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) {
    if (GV->hasDLLImportStorageClass())
      return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
    if (getTargetTriple().isOSWindows())
      return AArch64II::MO_GOT | AArch64II::MO_COFFSTUB;
    return AArch64II::MO_GOT;
  }

  // The small code model's direct accesses use ADRP, which cannot
  // necessarily produce the value 0 (if the code is above 4GB).
  // Same for the tiny code model, where we have a PC-relative LDR.
  if ((useSmallAddressing() || TM.getCodeModel() == CodeModel::Tiny) &&
      GV->hasExternalWeakLinkage())
    return AArch64II::MO_GOT;

  // References to tagged globals are marked with MO_NC | MO_TAGGED to indicate
  // that their nominal addresses are tagged and outside of the code model. In
  // AArch64ExpandPseudo::expandMI we emit an additional instruction to set the
  // tag if necessary based on MO_TAGGED.
  if (AllowTaggedGlobals && !isa<FunctionType>(GV->getValueType()))
    return AArch64II::MO_NC | AArch64II::MO_TAGGED;

  return AArch64II::MO_NO_FLAG;
}

unsigned AArch64Subtarget::classifyGlobalFunctionReference(
    const GlobalValue *GV, const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, because we don't have the
  // relocations available to do anything else.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO() &&
      !GV->hasInternalLinkage())
    return AArch64II::MO_GOT;

  // NonLazyBind goes via GOT unless we know it's available locally.
  auto *F = dyn_cast<Function>(GV);
  if (UseNonLazyBind && F && F->hasFnAttribute(Attribute::NonLazyBind) &&
      !TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
    return AArch64II::MO_GOT;

  // Use ClassifyGlobalReference for setting MO_DLLIMPORT/MO_COFFSTUB.
  if (getTargetTriple().isOSWindows())
    return ClassifyGlobalReference(GV, TM);

  return AArch64II::MO_NO_FLAG;
}

void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                           unsigned NumRegionInstrs) const {
  // An LNT run (at least on Cyclone) showed reasonably significant gains for
  // bi-directional scheduling; 253.perlbmk in particular.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;
  // Enabling or disabling the latency heuristic is a close call: it seems to
  // help nearly no benchmark on out-of-order architectures; on the other hand
  // it regresses register pressure on a few benchmarks.
  Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
}

bool AArch64Subtarget::enableEarlyIfConversion() const {
  return EnableEarlyIfConvert;
}
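
// TBI (Top Byte Ignore) is the ARMv8 feature whereby the hardware ignores the
// top eight bits of a 64-bit address on loads and stores, so pointers can
// carry metadata there. We only report support where the OS guarantees TBI is
// enabled; here that is iOS 8 and later.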
bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
  if (!UseAddressTopByteIgnored)
    return false;

  if (TargetTriple.isiOS()) {
    unsigned Major, Minor, Micro;
    TargetTriple.getiOSVersion(Major, Minor, Micro);
    return Major >= 8;
  }

  return false;
}
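
// balanceFPOps() is set for cores like Cortex-A57; the chaining constraint
// steers the PBQP register allocator to keep FP multiply-accumulate chains on
// suitable registers (see AArch64PBQPRegAlloc.cpp for the details).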
std::unique_ptr<PBQPRAConstraint>
AArch64Subtarget::getCustomPBQPConstraints() const {
  return balanceFPOps() ? std::make_unique<A57ChainingConstraint>() : nullptr;
}

void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
  // We usually compute max call frame size after ISel. Do the computation now
  // if the .mir file didn't specify it. Note that this will probably give you
  // bogus values after PEI has eliminated the callframe setup/destroy pseudo
  // instructions; specify it explicitly if you need it to be correct.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  if (!MFI.isMaxCallFrameSizeComputed())
    MFI.computeMaxCallFrameSize(MF);
}
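
// Both SVE size queries round down to a multiple of 128 bits (the SVE
// granule). An inconsistent pair such as -aarch64-sve-vector-bits-min=512
// with -aarch64-sve-vector-bits-max=256 trips the assertions below; in builds
// without assertions, the std::max/std::min clamps reconcile the two values
// instead.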
unsigned AArch64Subtarget::getMaxSVEVectorSizeInBits() const {
  assert(HasSVE && "Tried to get SVE vector length without SVE support!");
  assert(SVEVectorBitsMax % 128 == 0 &&
         "SVE requires vector length in multiples of 128!");
  assert((SVEVectorBitsMax >= SVEVectorBitsMin || SVEVectorBitsMax == 0) &&
         "Minimum SVE vector size should not be larger than its maximum!");
  if (SVEVectorBitsMax == 0)
    return 0;
  return (std::max(SVEVectorBitsMin, SVEVectorBitsMax) / 128) * 128;
}

unsigned AArch64Subtarget::getMinSVEVectorSizeInBits() const {
  assert(HasSVE && "Tried to get SVE vector length without SVE support!");
  assert(SVEVectorBitsMin % 128 == 0 &&
         "SVE requires vector length in multiples of 128!");
  assert((SVEVectorBitsMax >= SVEVectorBitsMin || SVEVectorBitsMax == 0) &&
         "Minimum SVE vector size should not be larger than its maximum!");
  if (SVEVectorBitsMax == 0)
    return (SVEVectorBitsMin / 128) * 128;
  return (std::min(SVEVectorBitsMin, SVEVectorBitsMax) / 128) * 128;
}
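
// 128-bit SVE registers offer no extra width over NEON, so fixed-length SVE
// lowering is only worthwhile once at least 256-bit registers can be assumed.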
bool AArch64Subtarget::useSVEForFixedLengthVectors() const {
  // Prefer NEON unless larger SVE registers are available.
  return hasSVE() && getMinSVEVectorSizeInBits() >= 256;
}

bool AArch64Subtarget::useAA() const { return UseAA; }