xref: /llvm-project/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp (revision 9ae92d70561bcc95a7f818920238e764253d9758)
1 //===-- ARMSelectionDAGInfo.cpp - ARM SelectionDAG Info -------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the ARMSelectionDAGInfo class.
10 //
11 //===----------------------------------------------------------------------===//
12 
#include "ARMTargetTransformInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Support/CommandLine.h"
#include <cstring>
16 using namespace llvm;
17 
18 #define DEBUG_TYPE "arm-selectiondag-info"
19 
// Command-line control over converting memcpy/memset into MVE tail-predicated
// (WLSTP) loops. Consulted by the inline-TP-loop heuristic in this file;
// "allow" leaves the decision to per-call-site conditions.
cl::opt<TPLoop::MemTransfer> EnableMemtransferTPLoop(
    "arm-memtransfer-tploop", cl::Hidden,
    cl::desc("Control conversion of memcpy to "
             "Tail predicated loops (WLSTP)"),
    cl::init(TPLoop::ForceDisabled),
    cl::values(clEnumValN(TPLoop::ForceDisabled, "force-disabled",
                          "Don't convert memcpy to TP loop."),
               clEnumValN(TPLoop::ForceEnabled, "force-enabled",
                          "Always convert memcpy to TP loop."),
               clEnumValN(TPLoop::Allow, "allow",
                          "Allow (may be subject to certain conditions) "
                          "conversion of memcpy to TP loop.")));
32 
33 bool ARMSelectionDAGInfo::isTargetMemoryOpcode(unsigned Opcode) const {
34   return Opcode >= ARMISD::FIRST_MEMORY_OPCODE &&
35          Opcode <= ARMISD::LAST_MEMORY_OPCODE;
36 }
37 
38 // Emit, if possible, a specialized version of the given Libcall. Typically this
39 // means selecting the appropriately aligned version, but we also convert memset
40 // of 0 into memclr.
41 SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
42     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
43     SDValue Size, unsigned Align, RTLIB::Libcall LC) const {
44   const ARMSubtarget &Subtarget =
45       DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
46   const ARMTargetLowering *TLI = Subtarget.getTargetLowering();
47 
48   // Only use a specialized AEABI function if the default version of this
49   // Libcall is an AEABI function.
50   if (std::strncmp(TLI->getLibcallName(LC), "__aeabi", 7) != 0)
51     return SDValue();
52 
53   // Translate RTLIB::Libcall to AEABILibcall. We only do this in order to be
54   // able to translate memset to memclr and use the value to index the function
55   // name array.
56   enum {
57     AEABI_MEMCPY = 0,
58     AEABI_MEMMOVE,
59     AEABI_MEMSET,
60     AEABI_MEMCLR
61   } AEABILibcall;
62   switch (LC) {
63   case RTLIB::MEMCPY:
64     AEABILibcall = AEABI_MEMCPY;
65     break;
66   case RTLIB::MEMMOVE:
67     AEABILibcall = AEABI_MEMMOVE;
68     break;
69   case RTLIB::MEMSET:
70     AEABILibcall = AEABI_MEMSET;
71     if (isNullConstant(Src))
72       AEABILibcall = AEABI_MEMCLR;
73     break;
74   default:
75     return SDValue();
76   }
77 
78   // Choose the most-aligned libcall variant that we can
79   enum {
80     ALIGN1 = 0,
81     ALIGN4,
82     ALIGN8
83   } AlignVariant;
84   if ((Align & 7) == 0)
85     AlignVariant = ALIGN8;
86   else if ((Align & 3) == 0)
87     AlignVariant = ALIGN4;
88   else
89     AlignVariant = ALIGN1;
90 
91   TargetLowering::ArgListTy Args;
92   TargetLowering::ArgListEntry Entry;
93   Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
94   Entry.Node = Dst;
95   Args.push_back(Entry);
96   if (AEABILibcall == AEABI_MEMCLR) {
97     Entry.Node = Size;
98     Args.push_back(Entry);
99   } else if (AEABILibcall == AEABI_MEMSET) {
100     // Adjust parameters for memset, EABI uses format (ptr, size, value),
101     // GNU library uses (ptr, value, size)
102     // See RTABI section 4.3.4
103     Entry.Node = Size;
104     Args.push_back(Entry);
105 
106     // Extend or truncate the argument to be an i32 value for the call.
107     if (Src.getValueType().bitsGT(MVT::i32))
108       Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
109     else if (Src.getValueType().bitsLT(MVT::i32))
110       Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src);
111 
112     Entry.Node = Src;
113     Entry.Ty = Type::getInt32Ty(*DAG.getContext());
114     Entry.IsSExt = false;
115     Args.push_back(Entry);
116   } else {
117     Entry.Node = Src;
118     Args.push_back(Entry);
119 
120     Entry.Node = Size;
121     Args.push_back(Entry);
122   }
123 
124   char const *FunctionNames[4][3] = {
125     { "__aeabi_memcpy",  "__aeabi_memcpy4",  "__aeabi_memcpy8"  },
126     { "__aeabi_memmove", "__aeabi_memmove4", "__aeabi_memmove8" },
127     { "__aeabi_memset",  "__aeabi_memset4",  "__aeabi_memset8"  },
128     { "__aeabi_memclr",  "__aeabi_memclr4",  "__aeabi_memclr8"  }
129   };
130   TargetLowering::CallLoweringInfo CLI(DAG);
131   CLI.setDebugLoc(dl)
132       .setChain(Chain)
133       .setLibCallee(
134           TLI->getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()),
135           DAG.getExternalSymbol(FunctionNames[AEABILibcall][AlignVariant],
136                                 TLI->getPointerTy(DAG.getDataLayout())),
137           std::move(Args))
138       .setDiscardResult();
139   std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
140 
141   return CallResult.second;
142 }
143 
144 static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget,
145                                        const SelectionDAG &DAG,
146                                        ConstantSDNode *ConstantSize,
147                                        Align Alignment, bool IsMemcpy) {
148   auto &F = DAG.getMachineFunction().getFunction();
149   if (!EnableMemtransferTPLoop)
150     return false;
151   if (EnableMemtransferTPLoop == TPLoop::ForceEnabled)
152     return true;
153   // Do not generate inline TP loop if optimizations is disabled,
154   // or if optimization for size (-Os or -Oz) is on.
155   if (F.hasOptNone() || F.hasOptSize())
156     return false;
157   // If cli option is unset, for memset always generate inline TP.
158   // For memcpy, check some conditions
159   if (!IsMemcpy)
160     return true;
161   if (!ConstantSize && Alignment >= Align(4))
162     return true;
163   if (ConstantSize &&
164       ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold() &&
165       ConstantSize->getZExtValue() <
166           Subtarget.getMaxMemcpyTPInlineSizeThreshold())
167     return true;
168   return false;
169 }
170 
// Lower a memcpy to one of: an MVE tail-predicated loop, an inline sequence
// of MEMCPY pseudos (later lowered to LDM/STM) plus scalar loads/stores for
// the 1-3 trailing bytes, or an SDValue() fallback that lets the caller use
// a libcall / generic expansion. Returns the resulting chain, or an empty
// SDValue when this routine declines to lower the copy.
SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
  const ARMSubtarget &Subtarget =
      DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);

  // With MVE, prefer a tail-predicated (WLSTP) loop when the heuristic allows.
  if (Subtarget.hasMVEIntegerOps() &&
      shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment, true))
    return DAG.getNode(ARMISD::MEMCPYLOOP, dl, MVT::Other, Chain, Dst, Src,
                       DAG.getZExtOrTrunc(Size, dl, MVT::i32));

  // Do repeated 4-byte loads and stores. To be improved.
  // This requires 4-byte alignment.
  if (Alignment < Align(4))
    return SDValue();
  // This requires the copy size to be a constant, preferably
  // within a subtarget-specific limit.
  if (!ConstantSize)
    return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
                                  Alignment.value(), RTLIB::MEMCPY);
  uint64_t SizeVal = ConstantSize->getZExtValue();
  if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold())
    return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
                                  Alignment.value(), RTLIB::MEMCPY);

  // Split the copy into whole 4-byte words plus up to 3 trailing bytes.
  unsigned BytesLeft = SizeVal & 3;
  unsigned NumMemOps = SizeVal >> 2;
  unsigned EmittedNumMemOps = 0;
  EVT VT = MVT::i32;
  unsigned VTSize = 4;
  unsigned i = 0;
  // Emit a maximum of 4 loads in Thumb1 since we have fewer registers
  const unsigned MaxLoadsInLDM = Subtarget.isThumb1Only() ? 4 : 6;
  SDValue TFOps[6];
  SDValue Loads[6];
  uint64_t SrcOff = 0, DstOff = 0;

  // FIXME: We should invent a VMEMCPY pseudo-instruction that lowers to
  // VLDM/VSTM and make this code emit it when appropriate. This would reduce
  // pressure on the general purpose registers. However this seems harder to map
  // onto the register allocator's view of the world.

  // The number of MEMCPY pseudo-instructions to emit. We use up to
  // MaxLoadsInLDM registers per mcopy, which will get lowered into ldm/stm
  // later on. This is a lower bound on the number of MEMCPY operations we must
  // emit.
  unsigned NumMEMCPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM;

  // Code size optimisation: do not inline memcpy if expansion results in
  // more instructions than the library call.
  if (NumMEMCPYs > 1 && Subtarget.hasMinSize()) {
    return SDValue();
  }

  // Each MEMCPY node yields the advanced Dst (value 0) and Src (value 1)
  // pointers, plus the chain (value 2); the Glue result ties nodes together.
  SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other, MVT::Glue);

  for (unsigned I = 0; I != NumMEMCPYs; ++I) {
    // Evenly distribute registers among MEMCPY operations to reduce register
    // pressure.
    unsigned NextEmittedNumMemOps = NumMemOps * (I + 1) / NumMEMCPYs;
    unsigned NumRegs = NextEmittedNumMemOps - EmittedNumMemOps;

    Dst = DAG.getNode(ARMISD::MEMCPY, dl, VTs, Chain, Dst, Src,
                      DAG.getConstant(NumRegs, dl, MVT::i32));
    // Thread the updated pointers and chain into the next MEMCPY pseudo.
    Src = Dst.getValue(1);
    Chain = Dst.getValue(2);

    DstPtrInfo = DstPtrInfo.getWithOffset(NumRegs * VTSize);
    SrcPtrInfo = SrcPtrInfo.getWithOffset(NumRegs * VTSize);

    EmittedNumMemOps = NextEmittedNumMemOps;
  }

  if (BytesLeft == 0)
    return Chain;

  // Issue loads / stores for the trailing (1 - 3) bytes.
  auto getRemainingValueType = [](unsigned BytesLeft) {
    return (BytesLeft >= 2) ? MVT::i16 : MVT::i8;
  };
  auto getRemainingSize = [](unsigned BytesLeft) {
    return (BytesLeft >= 2) ? 2 : 1;
  };

  // First pass: emit all trailing loads, collecting their output chains so
  // they can be merged into a single TokenFactor before the stores.
  unsigned BytesLeftSave = BytesLeft;
  i = 0;
  while (BytesLeft) {
    VT = getRemainingValueType(BytesLeft);
    VTSize = getRemainingSize(BytesLeft);
    Loads[i] = DAG.getLoad(VT, dl, Chain,
                           DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
                                       DAG.getConstant(SrcOff, dl, MVT::i32)),
                           SrcPtrInfo.getWithOffset(SrcOff));
    TFOps[i] = Loads[i].getValue(1);
    ++i;
    SrcOff += VTSize;
    BytesLeft -= VTSize;
  }
  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, i));

  // Second pass: store the loaded values at the matching destination offsets.
  i = 0;
  BytesLeft = BytesLeftSave;
  while (BytesLeft) {
    VT = getRemainingValueType(BytesLeft);
    VTSize = getRemainingSize(BytesLeft);
    TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
                            DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
                                        DAG.getConstant(DstOff, dl, MVT::i32)),
                            DstPtrInfo.getWithOffset(DstOff));
    ++i;
    DstOff += VTSize;
    BytesLeft -= VTSize;
  }
  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, i));
}
288 
289 SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemmove(
290     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
291     SDValue Size, Align Alignment, bool isVolatile,
292     MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
293   return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
294                                 Alignment.value(), RTLIB::MEMMOVE);
295 }
296 
297 SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset(
298     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
299     SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
300     MachinePointerInfo DstPtrInfo) const {
301 
302   const ARMSubtarget &Subtarget =
303       DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
304 
305   ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
306 
307   // Generate TP loop for llvm.memset
308   if (Subtarget.hasMVEIntegerOps() &&
309       shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment,
310                                  false)) {
311     Src = DAG.getSplatBuildVector(MVT::v16i8, dl,
312                                   DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src));
313     return DAG.getNode(ARMISD::MEMSETLOOP, dl, MVT::Other, Chain, Dst, Src,
314                        DAG.getZExtOrTrunc(Size, dl, MVT::i32));
315   }
316 
317   if (!AlwaysInline)
318     return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
319                                   Alignment.value(), RTLIB::MEMSET);
320 
321   return SDValue();
322 }
323