xref: /llvm-project/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp (revision dfe43bd1ca46c59399b7cbbf81b09256232e27f9)
1 //===-- X86SelectionDAGInfo.cpp - X86 SelectionDAG Info -------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the X86SelectionDAGInfo class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "X86SelectionDAGInfo.h"
14 #include "X86ISelLowering.h"
15 #include "X86InstrInfo.h"
16 #include "X86RegisterInfo.h"
17 #include "X86Subtarget.h"
18 #include "llvm/CodeGen/MachineFrameInfo.h"
19 #include "llvm/CodeGen/SelectionDAG.h"
20 #include "llvm/CodeGen/TargetLowering.h"
21 
22 using namespace llvm;
23 
24 #define DEBUG_TYPE "x86-selectiondag-info"
25 
// Hidden escape hatch (off by default): when the subtarget reports FSRM (fast
// short REP MOV), lower every memcpy — constant-sized or not — to a single
// REP MOVSB instead of the constant-size block lowering below.
static cl::opt<bool>
    UseFSRMForMemcpy("x86-use-fsrm-for-memcpy", cl::Hidden, cl::init(false),
                     cl::desc("Use fast short rep mov in memcpy lowering"));
29 
30 /// Returns the best type to use with repmovs/repstos depending on alignment.
31 static MVT getOptimalRepType(const X86Subtarget &Subtarget, Align Alignment) {
32   uint64_t Align = Alignment.value();
33   assert((Align != 0) && "Align is normalized");
34   assert(isPowerOf2_64(Align) && "Align is a power of 2");
35   switch (Align) {
36   case 1:
37     return MVT::i8;
38   case 2:
39     return MVT::i16;
40   case 4:
41     return MVT::i32;
42   default:
43     return Subtarget.is64Bit() ? MVT::i64 : MVT::i32;
44   }
45 }
46 
47 bool X86SelectionDAGInfo::isBaseRegConflictPossible(
48     SelectionDAG &DAG, ArrayRef<MCPhysReg> ClobberSet) const {
49   // We cannot use TRI->hasBasePointer() until *after* we select all basic
50   // blocks.  Legalization may introduce new stack temporaries with large
51   // alignment requirements.  Fall back to generic code if there are any
52   // dynamic stack adjustments (hopefully rare) and the base pointer would
53   // conflict if we had to use it.
54   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
55   if (!MFI.hasVarSizedObjects() && !MFI.hasOpaqueSPAdjustment())
56     return false;
57 
58   const X86RegisterInfo *TRI = static_cast<const X86RegisterInfo *>(
59       DAG.getSubtarget().getRegisterInfo());
60   return llvm::is_contained(ClobberSet, TRI->getBaseRegister());
61 }
62 
63 /// Emit a single REP STOSB instruction for a particular constant size.
64 static SDValue emitRepstos(const X86Subtarget &Subtarget, SelectionDAG &DAG,
65                            const SDLoc &dl, SDValue Chain, SDValue Dst,
66                            SDValue Val, SDValue Size, MVT AVT) {
67   const bool Use64BitRegs = Subtarget.isTarget64BitLP64();
68   unsigned AX = X86::AL;
69   switch (AVT.getSizeInBits()) {
70   case 8:
71     AX = X86::AL;
72     break;
73   case 16:
74     AX = X86::AX;
75     break;
76   case 32:
77     AX = X86::EAX;
78     break;
79   default:
80     AX = X86::RAX;
81     break;
82   }
83 
84   const unsigned CX = Use64BitRegs ? X86::RCX : X86::ECX;
85   const unsigned DI = Use64BitRegs ? X86::RDI : X86::EDI;
86 
87   SDValue InGlue;
88   Chain = DAG.getCopyToReg(Chain, dl, AX, Val, InGlue);
89   InGlue = Chain.getValue(1);
90   Chain = DAG.getCopyToReg(Chain, dl, CX, Size, InGlue);
91   InGlue = Chain.getValue(1);
92   Chain = DAG.getCopyToReg(Chain, dl, DI, Dst, InGlue);
93   InGlue = Chain.getValue(1);
94 
95   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
96   SDValue Ops[] = {Chain, DAG.getValueType(AVT), InGlue};
97   return DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
98 }
99 
100 /// Emit a single REP STOSB instruction for a particular constant size.
101 static SDValue emitRepstosB(const X86Subtarget &Subtarget, SelectionDAG &DAG,
102                             const SDLoc &dl, SDValue Chain, SDValue Dst,
103                             SDValue Val, uint64_t Size) {
104   return emitRepstos(Subtarget, DAG, dl, Chain, Dst, Val,
105                      DAG.getIntPtrConstant(Size, dl), MVT::i8);
106 }
107 
/// Returns a REP STOS instruction, possibly with a few load/stores to implement
/// a constant size memory set. In some cases where we know REP STOS is
/// inefficient we return an empty SDValue so the calling code can either
/// generate a store sequence or call the runtime memset function.
static SDValue emitConstantSizeRepstos(SelectionDAG &DAG,
                                       const X86Subtarget &Subtarget,
                                       const SDLoc &dl, SDValue Chain,
                                       SDValue Dst, SDValue Val, uint64_t Size,
                                       EVT SizeVT, Align Alignment,
                                       bool isVolatile, bool AlwaysInline,
                                       MachinePointerInfo DstPtrInfo) {
  // In case we optimize for size, we use repstosb even if it's less efficient
  // so we can save the loads/stores of the leftover.
  if (DAG.getMachineFunction().getFunction().hasMinSize()) {
    if (auto *ValC = dyn_cast<ConstantSDNode>(Val)) {
      // Special case 0 because otherwise we get large literals,
      // which causes larger encoding.
      if ((Size & 31) == 0 && (ValC->getZExtValue() & 255) == 0) {
        MVT BlockType = MVT::i32;
        const uint64_t BlockBits = BlockType.getSizeInBits();
        const uint64_t BlockBytes = BlockBits / 8;
        const uint64_t BlockCount = Size / BlockBytes;

        Val = DAG.getConstant(0, dl, BlockType);
        // repstosd is same size as repstosb
        return emitRepstos(Subtarget, DAG, dl, Chain, Dst, Val,
                           DAG.getIntPtrConstant(BlockCount, dl), BlockType);
      }
    }
    return emitRepstosB(Subtarget, DAG, dl, Chain, Dst, Val, Size);
  }

  // Beyond the inline threshold the libcall is assumed to win.
  if (Size > Subtarget.getMaxInlineSizeThreshold())
    return SDValue();

  // If not DWORD aligned or size is more than the threshold, call the library.
  // The libc version is likely to be faster for these cases. It can use the
  // address value and run time information about the CPU.
  if (Alignment < Align(4))
    return SDValue();

  // Defaults for a non-constant fill value: byte-wide REP STOSB over the whole
  // length, with no leftover tail.
  MVT BlockType = MVT::i8;
  uint64_t BlockCount = Size;
  uint64_t BytesLeft = 0;

  SDValue OriginalVal = Val;
  if (auto *ValC = dyn_cast<ConstantSDNode>(Val)) {
    // A constant fill byte can be splatted into a wider element, letting REP
    // STOS move in larger chunks as the alignment allows.
    BlockType = getOptimalRepType(Subtarget, Alignment);
    uint64_t Value = ValC->getZExtValue() & 255;
    const uint64_t BlockBits = BlockType.getSizeInBits();

    // Replicate the byte across the chosen element width (8 -> 16 -> 32 -> 64).
    if (BlockBits >= 16)
      Value = (Value << 8) | Value;

    if (BlockBits >= 32)
      Value = (Value << 16) | Value;

    if (BlockBits >= 64)
      Value = (Value << 32) | Value;

    const uint64_t BlockBytes = BlockBits / 8;
    BlockCount = Size / BlockBytes;
    BytesLeft = Size % BlockBytes;
    Val = DAG.getConstant(Value, dl, BlockType);
  }

  SDValue RepStos =
      emitRepstos(Subtarget, DAG, dl, Chain, Dst, Val,
                  DAG.getIntPtrConstant(BlockCount, dl), BlockType);
  // RepStos can process the whole length.
  if (BytesLeft == 0)
    return RepStos;

  // Handle the last 1 - 7 bytes.
  SmallVector<SDValue, 4> Results;
  Results.push_back(RepStos);
  unsigned Offset = Size - BytesLeft;
  EVT AddrVT = Dst.getValueType();

  // Set the leftover tail with a generic memset starting just past the region
  // REP STOS covered; note it reuses the original (un-splatted) value.
  Results.push_back(
      DAG.getMemset(Chain, dl,
                    DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
                                DAG.getConstant(Offset, dl, AddrVT)),
                    OriginalVal, DAG.getConstant(BytesLeft, dl, SizeVT),
                    Alignment, isVolatile, AlwaysInline,
                    /* CI */ nullptr, DstPtrInfo.getWithOffset(Offset)));

  // Order both stores on the chain.
  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
}
197 
198 SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
199     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val,
200     SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
201     MachinePointerInfo DstPtrInfo) const {
202   // If to a segment-relative address space, use the default lowering.
203   if (DstPtrInfo.getAddrSpace() >= 256)
204     return SDValue();
205 
206   // If the base register might conflict with our physical registers, bail out.
207   const MCPhysReg ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI,
208                                   X86::ECX, X86::EAX, X86::EDI};
209   if (isBaseRegConflictPossible(DAG, ClobberSet))
210     return SDValue();
211 
212   ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
213   if (!ConstantSize)
214     return SDValue();
215 
216   const X86Subtarget &Subtarget =
217       DAG.getMachineFunction().getSubtarget<X86Subtarget>();
218   return emitConstantSizeRepstos(
219       DAG, Subtarget, dl, Chain, Dst, Val, ConstantSize->getZExtValue(),
220       Size.getValueType(), Alignment, isVolatile, AlwaysInline, DstPtrInfo);
221 }
222 
223 /// Emit a single REP MOVS{B,W,D,Q} instruction.
224 static SDValue emitRepmovs(const X86Subtarget &Subtarget, SelectionDAG &DAG,
225                            const SDLoc &dl, SDValue Chain, SDValue Dst,
226                            SDValue Src, SDValue Size, MVT AVT) {
227   const bool Use64BitRegs = Subtarget.isTarget64BitLP64();
228   const unsigned CX = Use64BitRegs ? X86::RCX : X86::ECX;
229   const unsigned DI = Use64BitRegs ? X86::RDI : X86::EDI;
230   const unsigned SI = Use64BitRegs ? X86::RSI : X86::ESI;
231 
232   SDValue InGlue;
233   Chain = DAG.getCopyToReg(Chain, dl, CX, Size, InGlue);
234   InGlue = Chain.getValue(1);
235   Chain = DAG.getCopyToReg(Chain, dl, DI, Dst, InGlue);
236   InGlue = Chain.getValue(1);
237   Chain = DAG.getCopyToReg(Chain, dl, SI, Src, InGlue);
238   InGlue = Chain.getValue(1);
239 
240   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
241   SDValue Ops[] = {Chain, DAG.getValueType(AVT), InGlue};
242   return DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops);
243 }
244 
245 /// Emit a single REP MOVSB instruction for a particular constant size.
246 static SDValue emitRepmovsB(const X86Subtarget &Subtarget, SelectionDAG &DAG,
247                             const SDLoc &dl, SDValue Chain, SDValue Dst,
248                             SDValue Src, uint64_t Size) {
249   return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src,
250                      DAG.getIntPtrConstant(Size, dl), MVT::i8);
251 }
252 
/// Returns a REP MOVS instruction, possibly with a few load/stores to implement
/// a constant size memory copy. In some cases where we know REP MOVS is
/// inefficient we return an empty SDValue so the calling code can either
/// generate a load/store sequence or call the runtime memcpy function.
static SDValue emitConstantSizeRepmov(
    SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &dl,
    SDValue Chain, SDValue Dst, SDValue Src, uint64_t Size, EVT SizeVT,
    Align Alignment, bool isVolatile, bool AlwaysInline,
    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) {
  // In case we optimize for size, we use repmovsb even if it's less efficient
  // so we can save the loads/stores of the leftover.
  if (DAG.getMachineFunction().getFunction().hasMinSize())
    return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);

  // TODO: Revisit next line: big copy with ERMSB on march >= haswell are very
  // efficient.
  if (!AlwaysInline && Size > Subtarget.getMaxInlineSizeThreshold())
    return SDValue();

  // If we have enhanced repmovs we use it.
  if (Subtarget.hasERMSB())
    return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);

  assert(!Subtarget.hasERMSB() && "No efficient RepMovs");
  // We assume runtime memcpy will do a better job for unaligned copies when
  // ERMS is not present.
  if (!AlwaysInline && (Alignment < Align(4)))
    return SDValue();

  // Copy in the widest element the alignment allows; whatever does not divide
  // evenly is handled below as a 1-7 byte leftover tail.
  const MVT BlockType = getOptimalRepType(Subtarget, Alignment);
  const uint64_t BlockBytes = BlockType.getSizeInBits() / 8;
  const uint64_t BlockCount = Size / BlockBytes;
  const uint64_t BytesLeft = Size % BlockBytes;
  SDValue RepMovs =
      emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src,
                  DAG.getIntPtrConstant(BlockCount, dl), BlockType);

  // RepMov can process the whole length.
  if (BytesLeft == 0)
    return RepMovs;

  assert(BytesLeft && "We have leftover at this point");

  // Handle the last 1 - 7 bytes.
  SmallVector<SDValue, 4> Results;
  Results.push_back(RepMovs);
  unsigned Offset = Size - BytesLeft;
  EVT DstVT = Dst.getValueType();
  EVT SrcVT = Src.getValueType();
  // Emit an always-inline memcpy for the tail, starting just past the region
  // REP MOVS covered; the token factor below orders it on the chain.
  Results.push_back(DAG.getMemcpy(
      Chain, dl,
      DAG.getNode(ISD::ADD, dl, DstVT, Dst, DAG.getConstant(Offset, dl, DstVT)),
      DAG.getNode(ISD::ADD, dl, SrcVT, Src, DAG.getConstant(Offset, dl, SrcVT)),
      DAG.getConstant(BytesLeft, dl, SizeVT), Alignment, isVolatile,
      /*AlwaysInline*/ true, /*CI=*/nullptr, std::nullopt,
      DstPtrInfo.getWithOffset(Offset), SrcPtrInfo.getWithOffset(Offset)));
  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
}
311 
312 SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
313     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
314     SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
315     MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
316   // If to a segment-relative address space, use the default lowering.
317   if (DstPtrInfo.getAddrSpace() >= 256 || SrcPtrInfo.getAddrSpace() >= 256)
318     return SDValue();
319 
320   // If the base registers conflict with our physical registers, use the default
321   // lowering.
322   const MCPhysReg ClobberSet[] = {X86::RCX, X86::RSI, X86::RDI,
323                                   X86::ECX, X86::ESI, X86::EDI};
324   if (isBaseRegConflictPossible(DAG, ClobberSet))
325     return SDValue();
326 
327   const X86Subtarget &Subtarget =
328       DAG.getMachineFunction().getSubtarget<X86Subtarget>();
329 
330   // If enabled and available, use fast short rep mov.
331   if (UseFSRMForMemcpy && Subtarget.hasFSRM())
332     return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src, Size, MVT::i8);
333 
334   /// Handle constant sizes
335   if (ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size))
336     return emitConstantSizeRepmov(DAG, Subtarget, dl, Chain, Dst, Src,
337                                   ConstantSize->getZExtValue(),
338                                   Size.getValueType(), Alignment, isVolatile,
339                                   AlwaysInline, DstPtrInfo, SrcPtrInfo);
340 
341   return SDValue();
342 }
343