xref: /llvm-project/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp (revision 9ae92d70561bcc95a7f818920238e764253d9758)
1 //===-- X86SelectionDAGInfo.cpp - X86 SelectionDAG Info -------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the X86SelectionDAGInfo class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "X86SelectionDAGInfo.h"
14 #include "X86ISelLowering.h"
15 #include "X86InstrInfo.h"
16 #include "X86RegisterInfo.h"
17 #include "X86Subtarget.h"
18 #include "llvm/CodeGen/MachineFrameInfo.h"
19 #include "llvm/CodeGen/SelectionDAG.h"
20 #include "llvm/CodeGen/TargetLowering.h"
21 
22 using namespace llvm;
23 
24 #define DEBUG_TYPE "x86-selectiondag-info"
25 
// Escape hatch (hidden, off by default): when the subtarget reports FSRM
// (fast short rep mov), lower memcpy with a single byte-granular REP MOVS
// instead of the size-dependent expansion below.
static cl::opt<bool>
    UseFSRMForMemcpy("x86-use-fsrm-for-memcpy", cl::Hidden, cl::init(false),
                     cl::desc("Use fast short rep mov in memcpy lowering"));
29 
30 bool X86SelectionDAGInfo::isTargetMemoryOpcode(unsigned Opcode) const {
31   return Opcode >= X86ISD::FIRST_MEMORY_OPCODE &&
32          Opcode <= X86ISD::LAST_MEMORY_OPCODE;
33 }
34 
35 bool X86SelectionDAGInfo::isTargetStrictFPOpcode(unsigned Opcode) const {
36   return Opcode >= X86ISD::FIRST_STRICTFP_OPCODE &&
37          Opcode <= X86ISD::LAST_STRICTFP_OPCODE;
38 }
39 
40 /// Returns the best type to use with repmovs/repstos depending on alignment.
41 static MVT getOptimalRepType(const X86Subtarget &Subtarget, Align Alignment) {
42   uint64_t Align = Alignment.value();
43   assert((Align != 0) && "Align is normalized");
44   assert(isPowerOf2_64(Align) && "Align is a power of 2");
45   switch (Align) {
46   case 1:
47     return MVT::i8;
48   case 2:
49     return MVT::i16;
50   case 4:
51     return MVT::i32;
52   default:
53     return Subtarget.is64Bit() ? MVT::i64 : MVT::i32;
54   }
55 }
56 
57 bool X86SelectionDAGInfo::isBaseRegConflictPossible(
58     SelectionDAG &DAG, ArrayRef<MCPhysReg> ClobberSet) const {
59   // We cannot use TRI->hasBasePointer() until *after* we select all basic
60   // blocks.  Legalization may introduce new stack temporaries with large
61   // alignment requirements.  Fall back to generic code if there are any
62   // dynamic stack adjustments (hopefully rare) and the base pointer would
63   // conflict if we had to use it.
64   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
65   if (!MFI.hasVarSizedObjects() && !MFI.hasOpaqueSPAdjustment())
66     return false;
67 
68   const X86RegisterInfo *TRI = static_cast<const X86RegisterInfo *>(
69       DAG.getSubtarget().getRegisterInfo());
70   return llvm::is_contained(ClobberSet, TRI->getBaseRegister());
71 }
72 
73 /// Emit a single REP STOSB instruction for a particular constant size.
74 static SDValue emitRepstos(const X86Subtarget &Subtarget, SelectionDAG &DAG,
75                            const SDLoc &dl, SDValue Chain, SDValue Dst,
76                            SDValue Val, SDValue Size, MVT AVT) {
77   const bool Use64BitRegs = Subtarget.isTarget64BitLP64();
78   unsigned AX = X86::AL;
79   switch (AVT.getSizeInBits()) {
80   case 8:
81     AX = X86::AL;
82     break;
83   case 16:
84     AX = X86::AX;
85     break;
86   case 32:
87     AX = X86::EAX;
88     break;
89   default:
90     AX = X86::RAX;
91     break;
92   }
93 
94   const unsigned CX = Use64BitRegs ? X86::RCX : X86::ECX;
95   const unsigned DI = Use64BitRegs ? X86::RDI : X86::EDI;
96 
97   SDValue InGlue;
98   Chain = DAG.getCopyToReg(Chain, dl, AX, Val, InGlue);
99   InGlue = Chain.getValue(1);
100   Chain = DAG.getCopyToReg(Chain, dl, CX, Size, InGlue);
101   InGlue = Chain.getValue(1);
102   Chain = DAG.getCopyToReg(Chain, dl, DI, Dst, InGlue);
103   InGlue = Chain.getValue(1);
104 
105   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
106   SDValue Ops[] = {Chain, DAG.getValueType(AVT), InGlue};
107   return DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
108 }
109 
110 /// Emit a single REP STOSB instruction for a particular constant size.
111 static SDValue emitRepstosB(const X86Subtarget &Subtarget, SelectionDAG &DAG,
112                             const SDLoc &dl, SDValue Chain, SDValue Dst,
113                             SDValue Val, uint64_t Size) {
114   return emitRepstos(Subtarget, DAG, dl, Chain, Dst, Val,
115                      DAG.getIntPtrConstant(Size, dl), MVT::i8);
116 }
117 
118 /// Returns a REP STOS instruction, possibly with a few load/stores to implement
119 /// a constant size memory set. In some cases where we know REP MOVS is
120 /// inefficient we return an empty SDValue so the calling code can either
121 /// generate a store sequence or call the runtime memset function.
122 static SDValue emitConstantSizeRepstos(SelectionDAG &DAG,
123                                        const X86Subtarget &Subtarget,
124                                        const SDLoc &dl, SDValue Chain,
125                                        SDValue Dst, SDValue Val, uint64_t Size,
126                                        EVT SizeVT, Align Alignment,
127                                        bool isVolatile, bool AlwaysInline,
128                                        MachinePointerInfo DstPtrInfo) {
129   /// In case we optimize for size, we use repstosb even if it's less efficient
130   /// so we can save the loads/stores of the leftover.
131   if (DAG.getMachineFunction().getFunction().hasMinSize()) {
132     if (auto *ValC = dyn_cast<ConstantSDNode>(Val)) {
133       // Special case 0 because otherwise we get large literals,
134       // which causes larger encoding.
135       if ((Size & 31) == 0 && (ValC->getZExtValue() & 255) == 0) {
136         MVT BlockType = MVT::i32;
137         const uint64_t BlockBits = BlockType.getSizeInBits();
138         const uint64_t BlockBytes = BlockBits / 8;
139         const uint64_t BlockCount = Size / BlockBytes;
140 
141         Val = DAG.getConstant(0, dl, BlockType);
142         // repstosd is same size as repstosb
143         return emitRepstos(Subtarget, DAG, dl, Chain, Dst, Val,
144                            DAG.getIntPtrConstant(BlockCount, dl), BlockType);
145       }
146     }
147     return emitRepstosB(Subtarget, DAG, dl, Chain, Dst, Val, Size);
148   }
149 
150   if (Size > Subtarget.getMaxInlineSizeThreshold())
151     return SDValue();
152 
153   // If not DWORD aligned or size is more than the threshold, call the library.
154   // The libc version is likely to be faster for these cases. It can use the
155   // address value and run time information about the CPU.
156   if (Alignment < Align(4))
157     return SDValue();
158 
159   MVT BlockType = MVT::i8;
160   uint64_t BlockCount = Size;
161   uint64_t BytesLeft = 0;
162 
163   SDValue OriginalVal = Val;
164   if (auto *ValC = dyn_cast<ConstantSDNode>(Val)) {
165     BlockType = getOptimalRepType(Subtarget, Alignment);
166     uint64_t Value = ValC->getZExtValue() & 255;
167     const uint64_t BlockBits = BlockType.getSizeInBits();
168 
169     if (BlockBits >= 16)
170       Value = (Value << 8) | Value;
171 
172     if (BlockBits >= 32)
173       Value = (Value << 16) | Value;
174 
175     if (BlockBits >= 64)
176       Value = (Value << 32) | Value;
177 
178     const uint64_t BlockBytes = BlockBits / 8;
179     BlockCount = Size / BlockBytes;
180     BytesLeft = Size % BlockBytes;
181     Val = DAG.getConstant(Value, dl, BlockType);
182   }
183 
184   SDValue RepStos =
185       emitRepstos(Subtarget, DAG, dl, Chain, Dst, Val,
186                   DAG.getIntPtrConstant(BlockCount, dl), BlockType);
187   /// RepStos can process the whole length.
188   if (BytesLeft == 0)
189     return RepStos;
190 
191   // Handle the last 1 - 7 bytes.
192   SmallVector<SDValue, 4> Results;
193   Results.push_back(RepStos);
194   unsigned Offset = Size - BytesLeft;
195   EVT AddrVT = Dst.getValueType();
196 
197   Results.push_back(
198       DAG.getMemset(Chain, dl,
199                     DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
200                                 DAG.getConstant(Offset, dl, AddrVT)),
201                     OriginalVal, DAG.getConstant(BytesLeft, dl, SizeVT),
202                     Alignment, isVolatile, AlwaysInline,
203                     /* CI */ nullptr, DstPtrInfo.getWithOffset(Offset)));
204 
205   return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
206 }
207 
208 SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
209     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val,
210     SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
211     MachinePointerInfo DstPtrInfo) const {
212   // If to a segment-relative address space, use the default lowering.
213   if (DstPtrInfo.getAddrSpace() >= 256)
214     return SDValue();
215 
216   // If the base register might conflict with our physical registers, bail out.
217   const MCPhysReg ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI,
218                                   X86::ECX, X86::EAX, X86::EDI};
219   if (isBaseRegConflictPossible(DAG, ClobberSet))
220     return SDValue();
221 
222   ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
223   if (!ConstantSize)
224     return SDValue();
225 
226   const X86Subtarget &Subtarget =
227       DAG.getMachineFunction().getSubtarget<X86Subtarget>();
228   return emitConstantSizeRepstos(
229       DAG, Subtarget, dl, Chain, Dst, Val, ConstantSize->getZExtValue(),
230       Size.getValueType(), Alignment, isVolatile, AlwaysInline, DstPtrInfo);
231 }
232 
233 /// Emit a single REP MOVS{B,W,D,Q} instruction.
234 static SDValue emitRepmovs(const X86Subtarget &Subtarget, SelectionDAG &DAG,
235                            const SDLoc &dl, SDValue Chain, SDValue Dst,
236                            SDValue Src, SDValue Size, MVT AVT) {
237   const bool Use64BitRegs = Subtarget.isTarget64BitLP64();
238   const unsigned CX = Use64BitRegs ? X86::RCX : X86::ECX;
239   const unsigned DI = Use64BitRegs ? X86::RDI : X86::EDI;
240   const unsigned SI = Use64BitRegs ? X86::RSI : X86::ESI;
241 
242   SDValue InGlue;
243   Chain = DAG.getCopyToReg(Chain, dl, CX, Size, InGlue);
244   InGlue = Chain.getValue(1);
245   Chain = DAG.getCopyToReg(Chain, dl, DI, Dst, InGlue);
246   InGlue = Chain.getValue(1);
247   Chain = DAG.getCopyToReg(Chain, dl, SI, Src, InGlue);
248   InGlue = Chain.getValue(1);
249 
250   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
251   SDValue Ops[] = {Chain, DAG.getValueType(AVT), InGlue};
252   return DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops);
253 }
254 
255 /// Emit a single REP MOVSB instruction for a particular constant size.
256 static SDValue emitRepmovsB(const X86Subtarget &Subtarget, SelectionDAG &DAG,
257                             const SDLoc &dl, SDValue Chain, SDValue Dst,
258                             SDValue Src, uint64_t Size) {
259   return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src,
260                      DAG.getIntPtrConstant(Size, dl), MVT::i8);
261 }
262 
263 /// Returns a REP MOVS instruction, possibly with a few load/stores to implement
264 /// a constant size memory copy. In some cases where we know REP MOVS is
265 /// inefficient we return an empty SDValue so the calling code can either
266 /// generate a load/store sequence or call the runtime memcpy function.
267 static SDValue emitConstantSizeRepmov(
268     SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &dl,
269     SDValue Chain, SDValue Dst, SDValue Src, uint64_t Size, EVT SizeVT,
270     Align Alignment, bool isVolatile, bool AlwaysInline,
271     MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) {
272   /// In case we optimize for size, we use repmovsb even if it's less efficient
273   /// so we can save the loads/stores of the leftover.
274   if (DAG.getMachineFunction().getFunction().hasMinSize())
275     return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);
276 
277   /// TODO: Revisit next line: big copy with ERMSB on march >= haswell are very
278   /// efficient.
279   if (!AlwaysInline && Size > Subtarget.getMaxInlineSizeThreshold())
280     return SDValue();
281 
282   /// If we have enhanced repmovs we use it.
283   if (Subtarget.hasERMSB())
284     return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);
285 
286   assert(!Subtarget.hasERMSB() && "No efficient RepMovs");
287   /// We assume runtime memcpy will do a better job for unaligned copies when
288   /// ERMS is not present.
289   if (!AlwaysInline && (Alignment < Align(4)))
290     return SDValue();
291 
292   const MVT BlockType = getOptimalRepType(Subtarget, Alignment);
293   const uint64_t BlockBytes = BlockType.getSizeInBits() / 8;
294   const uint64_t BlockCount = Size / BlockBytes;
295   const uint64_t BytesLeft = Size % BlockBytes;
296   SDValue RepMovs =
297       emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src,
298                   DAG.getIntPtrConstant(BlockCount, dl), BlockType);
299 
300   /// RepMov can process the whole length.
301   if (BytesLeft == 0)
302     return RepMovs;
303 
304   assert(BytesLeft && "We have leftover at this point");
305 
306   // Handle the last 1 - 7 bytes.
307   SmallVector<SDValue, 4> Results;
308   Results.push_back(RepMovs);
309   unsigned Offset = Size - BytesLeft;
310   EVT DstVT = Dst.getValueType();
311   EVT SrcVT = Src.getValueType();
312   Results.push_back(DAG.getMemcpy(
313       Chain, dl,
314       DAG.getNode(ISD::ADD, dl, DstVT, Dst, DAG.getConstant(Offset, dl, DstVT)),
315       DAG.getNode(ISD::ADD, dl, SrcVT, Src, DAG.getConstant(Offset, dl, SrcVT)),
316       DAG.getConstant(BytesLeft, dl, SizeVT), Alignment, isVolatile,
317       /*AlwaysInline*/ true, /*CI=*/nullptr, std::nullopt,
318       DstPtrInfo.getWithOffset(Offset), SrcPtrInfo.getWithOffset(Offset)));
319   return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
320 }
321 
322 SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
323     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
324     SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
325     MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
326   // If to a segment-relative address space, use the default lowering.
327   if (DstPtrInfo.getAddrSpace() >= 256 || SrcPtrInfo.getAddrSpace() >= 256)
328     return SDValue();
329 
330   // If the base registers conflict with our physical registers, use the default
331   // lowering.
332   const MCPhysReg ClobberSet[] = {X86::RCX, X86::RSI, X86::RDI,
333                                   X86::ECX, X86::ESI, X86::EDI};
334   if (isBaseRegConflictPossible(DAG, ClobberSet))
335     return SDValue();
336 
337   const X86Subtarget &Subtarget =
338       DAG.getMachineFunction().getSubtarget<X86Subtarget>();
339 
340   // If enabled and available, use fast short rep mov.
341   if (UseFSRMForMemcpy && Subtarget.hasFSRM())
342     return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src, Size, MVT::i8);
343 
344   /// Handle constant sizes
345   if (ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size))
346     return emitConstantSizeRepmov(DAG, Subtarget, dl, Chain, Dst, Src,
347                                   ConstantSize->getZExtValue(),
348                                   Size.getValueType(), Alignment, isVolatile,
349                                   AlwaysInline, DstPtrInfo, SrcPtrInfo);
350 
351   return SDValue();
352 }
353