//===-- X86SelectionDAGInfo.cpp - X86 SelectionDAG Info -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the X86SelectionDAGInfo class.
//
//===----------------------------------------------------------------------===//

#include "X86SelectionDAGInfo.h"
#include "X86ISelLowering.h"
#include "X86InstrInfo.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/Support/CommandLine.h"

using namespace llvm;

#define DEBUG_TYPE "x86-selectiondag-info"

static cl::opt<bool>
    UseFSRMForMemcpy("x86-use-fsrm-for-memcpy", cl::Hidden, cl::init(false),
                     cl::desc("Use fast short rep mov in memcpy lowering"));

/// Returns the best type to use with repmovs/repstos depending on alignment.
static MVT getOptimalRepType(const X86Subtarget &Subtarget, Align Alignment) {
  uint64_t Align = Alignment.value();
  assert((Align != 0) && "Align is normalized");
  assert(isPowerOf2_64(Align) && "Align is a power of 2");
  switch (Align) {
  case 1:
    return MVT::i8;
  case 2:
    return MVT::i16;
  case 4:
    return MVT::i32;
  default:
    return Subtarget.is64Bit() ? MVT::i64 : MVT::i32;
  }
}

bool X86SelectionDAGInfo::isBaseRegConflictPossible(
    SelectionDAG &DAG, ArrayRef<MCPhysReg> ClobberSet) const {
  // We cannot use TRI->hasBasePointer() until *after* we select all basic
  // blocks. Legalization may introduce new stack temporaries with large
  // alignment requirements. Fall back to generic code if there are any
  // dynamic stack adjustments (hopefully rare) and the base pointer would
  // conflict if we had to use it.
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  if (!MFI.hasVarSizedObjects() && !MFI.hasOpaqueSPAdjustment())
    return false;

  const X86RegisterInfo *TRI = static_cast<const X86RegisterInfo *>(
      DAG.getSubtarget().getRegisterInfo());
  return llvm::is_contained(ClobberSet, TRI->getBaseRegister());
}

/// Emit a single REP STOS{B,W,D,Q} instruction.
static SDValue emitRepstos(const X86Subtarget &Subtarget, SelectionDAG &DAG,
                           const SDLoc &dl, SDValue Chain, SDValue Dst,
                           SDValue Val, SDValue Size, MVT AVT) {
  const bool Use64BitRegs = Subtarget.isTarget64BitLP64();
  unsigned AX = X86::AL;
  switch (AVT.getSizeInBits()) {
  case 8:
    AX = X86::AL;
    break;
  case 16:
    AX = X86::AX;
    break;
  case 32:
    AX = X86::EAX;
    break;
  default:
    AX = X86::RAX;
    break;
  }

  const unsigned CX = Use64BitRegs ? X86::RCX : X86::ECX;
  const unsigned DI = Use64BitRegs ? X86::RDI : X86::EDI;

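  // REP STOS takes its operands in fixed registers: the fill value in
  // AL/AX/EAX/RAX, the element count in (E/R)CX, and the destination in
  // (E/R)DI. Glue the CopyToReg nodes to each other and to the REP_STOS node
  // so the scheduler cannot interleave code that clobbers these registers.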
  SDValue InGlue;
  Chain = DAG.getCopyToReg(Chain, dl, AX, Val, InGlue);
  InGlue = Chain.getValue(1);
  Chain = DAG.getCopyToReg(Chain, dl, CX, Size, InGlue);
  InGlue = Chain.getValue(1);
  Chain = DAG.getCopyToReg(Chain, dl, DI, Dst, InGlue);
  InGlue = Chain.getValue(1);

  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue Ops[] = {Chain, DAG.getValueType(AVT), InGlue};
  return DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
}

/// Emit a single REP STOSB instruction for a particular constant size.
static SDValue emitRepstosB(const X86Subtarget &Subtarget, SelectionDAG &DAG,
                            const SDLoc &dl, SDValue Chain, SDValue Dst,
                            SDValue Val, uint64_t Size) {
  return emitRepstos(Subtarget, DAG, dl, Chain, Dst, Val,
                     DAG.getIntPtrConstant(Size, dl), MVT::i8);
}

/// Returns a REP STOS instruction, possibly with a few load/stores to implement
/// a constant size memory set. In some cases where we know REP STOS is
/// inefficient we return an empty SDValue so the calling code can either
/// generate a store sequence or call the runtime memset function.
static SDValue emitConstantSizeRepstos(SelectionDAG &DAG,
                                       const X86Subtarget &Subtarget,
                                       const SDLoc &dl, SDValue Chain,
                                       SDValue Dst, SDValue Val, uint64_t Size,
                                       EVT SizeVT, Align Alignment,
                                       bool isVolatile, bool AlwaysInline,
                                       MachinePointerInfo DstPtrInfo) {
  // When optimizing for size, use repstosb even if it's less efficient, so we
  // can save the loads/stores needed for the leftover bytes.
  if (DAG.getMachineFunction().getFunction().hasMinSize()) {
    if (auto *ValC = dyn_cast<ConstantSDNode>(Val)) {
      // Special case 0 because otherwise we get large literals,
      // which causes larger encoding.
      if ((Size & 31) == 0 && (ValC->getZExtValue() & 255) == 0) {
        MVT BlockType = MVT::i32;
        const uint64_t BlockBits = BlockType.getSizeInBits();
        const uint64_t BlockBytes = BlockBits / 8;
        const uint64_t BlockCount = Size / BlockBytes;

        Val = DAG.getConstant(0, dl, BlockType);
        // repstosd has the same encoding size as repstosb.
        return emitRepstos(Subtarget, DAG, dl, Chain, Dst, Val,
                           DAG.getIntPtrConstant(BlockCount, dl), BlockType);
      }
    }
    return emitRepstosB(Subtarget, DAG, dl, Chain, Dst, Val, Size);
  }

  // If the size exceeds the inline threshold or the destination is not DWORD
  // aligned, fall back to the library. The libc version is likely to be faster
  // for these cases: it can use the address value and run-time information
  // about the CPU.
  if (Size > Subtarget.getMaxInlineSizeThreshold())
    return SDValue();

  if (Alignment < Align(4))
    return SDValue();

  MVT BlockType = MVT::i8;
  uint64_t BlockCount = Size;
  uint64_t BytesLeft = 0;

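  // If the fill value is a known constant, pick the widest rep element the
  // alignment allows and splat the byte across it, e.g. 0xAB becomes
  // 0xABABABAB when storing i32 blocks; any remainder smaller than one block
  // is written separately below.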
  SDValue OriginalVal = Val;
  if (auto *ValC = dyn_cast<ConstantSDNode>(Val)) {
    BlockType = getOptimalRepType(Subtarget, Alignment);
    uint64_t Value = ValC->getZExtValue() & 255;
    const uint64_t BlockBits = BlockType.getSizeInBits();

    if (BlockBits >= 16)
      Value = (Value << 8) | Value;

    if (BlockBits >= 32)
      Value = (Value << 16) | Value;

    if (BlockBits >= 64)
      Value = (Value << 32) | Value;

    const uint64_t BlockBytes = BlockBits / 8;
    BlockCount = Size / BlockBytes;
    BytesLeft = Size % BlockBytes;
    Val = DAG.getConstant(Value, dl, BlockType);
  }

  SDValue RepStos =
      emitRepstos(Subtarget, DAG, dl, Chain, Dst, Val,
                  DAG.getIntPtrConstant(BlockCount, dl), BlockType);
  // RepStos can process the whole length.
  if (BytesLeft == 0)
    return RepStos;

  // Handle the last 1 - 7 bytes.
  SmallVector<SDValue, 4> Results;
  Results.push_back(RepStos);
  unsigned Offset = Size - BytesLeft;
  EVT AddrVT = Dst.getValueType();

  Results.push_back(
      DAG.getMemset(Chain, dl,
                    DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
                                DAG.getConstant(Offset, dl, AddrVT)),
                    OriginalVal, DAG.getConstant(BytesLeft, dl, SizeVT),
                    Alignment, isVolatile, AlwaysInline,
                    /* CI */ nullptr, DstPtrInfo.getWithOffset(Offset)));

  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
}

SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val,
    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
    MachinePointerInfo DstPtrInfo) const {
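  // X86 maps address spaces 256, 257 and 258 to GS-, FS- and SS-relative
  // accesses. REP STOS implicitly writes through ES:(E/R)DI and cannot encode
  // a segment override on the destination.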
  // If to a segment-relative address space, use the default lowering.
  if (DstPtrInfo.getAddrSpace() >= 256)
    return SDValue();

  // If the base register might conflict with our physical registers, bail out.
  const MCPhysReg ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI,
                                  X86::ECX, X86::EAX, X86::EDI};
  if (isBaseRegConflictPossible(DAG, ClobberSet))
    return SDValue();

  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
  if (!ConstantSize)
    return SDValue();

  const X86Subtarget &Subtarget =
      DAG.getMachineFunction().getSubtarget<X86Subtarget>();
  return emitConstantSizeRepstos(
      DAG, Subtarget, dl, Chain, Dst, Val, ConstantSize->getZExtValue(),
      Size.getValueType(), Alignment, isVolatile, AlwaysInline, DstPtrInfo);
}

/// Emit a single REP MOVS{B,W,D,Q} instruction.
static SDValue emitRepmovs(const X86Subtarget &Subtarget, SelectionDAG &DAG,
                           const SDLoc &dl, SDValue Chain, SDValue Dst,
                           SDValue Src, SDValue Size, MVT AVT) {
  const bool Use64BitRegs = Subtarget.isTarget64BitLP64();
  const unsigned CX = Use64BitRegs ? X86::RCX : X86::ECX;
  const unsigned DI = Use64BitRegs ? X86::RDI : X86::EDI;
  const unsigned SI = Use64BitRegs ? X86::RSI : X86::ESI;

  SDValue InGlue;
  Chain = DAG.getCopyToReg(Chain, dl, CX, Size, InGlue);
  InGlue = Chain.getValue(1);
  Chain = DAG.getCopyToReg(Chain, dl, DI, Dst, InGlue);
  InGlue = Chain.getValue(1);
  Chain = DAG.getCopyToReg(Chain, dl, SI, Src, InGlue);
  InGlue = Chain.getValue(1);

  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue Ops[] = {Chain, DAG.getValueType(AVT), InGlue};
  return DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops);
}

/// Emit a single REP MOVSB instruction for a particular constant size.
static SDValue emitRepmovsB(const X86Subtarget &Subtarget, SelectionDAG &DAG,
                            const SDLoc &dl, SDValue Chain, SDValue Dst,
                            SDValue Src, uint64_t Size) {
  return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src,
                     DAG.getIntPtrConstant(Size, dl), MVT::i8);
}

/// Returns a REP MOVS instruction, possibly with a few load/stores to implement
/// a constant size memory copy. In some cases where we know REP MOVS is
/// inefficient we return an empty SDValue so the calling code can either
/// generate a load/store sequence or call the runtime memcpy function.
static SDValue emitConstantSizeRepmov(
    SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &dl,
    SDValue Chain, SDValue Dst, SDValue Src, uint64_t Size, EVT SizeVT,
    Align Alignment, bool isVolatile, bool AlwaysInline,
    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) {
  // When optimizing for size, use repmovsb even if it's less efficient, so we
  // can save the loads/stores needed for the leftover bytes.
  if (DAG.getMachineFunction().getFunction().hasMinSize())
    return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);

  // TODO: Revisit the next check: big copies with ERMSB on march >= haswell
  // are very efficient.
  if (!AlwaysInline && Size > Subtarget.getMaxInlineSizeThreshold())
    return SDValue();

  // With ERMSB (Enhanced REP MOVSB/STOSB), REP MOVSB is typically as fast as
  // the widest REP MOVS variant, so use byte granularity and avoid any
  // leftover handling.
  if (Subtarget.hasERMSB())
    return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);

  // Without ERMSB, we assume the runtime memcpy will do a better job for
  // unaligned copies.
  if (!AlwaysInline && (Alignment < Align(4)))
    return SDValue();

  const MVT BlockType = getOptimalRepType(Subtarget, Alignment);
  const uint64_t BlockBytes = BlockType.getSizeInBits() / 8;
  const uint64_t BlockCount = Size / BlockBytes;
  const uint64_t BytesLeft = Size % BlockBytes;
  SDValue RepMovs =
      emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src,
                  DAG.getIntPtrConstant(BlockCount, dl), BlockType);

  // RepMovs can process the whole length.
  if (BytesLeft == 0)
    return RepMovs;

  // Handle the last 1 - 7 bytes.
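  // The remainder is smaller than one rep element (at most 7 bytes), so the
  // recursive getMemcpy below, with AlwaysInline set, expands to a few scalar
  // loads and stores rather than a libcall.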
  SmallVector<SDValue, 4> Results;
  Results.push_back(RepMovs);
  unsigned Offset = Size - BytesLeft;
  EVT DstVT = Dst.getValueType();
  EVT SrcVT = Src.getValueType();
  Results.push_back(DAG.getMemcpy(
      Chain, dl,
      DAG.getNode(ISD::ADD, dl, DstVT, Dst, DAG.getConstant(Offset, dl, DstVT)),
      DAG.getNode(ISD::ADD, dl, SrcVT, Src, DAG.getConstant(Offset, dl, SrcVT)),
      DAG.getConstant(BytesLeft, dl, SizeVT), Alignment, isVolatile,
      /*AlwaysInline*/ true, /*CI=*/nullptr, std::nullopt,
      DstPtrInfo.getWithOffset(Offset), SrcPtrInfo.getWithOffset(Offset)));
  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
}

SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
  // If to a segment-relative address space, use the default lowering.
  if (DstPtrInfo.getAddrSpace() >= 256 || SrcPtrInfo.getAddrSpace() >= 256)
    return SDValue();

  // If the base registers conflict with our physical registers, use the
  // default lowering.
  const MCPhysReg ClobberSet[] = {X86::RCX, X86::RSI, X86::RDI,
                                  X86::ECX, X86::ESI, X86::EDI};
  if (isBaseRegConflictPossible(DAG, ClobberSet))
    return SDValue();

  const X86Subtarget &Subtarget =
      DAG.getMachineFunction().getSubtarget<X86Subtarget>();

  // If enabled and available, use fast short rep mov.
  if (UseFSRMForMemcpy && Subtarget.hasFSRM())
    return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src, Size, MVT::i8);

  // Handle constant sizes.
  if (ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size))
    return emitConstantSizeRepmov(DAG, Subtarget, dl, Chain, Dst, Src,
                                  ConstantSize->getZExtValue(),
                                  Size.getValueType(), Alignment, isVolatile,
                                  AlwaysInline, DstPtrInfo, SrcPtrInfo);

  return SDValue();
}