//===- llvm/lib/Target/X86/X86ISelCallLowering.cpp - Call lowering --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file implements the lowering of LLVM calls to DAG nodes.
//
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86CallingConv.h"
#include "X86FrameLowering.h"
#include "X86ISelLowering.h"
#include "X86InstrBuilder.h"
#include "X86MachineFunctionInfo.h"
#include "X86TargetMachine.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ObjCARCUtil.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"

#define DEBUG_TYPE "x86-isel"

using namespace llvm;

STATISTIC(NumTailCalls, "Number of tail calls");

/// Call this when the user attempts to do something unsupported, like
/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
/// report_fatal_error, so calling code should attempt to recover without
/// crashing.
static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
                             const char *Msg) {
  MachineFunction &MF = DAG.getMachineFunction();
  DAG.getContext()->diagnose(
      DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
}

/// Returns true if a CC can dynamically exclude a register from the list of
/// callee-saved-registers (TargetRegisterInfo::getCalleeSavedRegs()) based on
/// the return registers.
static bool shouldDisableRetRegFromCSR(CallingConv::ID CC) {
  switch (CC) {
  default:
    return false;
  case CallingConv::X86_RegCall:
  case CallingConv::PreserveMost:
  case CallingConv::PreserveAll:
    return true;
  }
}

/// Returns true if a CC can dynamically exclude a register from the list of
/// callee-saved-registers (TargetRegisterInfo::getCalleeSavedRegs()) based on
/// the parameters.
static bool shouldDisableArgRegFromCSR(CallingConv::ID CC) {
  return CC == CallingConv::X86_RegCall;
}

static std::pair<MVT, unsigned>
handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
                                 const X86Subtarget &Subtarget) {
  // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
  // convention is one that uses k registers.
  if (NumElts == 2)
    return {MVT::v2i64, 1};
  if (NumElts == 4)
    return {MVT::v4i32, 1};
  if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
      CC != CallingConv::Intel_OCL_BI)
    return {MVT::v8i16, 1};
  if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
      CC != CallingConv::Intel_OCL_BI)
    return {MVT::v16i8, 1};
  // v32i1 passes in ymm unless we have BWI and the calling convention is
  // regcall.
  if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
    return {MVT::v32i8, 1};
  // Split v64i1 vectors if we don't have v64i8 available.
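  // For example, with AVX512BW but a preferred vector width below 512 bits,
  // a v64i1 mask is passed as two v32i8 halves (the 2-register case below).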
  if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
    if (Subtarget.useAVX512Regs())
      return {MVT::v64i8, 1};
    return {MVT::v32i8, 2};
  }

  // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
  if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
      NumElts > 64)
    return {MVT::i8, NumElts};

  return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
}

MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
                                                     CallingConv::ID CC,
                                                     EVT VT) const {
  if (VT.isVector()) {
    if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
      unsigned NumElts = VT.getVectorNumElements();

      MVT RegisterVT;
      unsigned NumRegisters;
      std::tie(RegisterVT, NumRegisters) =
          handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
      if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
        return RegisterVT;
    }

    if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
      return MVT::v8f16;
  }

  // We will use more GPRs for f64 and f80 on 32 bits when x87 is disabled.
  if ((VT == MVT::f64 || VT == MVT::f80) && !Subtarget.is64Bit() &&
      !Subtarget.hasX87())
    return MVT::i32;

  if (isTypeLegal(MVT::f16)) {
    if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
      return getRegisterTypeForCallingConv(
          Context, CC, VT.changeVectorElementType(MVT::f16));

    if (VT == MVT::bf16)
      return MVT::f16;
  }

  return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}

unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
                                                          CallingConv::ID CC,
                                                          EVT VT) const {
  if (VT.isVector()) {
    if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
      unsigned NumElts = VT.getVectorNumElements();

      MVT RegisterVT;
      unsigned NumRegisters;
      std::tie(RegisterVT, NumRegisters) =
          handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
      if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
        return NumRegisters;
    }

    if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
      return 1;
  }

  // We have to split f64 to 2 registers and f80 to 3 registers on 32 bits if
  // x87 is disabled.
  if (!Subtarget.is64Bit() && !Subtarget.hasX87()) {
    if (VT == MVT::f64)
      return 2;
    if (VT == MVT::f80)
      return 3;
  }

  if (VT.isVector() && VT.getVectorElementType() == MVT::bf16 &&
      isTypeLegal(MVT::f16))
    return getNumRegistersForCallingConv(Context, CC,
                                         VT.changeVectorElementType(MVT::f16));

  return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}

unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
    LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
    unsigned &NumIntermediates, MVT &RegisterVT) const {
  // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
  if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
      Subtarget.hasAVX512() &&
      (!isPowerOf2_32(VT.getVectorNumElements()) ||
       (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
       VT.getVectorNumElements() > 64)) {
    RegisterVT = MVT::i8;
    IntermediateVT = MVT::i1;
    NumIntermediates = VT.getVectorNumElements();
    return NumIntermediates;
  }

  // Split v64i1 vectors if we don't have v64i8 available.
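  // (regcall keeps v64i1 in a mask register instead, hence its exclusion below.)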
  if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
      CC != CallingConv::X86_RegCall) {
    RegisterVT = MVT::v32i8;
    IntermediateVT = MVT::v32i1;
    NumIntermediates = 2;
    return 2;
  }

  // Split vNbf16 vectors according to vNf16.
  if (VT.isVector() && VT.getVectorElementType() == MVT::bf16 &&
      isTypeLegal(MVT::f16))
    VT = VT.changeVectorElementType(MVT::f16);

  return TargetLowering::getVectorTypeBreakdownForCallingConv(
      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
}

EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
                                          LLVMContext &Context,
                                          EVT VT) const {
  if (!VT.isVector())
    return MVT::i8;

  if (Subtarget.hasAVX512()) {
    // Figure out what this type will be legalized to.
    EVT LegalVT = VT;
    while (getTypeAction(Context, LegalVT) != TypeLegal)
      LegalVT = getTypeToTransformTo(Context, LegalVT);

    // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
    if (LegalVT.getSimpleVT().is512BitVector())
      return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());

    if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
      // If we legalized to less than a 512-bit vector, then we will use a vXi1
      // compare for vXi32/vXi64 for sure. If we have BWI we will also support
      // vXi16/vXi8.
      MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
      if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
        return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
    }
  }

  return VT.changeVectorElementTypeToInteger();
}

bool X86TargetLowering::functionArgumentNeedsConsecutiveRegisters(
    Type *Ty, CallingConv::ID CallConv, bool isVarArg,
    const DataLayout &DL) const {
  // i128 split into i64 needs to be allocated to two consecutive registers,
  // or spilled to the stack as a whole.
  return Ty->isIntegerTy(128);
}

/// Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
  if (MaxAlign == 16)
    return;
  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (VTy->getPrimitiveSizeInBits().getFixedValue() == 128)
      MaxAlign = Align(16);
  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    Align EltAlign;
    getMaxByValAlign(ATy->getElementType(), EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
    for (auto *EltTy : STy->elements()) {
      Align EltAlign;
      getMaxByValAlign(EltTy, EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == 16)
        break;
    }
  }
}

/// Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
Align X86TargetLowering::getByValTypeAlignment(Type *Ty,
                                               const DataLayout &DL) const {
  if (Subtarget.is64Bit())
    return std::max(DL.getABITypeAlign(Ty), Align::Constant<8>());

  Align Alignment(4);
  if (Subtarget.hasSSE1())
    getMaxByValAlign(Ty, Alignment);
  return Alignment;
}

/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
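/// Otherwise it returns the widest type we consider profitable for the
/// memset/memcpy expansion on this subtarget.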
/// For vector ops we check that the overall size isn't larger than our
/// preferred vector width.
EVT X86TargetLowering::getOptimalMemOpType(
    const MemOp &Op, const AttributeList &FuncAttributes) const {
  if (!FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
    if (Op.size() >= 16 &&
        (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
      // FIXME: Check if unaligned 64-byte accesses are slow.
      if (Op.size() >= 64 && Subtarget.hasAVX512() && Subtarget.hasEVEX512() &&
          (Subtarget.getPreferVectorWidth() >= 512)) {
        return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
      }
      // FIXME: Check if unaligned 32-byte accesses are slow.
      if (Op.size() >= 32 && Subtarget.hasAVX() &&
          Subtarget.useLight256BitInstructions()) {
        // Although this isn't a well-supported type for AVX1, we'll let
        // legalization and shuffle lowering produce the optimal codegen. If we
        // choose an optimal type with a vector element larger than a byte,
        // getMemsetStores() may create an intermediate splat (using an integer
        // multiply) before we splat as a vector.
        return MVT::v32i8;
      }
      if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
        return MVT::v16i8;
      // TODO: Can SSE1 handle a byte vector?
      // If we have SSE1 registers we should be able to use them.
      if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
          (Subtarget.getPreferVectorWidth() >= 128))
        return MVT::v4f32;
    } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
               Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
      // Do not use f64 to lower memcpy if source is string constant. It's
      // better to use i32 to avoid the loads.
      // Also, do not use f64 to lower memset unless this is a memset of zeros.
      // The gymnastics of splatting a byte value into an XMM register and then
      // only using 8-byte stores (because this is a CPU with slow unaligned
      // 16-byte accesses) makes that a loser.
      return MVT::f64;
    }
  }
  // This is a compromise. If we reach here, unaligned accesses may be slow on
  // this target. However, creating smaller, aligned accesses could be even
  // slower and would certainly be a lot more code.
  if (Subtarget.is64Bit() && Op.size() >= 8)
    return MVT::i64;
  return MVT::i32;
}

bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
  if (VT == MVT::f32)
    return Subtarget.hasSSE1();
  if (VT == MVT::f64)
    return Subtarget.hasSSE2();
  return true;
}

static bool isBitAligned(Align Alignment, uint64_t SizeInBits) {
  return (8 * Alignment.value()) % SizeInBits == 0;
}

bool X86TargetLowering::isMemoryAccessFast(EVT VT, Align Alignment) const {
  if (isBitAligned(Alignment, VT.getSizeInBits()))
    return true;
  switch (VT.getSizeInBits()) {
  default:
    // 8-byte and under are always assumed to be fast.
    return true;
  case 128:
    return !Subtarget.isUnalignedMem16Slow();
  case 256:
    return !Subtarget.isUnalignedMem32Slow();
    // TODO: What about AVX-512 (512-bit) accesses?
  }
}

bool X86TargetLowering::allowsMisalignedMemoryAccesses(
    EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags,
    unsigned *Fast) const {
  if (Fast)
    *Fast = isMemoryAccessFast(VT, Alignment);
  // NonTemporal vector memory ops must be aligned.
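  // (The non-temporal vector instructions fault on operands that are not
  // naturally aligned, e.g. MOVNTDQA / MOVNTPS require 16-byte alignment.)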
  if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
    // NT loads can only be vector aligned, so if it's less aligned than the
    // minimum vector size (which we can split the vector down to), we might as
    // well use a regular unaligned vector load.
    // We don't have any NT loads pre-SSE41.
    if (!!(Flags & MachineMemOperand::MOLoad))
      return (Alignment < 16 || !Subtarget.hasSSE41());
    return false;
  }
  // Misaligned accesses of any size are always allowed.
  return true;
}

bool X86TargetLowering::allowsMemoryAccess(LLVMContext &Context,
                                           const DataLayout &DL, EVT VT,
                                           unsigned AddrSpace, Align Alignment,
                                           MachineMemOperand::Flags Flags,
                                           unsigned *Fast) const {
  if (Fast)
    *Fast = isMemoryAccessFast(VT, Alignment);
  if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
    if (allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment, Flags,
                                       /*Fast=*/nullptr))
      return true;
    // NonTemporal vector memory ops are special, and must be aligned.
    if (!isBitAligned(Alignment, VT.getSizeInBits()))
      return false;
    switch (VT.getSizeInBits()) {
    case 128:
      if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasSSE41())
        return true;
      if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasSSE2())
        return true;
      return false;
    case 256:
      if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasAVX2())
        return true;
      if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasAVX())
        return true;
      return false;
    case 512:
      if (Subtarget.hasAVX512() && Subtarget.hasEVEX512())
        return true;
      return false;
    default:
      return false; // Don't have NonTemporal vector memory ops of this size.
    }
  }
  return true;
}

/// Return the entry encoding for a jump table in the
/// current function. The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
  // symbol.
  if (isPositionIndependent() && Subtarget.isPICStyleGOT())
    return MachineJumpTableInfo::EK_Custom32;
  if (isPositionIndependent() &&
      getTargetMachine().getCodeModel() == CodeModel::Large &&
      !Subtarget.isTargetCOFF())
    return MachineJumpTableInfo::EK_LabelDifference64;

  // Otherwise, use the normal jump table encoding heuristics.
  return TargetLowering::getJumpTableEncoding();
}

bool X86TargetLowering::useSoftFloat() const {
  return Subtarget.useSoftFloat();
}

void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
                                              ArgListTy &Args) const {

  // Only relabel X86-32 for C / Stdcall CCs.
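  // The number of integer arguments that may be passed in registers comes from
  // the module (e.g. regparm-style ABIs); qualifying arguments are tagged
  // 'inreg' below.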
  if (Subtarget.is64Bit())
    return;
  if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
    return;
  unsigned ParamRegs = 0;
  if (auto *M = MF->getFunction().getParent())
    ParamRegs = M->getNumberRegisterParameters();

  // Mark the first N int arguments as being passed in registers.
  for (auto &Arg : Args) {
    Type *T = Arg.Ty;
    if (T->isIntOrPtrTy())
      if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
        unsigned numRegs = 1;
        if (MF->getDataLayout().getTypeAllocSize(T) > 4)
          numRegs = 2;
        if (ParamRegs < numRegs)
          return;
        ParamRegs -= numRegs;
        Arg.IsInReg = true;
      }
  }
}

const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
                                             const MachineBasicBlock *MBB,
                                             unsigned uid,
                                             MCContext &Ctx) const {
  assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
  // entries.
  return MCSymbolRefExpr::create(MBB->getSymbol(),
                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
}

/// Returns relocation base for the given PIC jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (!Subtarget.is64Bit())
    // This doesn't have SDLoc associated with it, but is not really the
    // same as a Register.
    return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
                       getPointerTy(DAG.getDataLayout()));
  return Table;
}

/// This returns the relocation base for the given PIC jumptable,
/// the same as getPICJumpTableRelocBase, but as an MCExpr.
const MCExpr *X86TargetLowering::getPICJumpTableRelocBaseExpr(
    const MachineFunction *MF, unsigned JTI, MCContext &Ctx) const {
  // X86-64 uses RIP relative addressing based on the jump table label.
  if (Subtarget.isPICStyleRIPRel() ||
      (Subtarget.is64Bit() &&
       getTargetMachine().getCodeModel() == CodeModel::Large))
    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);

  // Otherwise, the reference is relative to the PIC base.
  return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
}

std::pair<const TargetRegisterClass *, uint8_t>
X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
                                           MVT VT) const {
  const TargetRegisterClass *RRC = nullptr;
  uint8_t Cost = 1;
  switch (VT.SimpleTy) {
  default:
    return TargetLowering::findRepresentativeClass(TRI, VT);
  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
    RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
    break;
  case MVT::x86mmx:
    RRC = &X86::VR64RegClass;
    break;
  case MVT::f32: case MVT::f64:
  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
  case MVT::v4f32: case MVT::v2f64:
  case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
  case MVT::v8f32: case MVT::v4f64:
  case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
  case MVT::v16f32: case MVT::v8f64:
    RRC = &X86::VR128XRegClass;
    break;
  }
  return std::make_pair(RRC, Cost);
}

unsigned X86TargetLowering::getAddressSpace() const {
  if (Subtarget.is64Bit())
    return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? X86AS::GS
                                                                    : X86AS::FS;
  return X86AS::GS;
}

static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
  return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
         (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
}

static Constant *SegmentOffset(IRBuilderBase &IRB,
                               int Offset, unsigned AddressSpace) {
  return ConstantExpr::getIntToPtr(
      ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
      IRB.getPtrTy(AddressSpace));
}

Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
  // glibc, bionic, and Fuchsia have a special slot for the stack guard in
  // tcbhead_t; use it instead of the usual global variable (see
  // sysdeps/{i386,x86_64}/nptl/tls.h)
  if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
    unsigned AddressSpace = getAddressSpace();

    // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
    if (Subtarget.isTargetFuchsia())
      return SegmentOffset(IRB, 0x10, AddressSpace);

    Module *M = IRB.GetInsertBlock()->getParent()->getParent();
    // In particular, some users may customize the base register and offset.
    int Offset = M->getStackProtectorGuardOffset();
    // If -stack-protector-guard-offset was not set, use the default:
    // %fs:0x28, unless we're using a Kernel code model, in which case it's
    // %gs:0x28; %gs:0x14 on i386.
    if (Offset == INT_MAX)
      Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;

    StringRef GuardReg = M->getStackProtectorGuardReg();
    if (GuardReg == "fs")
      AddressSpace = X86AS::FS;
    else if (GuardReg == "gs")
      AddressSpace = X86AS::GS;

    // Use the symbol guard if the user specified one.
    StringRef GuardSymb = M->getStackProtectorGuardSymbol();
    if (!GuardSymb.empty()) {
      GlobalVariable *GV = M->getGlobalVariable(GuardSymb);
      if (!GV) {
        Type *Ty = Subtarget.is64Bit() ? Type::getInt64Ty(M->getContext())
                                       : Type::getInt32Ty(M->getContext());
        GV = new GlobalVariable(*M, Ty, false, GlobalValue::ExternalLinkage,
                                nullptr, GuardSymb, nullptr,
                                GlobalValue::NotThreadLocal, AddressSpace);
        if (!Subtarget.isTargetDarwin())
          GV->setDSOLocal(M->getDirectAccessExternalData());
      }
      return GV;
    }

    return SegmentOffset(IRB, Offset, AddressSpace);
  }
  return TargetLowering::getIRStackGuard(IRB);
}

void X86TargetLowering::insertSSPDeclarations(Module &M) const {
  // The MSVC CRT provides functionality for stack protection.
  if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
      Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
    // The MSVC CRT has a global variable holding the security cookie.
    M.getOrInsertGlobal("__security_cookie",
                        PointerType::getUnqual(M.getContext()));

    // The MSVC CRT has a function to validate the security cookie.
    FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
        "__security_check_cookie", Type::getVoidTy(M.getContext()),
        PointerType::getUnqual(M.getContext()));
    if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
      F->setCallingConv(CallingConv::X86_FastCall);
      F->addParamAttr(0, Attribute::AttrKind::InReg);
    }
    return;
  }

  StringRef GuardMode = M.getStackProtectorGuard();

  // glibc, bionic, and Fuchsia have a special slot for the stack guard.
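  // Those targets read the guard straight out of TLS, so no extra declarations
  // are required and we can return early.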
  if ((GuardMode == "tls" || GuardMode.empty()) &&
      hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
    return;
  TargetLowering::insertSSPDeclarations(M);
}

Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
  // The MSVC CRT has a global variable holding the security cookie.
  if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
      Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
    return M.getGlobalVariable("__security_cookie");
  }
  return TargetLowering::getSDagStackGuard(M);
}

Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
  // The MSVC CRT has a function to validate the security cookie.
  if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
      Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
    return M.getFunction("__security_check_cookie");
  }
  return TargetLowering::getSSPStackGuardCheck(M);
}

Value *
X86TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
  // Android provides a fixed TLS slot for the SafeStack pointer. See the
  // definition of TLS_SLOT_SAFESTACK in
  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
  if (Subtarget.isTargetAndroid()) {
    // %fs:0x48, unless we're using a Kernel code model, in which case it's
    // %gs:0x48; %gs:0x24 on i386.
    int Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
    return SegmentOffset(IRB, Offset, getAddressSpace());
  }

  // Fuchsia is similar.
  if (Subtarget.isTargetFuchsia()) {
    // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
    return SegmentOffset(IRB, 0x18, getAddressSpace());
  }

  return TargetLowering::getSafeStackPointerLocation(IRB);
}

//===----------------------------------------------------------------------===//
//               Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//

bool X86TargetLowering::CanLowerReturn(
    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
    const Type *RetTy) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
  return CCInfo.CheckReturn(Outs, RetCC_X86);
}

const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
  static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
  return ScratchRegs;
}

ArrayRef<MCPhysReg> X86TargetLowering::getRoundingControlRegisters() const {
  static const MCPhysReg RCRegs[] = {X86::FPCW, X86::MXCSR};
  return RCRegs;
}

/// Lowers mask values (v*i1) to the local register values.
/// \returns DAG node after lowering to register type
static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
                               const SDLoc &DL, SelectionDAG &DAG) {
  EVT ValVT = ValArg.getValueType();

  if (ValVT == MVT::v1i1)
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ValLoc, ValArg,
                       DAG.getIntPtrConstant(0, DL));

  if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
      (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
    // Two-stage lowering might be required
    //   bitcast:   v8i1 -> i8  / v16i1 -> i16
    //   anyextend: i8   -> i32 / i16   -> i32
    EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
    SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
    if (ValLoc == MVT::i32)
      ValToCopy = DAG.getNode(ISD::ANY_EXTEND, DL, ValLoc, ValToCopy);
    return ValToCopy;
  }

  if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
      (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
    // One-stage lowering is required
    //   bitcast: v32i1 -> i32 / v64i1 -> i64
    return DAG.getBitcast(ValLoc, ValArg);
  }

  return DAG.getNode(ISD::ANY_EXTEND, DL, ValLoc, ValArg);
}

/// Breaks a v64i1 value into two registers and adds the new nodes to the DAG.
static void Passv64i1ArgInRegs(
    const SDLoc &DL, SelectionDAG &DAG, SDValue &Arg,
    SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
    CCValAssign &NextVA, const X86Subtarget &Subtarget) {
  assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
  assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
         "The value should reside in two registers");

  // Before splitting the value we cast it to i64.
  Arg = DAG.getBitcast(MVT::i64, Arg);

  // Split the value into two i32 halves.
  SDValue Lo, Hi;
  std::tie(Lo, Hi) = DAG.SplitScalar(Arg, DL, MVT::i32, MVT::i32);

  // Attach the two i32 values to the corresponding registers.
  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
  RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
}

SDValue
X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                               bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               const SDLoc &dl, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  // In some cases we need to disable registers from the default CSR list.
  // For example, when they are used as return registers (preserve_* and X86's
  // regcall) or for argument passing (X86's regcall).
  bool ShouldDisableCalleeSavedRegister =
      shouldDisableRetRegFromCSR(CallConv) ||
      MF.getFunction().hasFnAttribute("no_caller_saved_registers");

  if (CallConv == CallingConv::X86_INTR && !Outs.empty())
    report_fatal_error("X86 interrupts may not return any value");

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC_X86);

  SmallVector<std::pair<Register, SDValue>, 4> RetVals;
  for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
       ++I, ++OutsIndex) {
    CCValAssign &VA = RVLocs[I];
    assert(VA.isRegLoc() && "Can only return in registers!");

    // Add the register to the CalleeSaveDisableRegs list.
    if (ShouldDisableCalleeSavedRegister)
      MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());

    SDValue ValToCopy = OutVals[OutsIndex];
    EVT ValVT = ValToCopy.getValueType();

    // Promote values to the appropriate types.
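    // VA.getLocInfo() records how the value must be widened to fit its return
    // register (sign/zero/any-extend or bitcast).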
    if (VA.getLocInfo() == CCValAssign::SExt)
      ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
    else if (VA.getLocInfo() == CCValAssign::ZExt)
      ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
    else if (VA.getLocInfo() == CCValAssign::AExt) {
      if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
        ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
      else
        ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
    } else if (VA.getLocInfo() == CCValAssign::BCvt)
      ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);

    assert(VA.getLocInfo() != CCValAssign::FPExt &&
           "Unexpected FP-extend for return value.");

    // Report an error if we have attempted to return a value via an XMM
    // register and SSE was disabled.
    if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    } else if (!Subtarget.hasSSE2() &&
               X86::FR64XRegClass.contains(VA.getLocReg()) &&
               ValVT == MVT::f64) {
      // When returning a double via an XMM register, report an error if SSE2
      // is not enabled.
      errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    }

    // Returns in ST0/ST1 are handled specially: these are pushed as operands
    // to the RET instruction and handled by the FP Stackifier.
    if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) {
      // If this is a copy from an xmm register to ST(0), use an FPExtend to
      // change the value to the FP stack register class.
      if (isScalarFPTypeInSSEReg(VA.getValVT()))
        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
      RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
      // Don't emit a copytoreg.
      continue;
    }

    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
    // which is returned in RAX / RDX.
    if (Subtarget.is64Bit()) {
      if (ValVT == MVT::x86mmx) {
        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
          ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                                  ValToCopy);
          // If we don't have SSE2 available, convert to v4f32 so the generated
          // register is legal.
          if (!Subtarget.hasSSE2())
            ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
        }
      }
    }

    if (VA.needsCustom()) {
      assert(VA.getValVT() == MVT::v64i1 &&
             "Currently the only custom case is when we split v64i1 to 2 regs");

      Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
                         Subtarget);

      // Add the second register to the CalleeSaveDisableRegs list.
      if (ShouldDisableCalleeSavedRegister)
        MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
    } else {
      RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
    }
  }

  SDValue Glue;
  SmallVector<SDValue, 6> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  // Operand #1 = Bytes To Pop
  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
                                         MVT::i32));

  // Copy the result values into the output registers.
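  // FP0/FP1 values recorded above are appended as plain RET operands; all
  // other values are copied into their registers with glue so the copies stay
  // adjacent to the RET.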
  for (auto &RetVal : RetVals) {
    if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
      RetOps.push_back(RetVal.second);
      continue; // Don't emit a copytoreg.
    }

    Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Glue);
    Glue = Chain.getValue(1);
    RetOps.push_back(
        DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
  }

  // Swift calling convention does not require we copy the sret argument
  // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.

  // All x86 ABIs require that for returning structs by value we copy
  // the sret argument into %rax/%eax (depending on ABI) for the return.
  // We saved the argument into a virtual register in the entry block,
  // so now we copy the value out and into %rax/%eax.
  //
  // Checking Function.hasStructRetAttr() here is insufficient because the IR
  // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
  // false, then an sret argument may be implicitly inserted in the SelDAG. In
  // either case FuncInfo->setSRetReturnReg() will have been called.
  if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
    // When we have both sret and another return value, we should use the
    // original Chain stored in RetOps[0], instead of the current Chain updated
    // in the above loop. If we only have sret, RetOps[0] equals to Chain.

    // For the case of sret and another return value, we have
    //   Chain_0 at the function entry
    //   Chain_1 = getCopyToReg(Chain_0) in the above loop
    // If we use Chain_1 in getCopyFromReg, we will have
    //   Val = getCopyFromReg(Chain_1)
    //   Chain_2 = getCopyToReg(Chain_1, Val) from below

    // getCopyToReg(Chain_0) will be glued together with
    // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
    // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
    //   Data dependency from Unit B to Unit A due to usage of Val in
    //     getCopyToReg(Chain_1, Val)
    //   Chain dependency from Unit A to Unit B

    // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
    SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
                                     getPointerTy(MF.getDataLayout()));

    Register RetValReg =
        (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ? X86::RAX
                                                                 : X86::EAX;
    Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Glue);
    Glue = Chain.getValue(1);

    // RAX/EAX now acts like a return value.
    RetOps.push_back(
        DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));

    // Add the returned register to the CalleeSaveDisableRegs list. Don't do
    // this however for preserve_most/preserve_all to minimize the number of
    // callee-saved registers for these CCs.
    if (ShouldDisableCalleeSavedRegister &&
        CallConv != CallingConv::PreserveAll &&
        CallConv != CallingConv::PreserveMost)
      MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
  }

  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
  const MCPhysReg *I =
      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
  if (I) {
    for (; *I; ++I) {
      if (X86::GR64RegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::i64));
      else
        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    }
  }

  RetOps[0] = Chain; // Update chain.

  // Add the glue if we have it.
  if (Glue.getNode())
    RetOps.push_back(Glue);

  X86ISD::NodeType opcode = X86ISD::RET_GLUE;
  if (CallConv == CallingConv::X86_INTR)
    opcode = X86ISD::IRET;
  return DAG.getNode(opcode, dl, MVT::Other, RetOps);
}

bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
  if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
    return false;

  SDValue TCChain = Chain;
  SDNode *Copy = *N->user_begin();
  if (Copy->getOpcode() == ISD::CopyToReg) {
    // If the copy has a glue operand, we conservatively assume it isn't safe
    // to perform a tail call.
    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
      return false;
    TCChain = Copy->getOperand(0);
  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
    return false;

  bool HasRet = false;
  for (const SDNode *U : Copy->users()) {
    if (U->getOpcode() != X86ISD::RET_GLUE)
      return false;
    // If we are returning more than one value, we can definitely
    // not make a tail call; see PR19530.
    if (U->getNumOperands() > 4)
      return false;
    if (U->getNumOperands() == 4 &&
        U->getOperand(U->getNumOperands() - 1).getValueType() != MVT::Glue)
      return false;
    HasRet = true;
  }

  if (!HasRet)
    return false;

  Chain = TCChain;
  return true;
}

EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
                                           ISD::NodeType ExtendKind) const {
  MVT ReturnMVT = MVT::i32;

  bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
  if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
    // The ABI does not require i1, i8 or i16 to be extended.
    //
    // On Darwin, there is code in the wild relying on Clang's old behaviour of
    // always extending i8/i16 return values, so keep doing that for now.
    // (PR26665).
    ReturnMVT = MVT::i8;
  }

  EVT MinVT = getRegisterType(Context, ReturnMVT);
  return VT.bitsLT(MinVT) ? MinVT : VT;
}

/// Reads two 32 bit registers and creates a 64 bit mask value.
/// \param VA The current 32 bit value that needs to be assigned.
/// \param NextVA The next 32 bit value that needs to be assigned.
/// \param Root The parent DAG node.
/// \param [in,out] InGlue Represents SDValue in the parent DAG node for
///                        glue purposes. In case the DAG is already using a
///                        physical register instead of a virtual one, we
///                        should glue our new SDValue to InGlue.
/// \return a new 64-bit SDValue.
static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
                                SDValue &Root, SelectionDAG &DAG,
                                const SDLoc &DL, const X86Subtarget &Subtarget,
                                SDValue *InGlue = nullptr) {
  assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
  assert(VA.getValVT() == MVT::v64i1 &&
         "Expecting first location of 64 bit width type");
  assert(NextVA.getValVT() == VA.getValVT() &&
         "The locations should have the same type");
  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
         "The values should reside in two registers");

  SDValue Lo, Hi;
  SDValue ArgValueLo, ArgValueHi;

  MachineFunction &MF = DAG.getMachineFunction();
  const TargetRegisterClass *RC = &X86::GR32RegClass;

  // Read a 32 bit value from the registers.
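  // The v64i1 mask arrives as two GR32 halves described by VA and NextVA.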
  if (nullptr == InGlue) {
    // When no physical register is present,
    // create an intermediate virtual register.
    Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
    ArgValueLo = DAG.getCopyFromReg(Root, DL, Reg, MVT::i32);
    Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
    ArgValueHi = DAG.getCopyFromReg(Root, DL, Reg, MVT::i32);
  } else {
    // When a physical register is available, read the value from it and glue
    // the reads together.
    ArgValueLo =
        DAG.getCopyFromReg(Root, DL, VA.getLocReg(), MVT::i32, *InGlue);
    *InGlue = ArgValueLo.getValue(2);
    ArgValueHi =
        DAG.getCopyFromReg(Root, DL, NextVA.getLocReg(), MVT::i32, *InGlue);
    *InGlue = ArgValueHi.getValue(2);
  }

  // Convert the lower 32 bits into a v32i1 mask.
  Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);

  // Convert the upper 32 bits into a v32i1 mask.
  Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);

  // Concatenate the two values together.
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v64i1, Lo, Hi);
}

/// The function will lower a register of various sizes (8/16/32/64)
/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1).
/// \returns a DAG node containing the operand after lowering to mask type.
static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
                               const EVT &ValLoc, const SDLoc &DL,
                               SelectionDAG &DAG) {
  SDValue ValReturned = ValArg;

  if (ValVT == MVT::v1i1)
    return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, ValReturned);

  if (ValVT == MVT::v64i1) {
    // On a 32-bit machine this case is handled by getv64i1Argument.
    assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
    // On a 64-bit machine there is no need to truncate the value, only
    // bitcast it.
  } else {
    MVT MaskLenVT;
    switch (ValVT.getSimpleVT().SimpleTy) {
    case MVT::v8i1:
      MaskLenVT = MVT::i8;
      break;
    case MVT::v16i1:
      MaskLenVT = MVT::i16;
      break;
    case MVT::v32i1:
      MaskLenVT = MVT::i32;
      break;
    default:
      llvm_unreachable("Expecting a vector of i1 types");
    }

    ValReturned = DAG.getNode(ISD::TRUNCATE, DL, MaskLenVT, ValReturned);
  }
  return DAG.getBitcast(ValVT, ValReturned);
}

/// Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
///
SDValue X86TargetLowering::LowerCallResult(
    SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
    uint32_t *RegMask) const {

  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);

  // Copy all of the result registers out of their specified physreg.
  for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
       ++I, ++InsIndex) {
    CCValAssign &VA = RVLocs[I];
    EVT CopyVT = VA.getLocVT();

    // In some calling conventions we need to remove the used registers
    // from the register mask.
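    // Clearing these bits marks the returned registers as clobbered by the
    // call rather than preserved across it.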
    if (RegMask) {
      for (MCPhysReg SubReg : TRI->subregs_inclusive(VA.getLocReg()))
        RegMask[SubReg / 32] &= ~(1u << (SubReg % 32));
    }

    // Report an error if there was an attempt to return FP values via XMM
    // registers.
    if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
      if (VA.getLocReg() == X86::XMM1)
        VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
      else
        VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    } else if (!Subtarget.hasSSE2() &&
               X86::FR64XRegClass.contains(VA.getLocReg()) &&
               CopyVT == MVT::f64) {
      errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
      if (VA.getLocReg() == X86::XMM1)
        VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
      else
        VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    }

    // If we prefer to use the value in xmm registers, copy it out as f80 and
    // use a truncate to move it from fp stack reg to xmm reg.
    bool RoundAfterCopy = false;
    if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
        isScalarFPTypeInSSEReg(VA.getValVT())) {
      if (!Subtarget.hasX87())
        report_fatal_error("X87 register return with X87 disabled");
      CopyVT = MVT::f80;
      RoundAfterCopy = (CopyVT != VA.getLocVT());
    }

    SDValue Val;
    if (VA.needsCustom()) {
      assert(VA.getValVT() == MVT::v64i1 &&
             "Currently the only custom case is when we split v64i1 to 2 regs");
      Val =
          getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InGlue);
    } else {
      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InGlue)
                  .getValue(1);
      Val = Chain.getValue(0);
      InGlue = Chain.getValue(2);
    }

    if (RoundAfterCopy)
      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
                        // This truncation won't change the value.
                        DAG.getIntPtrConstant(1, dl, /*isTarget=*/true));

    if (VA.isExtInLoc()) {
      if (VA.getValVT().isVector() &&
          VA.getValVT().getScalarType() == MVT::i1 &&
          ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
           (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
        // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8.
        Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
      } else
        Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
    }

    if (VA.getLocInfo() == CCValAssign::BCvt)
      Val = DAG.getBitcast(VA.getValVT(), Val);

    InVals.push_back(Val);
  }

  return Chain;
}

//===----------------------------------------------------------------------===//
//                C & StdCall & Fast Calling Convention implementation
//===----------------------------------------------------------------------===//
// StdCall calling convention seems to be standard for many Windows API
// routines. It differs from the C calling convention just a little: the callee
// should clean up the stack, not the caller. Symbols should also be decorated
// in some fancy way :) It doesn't support any vector arguments.
// For info on the fast calling convention see the Fast Calling Convention
// (tail call) implementation, LowerX86_32FastCCCallTo.
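//
// Illustrative note: on 32-bit System V targets, a callee that returns a
// struct through a hidden sret pointer also pops that pointer, i.e. it
// returns with `ret $4`; hasCalleePopSRet() below models this quirk and its
// exceptions.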

/// Determines whether Args, either a set of outgoing arguments to a call, or a
/// set of incoming args of a call, contains an sret pointer that the callee
/// pops.
template <typename T>
static bool hasCalleePopSRet(const SmallVectorImpl<T> &Args,
                             const X86Subtarget &Subtarget) {
  // Not C++20 (yet), so no concepts available.
  static_assert(std::is_same_v<T, ISD::OutputArg> ||
                    std::is_same_v<T, ISD::InputArg>,
                "requires ISD::OutputArg or ISD::InputArg");

  // Only 32-bit pops the sret. It's a 64-bit world these days, so early-out
  // for most compilations.
  if (!Subtarget.is32Bit())
    return false;

  if (Args.empty())
    return false;

  // Most calls do not have an sret argument, check the arg next.
  const ISD::ArgFlagsTy &Flags = Args[0].Flags;
  if (!Flags.isSRet() || Flags.isInReg())
    return false;

  // The MSVC ABI does not pop the sret.
  if (Subtarget.getTargetTriple().isOSMSVCRT())
    return false;

  // MCUs don't pop the sret.
  if (Subtarget.isTargetMCU())
    return false;

  // Callee pops argument.
  return true;
}

/// Make a copy of an aggregate at address specified by "Src" to address
/// "Dst" with size and alignment information specified by the specific
/// parameter attribute. The copy will be passed as a byval function parameter.
static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
                                         SDValue Chain, ISD::ArgFlagsTy Flags,
                                         SelectionDAG &DAG, const SDLoc &dl) {
  SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl);

  return DAG.getMemcpy(
      Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
      /*isVolatile*/ false, /*AlwaysInline=*/true,
      /*CI=*/nullptr, std::nullopt, MachinePointerInfo(), MachinePointerInfo());
}

/// Return true if the calling convention is one that we can guarantee TCO for.
static bool canGuaranteeTCO(CallingConv::ID CC) {
  return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
          CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
          CC == CallingConv::Tail || CC == CallingConv::SwiftTail);
}

/// Return true if we might ever do TCO for calls with this calling convention.
static bool mayTailCallThisCC(CallingConv::ID CC) {
  switch (CC) {
  // C calling conventions:
  case CallingConv::C:
  case CallingConv::Win64:
  case CallingConv::X86_64_SysV:
  case CallingConv::PreserveNone:
  // Callee pop conventions:
  case CallingConv::X86_ThisCall:
  case CallingConv::X86_StdCall:
  case CallingConv::X86_VectorCall:
  case CallingConv::X86_FastCall:
  // Swift:
  case CallingConv::Swift:
    return true;
  default:
    return canGuaranteeTCO(CC);
  }
}

/// Return true if the function is being made into a tailcall target by
/// changing its ABI.
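/// This happens under GuaranteedTailCallOpt for the conventions accepted by
/// canGuaranteeTCO, and unconditionally for tailcc and swifttailcc.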
static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
  return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) ||
         CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
}

bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
  if (!CI->isTailCall())
    return false;

  CallingConv::ID CalleeCC = CI->getCallingConv();
  if (!mayTailCallThisCC(CalleeCC))
    return false;

  return true;
}

SDValue
X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                    const SDLoc &dl, SelectionDAG &DAG,
                                    const CCValAssign &VA,
                                    MachineFrameInfo &MFI, unsigned i) const {
  // Create the nodes corresponding to a load from this parameter slot.
  ISD::ArgFlagsTy Flags = Ins[i].Flags;
  bool AlwaysUseMutable = shouldGuaranteeTCO(
      CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
  EVT ValVT;
  MVT PtrVT = getPointerTy(DAG.getDataLayout());

  // If the value is passed by pointer, we have the address passed instead of
  // the value itself. No need to extend if the mask value and location share
  // the same absolute size.
  bool ExtendedInMem =
      VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
      VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();

  if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
    ValVT = VA.getLocVT();
  else
    ValVT = VA.getValVT();

  // FIXME: For now, all byval parameter objects are marked mutable. This can
  // be changed with more analysis.
  // In case of tail call optimization, mark all arguments mutable, since they
  // could be overwritten by the lowering of arguments in case of a tail call.
  if (Flags.isByVal()) {
    unsigned Bytes = Flags.getByValSize();
    if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.

    // FIXME: For now, all byval parameter objects are marked as aliasing. This
    // can be improved with deeper analysis.
    int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
                                   /*isAliased=*/true);
    return DAG.getFrameIndex(FI, PtrVT);
  }

  EVT ArgVT = Ins[i].ArgVT;

  // If this is a vector that has been split into multiple parts, don't elide
  // the copy. The layout on the stack may not match the packed in-memory
  // layout.
  bool ScalarizedVector = ArgVT.isVector() && !VA.getLocVT().isVector();

  // This is an argument in memory. We might be able to perform copy elision.
  // If the argument is passed directly in memory without any extension, then
  // we can perform copy elision. Large vector types, for example, may be
  // passed indirectly by pointer.
  if (Flags.isCopyElisionCandidate() &&
      VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem &&
      !ScalarizedVector) {
    SDValue PartAddr;
    if (Ins[i].PartOffset == 0) {
      // If this is a one-part value or the first part of a multi-part value,
      // create a stack object for the entire argument value type and return a
      // load from our portion of it. This assumes that if the first part of an
      // argument is in memory, the rest will also be in memory.
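      // Subsequent parts will find and reuse this object via the fixed-object
      // search below.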
      int FI =
          MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
                                /*IsImmutable=*/false);
      PartAddr = DAG.getFrameIndex(FI, PtrVT);
      return DAG.getLoad(
          ValVT, dl, Chain, PartAddr,
          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
    }

    // This is not the first piece of an argument in memory. See if there is
    // already a fixed stack object including this offset. If so, assume it
    // was created by the PartOffset == 0 branch above and create a load from
    // the appropriate offset into it.
    int64_t PartBegin = VA.getLocMemOffset();
    int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
    int FI = MFI.getObjectIndexBegin();
    for (; MFI.isFixedObjectIndex(FI); ++FI) {
      int64_t ObjBegin = MFI.getObjectOffset(FI);
      int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
      if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
        break;
    }
    if (MFI.isFixedObjectIndex(FI)) {
      SDValue Addr =
          DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
                      DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
      return DAG.getLoad(ValVT, dl, Chain, Addr,
                         MachinePointerInfo::getFixedStack(
                             DAG.getMachineFunction(), FI, Ins[i].PartOffset));
    }
  }

  int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
                                 VA.getLocMemOffset(), isImmutable);

  // Set SExt or ZExt flag.
  if (VA.getLocInfo() == CCValAssign::ZExt) {
    MFI.setObjectZExt(FI, true);
  } else if (VA.getLocInfo() == CCValAssign::SExt) {
    MFI.setObjectSExt(FI, true);
  }

  MaybeAlign Alignment;
  if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
      ValVT != MVT::f80)
    Alignment = MaybeAlign(4);
  SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
  SDValue Val = DAG.getLoad(
      ValVT, dl, Chain, FIN,
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
      Alignment);
  return ExtendedInMem
             ? (VA.getValVT().isVector()
                    ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
                    : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
             : Val;
}

// FIXME: Get this from tablegen.
static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
                                                const X86Subtarget &Subtarget) {
  assert(Subtarget.is64Bit());

  if (Subtarget.isCallingConvWin64(CallConv)) {
    static const MCPhysReg GPR64ArgRegsWin64[] = {
      X86::RCX, X86::RDX, X86::R8, X86::R9
    };
    return GPR64ArgRegsWin64;
  }

  static const MCPhysReg GPR64ArgRegs64Bit[] = {
    X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
  };
  return GPR64ArgRegs64Bit;
}

// FIXME: Get this from tablegen.
static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
                                                CallingConv::ID CallConv,
                                                const X86Subtarget &Subtarget) {
  assert(Subtarget.is64Bit());
  if (Subtarget.isCallingConvWin64(CallConv)) {
    // The XMM registers which might contain var arg parameters are shadowed
    // in their paired GPR. So we only need to save the GPR to their home
    // slots.
    // TODO: __vectorcall will change this.
    return {};
  }

  bool isSoftFloat = Subtarget.useSoftFloat();
  if (isSoftFloat || !Subtarget.hasSSE1())
    // Kernel mode asks for SSE to be disabled, so there are no XMM argument
    // registers.
1453 return {}; 1454 1455 static const MCPhysReg XMMArgRegs64Bit[] = { 1456 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 1457 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 1458 }; 1459 return XMMArgRegs64Bit; 1460 } 1461 1462 #ifndef NDEBUG 1463 static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) { 1464 return llvm::is_sorted( 1465 ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool { 1466 return A.getValNo() < B.getValNo(); 1467 }); 1468 } 1469 #endif 1470 1471 namespace { 1472 /// This is a helper class for lowering variable arguments parameters. 1473 class VarArgsLoweringHelper { 1474 public: 1475 VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc, 1476 SelectionDAG &DAG, const X86Subtarget &Subtarget, 1477 CallingConv::ID CallConv, CCState &CCInfo) 1478 : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget), 1479 TheMachineFunction(DAG.getMachineFunction()), 1480 TheFunction(TheMachineFunction.getFunction()), 1481 FrameInfo(TheMachineFunction.getFrameInfo()), 1482 FrameLowering(*Subtarget.getFrameLowering()), 1483 TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv), 1484 CCInfo(CCInfo) {} 1485 1486 // Lower variable arguments parameters. 1487 void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize); 1488 1489 private: 1490 void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize); 1491 1492 void forwardMustTailParameters(SDValue &Chain); 1493 1494 bool is64Bit() const { return Subtarget.is64Bit(); } 1495 bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); } 1496 1497 X86MachineFunctionInfo *FuncInfo; 1498 const SDLoc &DL; 1499 SelectionDAG &DAG; 1500 const X86Subtarget &Subtarget; 1501 MachineFunction &TheMachineFunction; 1502 const Function &TheFunction; 1503 MachineFrameInfo &FrameInfo; 1504 const TargetFrameLowering &FrameLowering; 1505 const TargetLowering &TargLowering; 1506 CallingConv::ID CallConv; 1507 CCState &CCInfo; 1508 }; 1509 } // namespace 1510 1511 void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters( 1512 SDValue &Chain, unsigned StackSize) { 1513 // If the function takes variable number of arguments, make a frame index for 1514 // the start of the first vararg value... for expansion of llvm.va_start. We 1515 // can skip this if there are no va_start calls. 1516 if (is64Bit() || (CallConv != CallingConv::X86_FastCall && 1517 CallConv != CallingConv::X86_ThisCall)) { 1518 FuncInfo->setVarArgsFrameIndex( 1519 FrameInfo.CreateFixedObject(1, StackSize, true)); 1520 } 1521 1522 // 64-bit calling conventions support varargs and register parameters, so we 1523 // have to do extra work to spill them in the prologue. 1524 if (is64Bit()) { 1525 // Find the first unallocated argument registers. 1526 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget); 1527 ArrayRef<MCPhysReg> ArgXMMs = 1528 get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget); 1529 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs); 1530 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs); 1531 1532 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) && 1533 "SSE register cannot be used when SSE is disabled!"); 1534 1535 if (isWin64()) { 1536 // Get to the caller-allocated home save location. Add 8 to account 1537 // for the return address. 1538 int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8; 1539 FuncInfo->setRegSaveFrameIndex( 1540 FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); 1541 // Fixup to set vararg frame on shadow area (4 x i64). 
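      // The Win64 ABI reserves a 32-byte home area (one 8-byte slot per
      // register parameter RCX/RDX/R8/R9) directly above the return address.
      // The unnamed register arguments are spilled into their home slots
      // below, so when at least one slot is unallocated the va_start area can
      // simply alias the register save area computed here.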
1542 if (NumIntRegs < 4) 1543 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); 1544 } else { 1545 // For X86-64, if there are vararg parameters that are passed via 1546 // registers, then we must store them to their spots on the stack so 1547 // they may be loaded by dereferencing the result of va_next. 1548 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); 1549 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16); 1550 FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject( 1551 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false)); 1552 } 1553 1554 SmallVector<SDValue, 6> 1555 LiveGPRs; // list of SDValue for GPR registers keeping live input value 1556 SmallVector<SDValue, 8> LiveXMMRegs; // list of SDValue for XMM registers 1557 // keeping live input value 1558 SDValue ALVal; // if applicable keeps SDValue for %al register 1559 1560 // Gather all the live in physical registers. 1561 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) { 1562 Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass); 1563 LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64)); 1564 } 1565 const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs); 1566 if (!AvailableXmms.empty()) { 1567 Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass); 1568 ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8); 1569 for (MCPhysReg Reg : AvailableXmms) { 1570 // FastRegisterAllocator spills virtual registers at basic 1571 // block boundary. That leads to usages of xmm registers 1572 // outside of check for %al. Pass physical registers to 1573 // VASTART_SAVE_XMM_REGS to avoid unneccessary spilling. 1574 TheMachineFunction.getRegInfo().addLiveIn(Reg); 1575 LiveXMMRegs.push_back(DAG.getRegister(Reg, MVT::v4f32)); 1576 } 1577 } 1578 1579 // Store the integer parameter registers. 1580 SmallVector<SDValue, 8> MemOps; 1581 SDValue RSFIN = 1582 DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 1583 TargLowering.getPointerTy(DAG.getDataLayout())); 1584 unsigned Offset = FuncInfo->getVarArgsGPOffset(); 1585 for (SDValue Val : LiveGPRs) { 1586 SDValue FIN = DAG.getNode(ISD::ADD, DL, 1587 TargLowering.getPointerTy(DAG.getDataLayout()), 1588 RSFIN, DAG.getIntPtrConstant(Offset, DL)); 1589 SDValue Store = 1590 DAG.getStore(Val.getValue(1), DL, Val, FIN, 1591 MachinePointerInfo::getFixedStack( 1592 DAG.getMachineFunction(), 1593 FuncInfo->getRegSaveFrameIndex(), Offset)); 1594 MemOps.push_back(Store); 1595 Offset += 8; 1596 } 1597 1598 // Now store the XMM (fp + vector) parameter registers. 1599 if (!LiveXMMRegs.empty()) { 1600 SmallVector<SDValue, 12> SaveXMMOps; 1601 SaveXMMOps.push_back(Chain); 1602 SaveXMMOps.push_back(ALVal); 1603 SaveXMMOps.push_back(RSFIN); 1604 SaveXMMOps.push_back( 1605 DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32)); 1606 llvm::append_range(SaveXMMOps, LiveXMMRegs); 1607 MachineMemOperand *StoreMMO = 1608 DAG.getMachineFunction().getMachineMemOperand( 1609 MachinePointerInfo::getFixedStack( 1610 DAG.getMachineFunction(), FuncInfo->getRegSaveFrameIndex(), 1611 Offset), 1612 MachineMemOperand::MOStore, 128, Align(16)); 1613 MemOps.push_back(DAG.getMemIntrinsicNode(X86ISD::VASTART_SAVE_XMM_REGS, 1614 DL, DAG.getVTList(MVT::Other), 1615 SaveXMMOps, MVT::i8, StoreMMO)); 1616 } 1617 1618 if (!MemOps.empty()) 1619 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); 1620 } 1621 } 1622 1623 void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) { 1624 // Find the largest legal vector type. 
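  // A musttail call must be able to forward every argument register the
  // calling convention could use, so below we conservatively forward the
  // widest legal vector register class together with the GPRs (and %al for
  // SysV x86_64 varargs).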
1625 MVT VecVT = MVT::Other; 1626 // FIXME: Only some x86_32 calling conventions support AVX512. 1627 if (Subtarget.useAVX512Regs() && 1628 (is64Bit() || (CallConv == CallingConv::X86_VectorCall || 1629 CallConv == CallingConv::Intel_OCL_BI))) 1630 VecVT = MVT::v16f32; 1631 else if (Subtarget.hasAVX()) 1632 VecVT = MVT::v8f32; 1633 else if (Subtarget.hasSSE2()) 1634 VecVT = MVT::v4f32; 1635 1636 // We forward some GPRs and some vector types. 1637 SmallVector<MVT, 2> RegParmTypes; 1638 MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32; 1639 RegParmTypes.push_back(IntVT); 1640 if (VecVT != MVT::Other) 1641 RegParmTypes.push_back(VecVT); 1642 1643 // Compute the set of forwarded registers. The rest are scratch. 1644 SmallVectorImpl<ForwardedRegister> &Forwards = 1645 FuncInfo->getForwardedMustTailRegParms(); 1646 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86); 1647 1648 // Forward AL for SysV x86_64 targets, since it is used for varargs. 1649 if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) { 1650 Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass); 1651 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8)); 1652 } 1653 1654 // Copy all forwards from physical to virtual registers. 1655 for (ForwardedRegister &FR : Forwards) { 1656 // FIXME: Can we use a less constrained schedule? 1657 SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT); 1658 FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister( 1659 TargLowering.getRegClassFor(FR.VT)); 1660 Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal); 1661 } 1662 } 1663 1664 void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain, 1665 unsigned StackSize) { 1666 // Set FrameIndex to the 0xAAAAAAA value to mark unset state. 1667 // If necessary, it would be set into the correct value later. 1668 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); 1669 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); 1670 1671 if (FrameInfo.hasVAStart()) 1672 createVarArgAreaAndStoreRegisters(Chain, StackSize); 1673 1674 if (FrameInfo.hasMustTailInVarArgFunc()) 1675 forwardMustTailParameters(Chain); 1676 } 1677 1678 SDValue X86TargetLowering::LowerFormalArguments( 1679 SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, 1680 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 1681 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 1682 MachineFunction &MF = DAG.getMachineFunction(); 1683 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1684 1685 const Function &F = MF.getFunction(); 1686 if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() && 1687 F.getName() == "main") 1688 FuncInfo->setForceFramePointer(true); 1689 1690 MachineFrameInfo &MFI = MF.getFrameInfo(); 1691 bool Is64Bit = Subtarget.is64Bit(); 1692 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv); 1693 1694 assert( 1695 !(IsVarArg && canGuaranteeTCO(CallConv)) && 1696 "Var args not supported with calling conv' regcall, fastcc, ghc or hipe"); 1697 1698 // Assign locations to all of the incoming arguments. 1699 SmallVector<CCValAssign, 16> ArgLocs; 1700 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); 1701 1702 // Allocate shadow area for Win64. 1703 if (IsWin64) 1704 CCInfo.AllocateStack(32, Align(8)); 1705 1706 CCInfo.AnalyzeArguments(Ins, CC_X86); 1707 1708 // In vectorcall calling convention a second pass is required for the HVA 1709 // types. 
1710 if (CallingConv::X86_VectorCall == CallConv) { 1711 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86); 1712 } 1713 1714 // The next loop assumes that the locations are in the same order of the 1715 // input arguments. 1716 assert(isSortedByValueNo(ArgLocs) && 1717 "Argument Location list must be sorted before lowering"); 1718 1719 SDValue ArgValue; 1720 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E; 1721 ++I, ++InsIndex) { 1722 assert(InsIndex < Ins.size() && "Invalid Ins index"); 1723 CCValAssign &VA = ArgLocs[I]; 1724 1725 if (VA.isRegLoc()) { 1726 EVT RegVT = VA.getLocVT(); 1727 if (VA.needsCustom()) { 1728 assert( 1729 VA.getValVT() == MVT::v64i1 && 1730 "Currently the only custom case is when we split v64i1 to 2 regs"); 1731 1732 // v64i1 values, in regcall calling convention, that are 1733 // compiled to 32 bit arch, are split up into two registers. 1734 ArgValue = 1735 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget); 1736 } else { 1737 const TargetRegisterClass *RC; 1738 if (RegVT == MVT::i8) 1739 RC = &X86::GR8RegClass; 1740 else if (RegVT == MVT::i16) 1741 RC = &X86::GR16RegClass; 1742 else if (RegVT == MVT::i32) 1743 RC = &X86::GR32RegClass; 1744 else if (Is64Bit && RegVT == MVT::i64) 1745 RC = &X86::GR64RegClass; 1746 else if (RegVT == MVT::f16) 1747 RC = Subtarget.hasAVX512() ? &X86::FR16XRegClass : &X86::FR16RegClass; 1748 else if (RegVT == MVT::f32) 1749 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass; 1750 else if (RegVT == MVT::f64) 1751 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass; 1752 else if (RegVT == MVT::f80) 1753 RC = &X86::RFP80RegClass; 1754 else if (RegVT == MVT::f128) 1755 RC = &X86::VR128RegClass; 1756 else if (RegVT.is512BitVector()) 1757 RC = &X86::VR512RegClass; 1758 else if (RegVT.is256BitVector()) 1759 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass; 1760 else if (RegVT.is128BitVector()) 1761 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass; 1762 else if (RegVT == MVT::x86mmx) 1763 RC = &X86::VR64RegClass; 1764 else if (RegVT == MVT::v1i1) 1765 RC = &X86::VK1RegClass; 1766 else if (RegVT == MVT::v8i1) 1767 RC = &X86::VK8RegClass; 1768 else if (RegVT == MVT::v16i1) 1769 RC = &X86::VK16RegClass; 1770 else if (RegVT == MVT::v32i1) 1771 RC = &X86::VK32RegClass; 1772 else if (RegVT == MVT::v64i1) 1773 RC = &X86::VK64RegClass; 1774 else 1775 llvm_unreachable("Unknown argument type!"); 1776 1777 Register Reg = MF.addLiveIn(VA.getLocReg(), RC); 1778 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 1779 } 1780 1781 // If this is an 8 or 16-bit value, it is really passed promoted to 32 1782 // bits. Insert an assert[sz]ext to capture this, then truncate to the 1783 // right size. 1784 if (VA.getLocInfo() == CCValAssign::SExt) 1785 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 1786 DAG.getValueType(VA.getValVT())); 1787 else if (VA.getLocInfo() == CCValAssign::ZExt) 1788 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 1789 DAG.getValueType(VA.getValVT())); 1790 else if (VA.getLocInfo() == CCValAssign::BCvt) 1791 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue); 1792 1793 if (VA.isExtInLoc()) { 1794 // Handle MMX values passed in XMM regs. 
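        // An x86mmx value that arrived in an XMM register is moved back with
        // MOVDQ2Q; promoted i1 mask vectors are re-expanded by
        // lowerRegToMasks, and anything else is truncated to the value type.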
1795 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1) 1796 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue); 1797 else if (VA.getValVT().isVector() && 1798 VA.getValVT().getScalarType() == MVT::i1 && 1799 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) || 1800 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) { 1801 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8 1802 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG); 1803 } else 1804 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 1805 } 1806 } else { 1807 assert(VA.isMemLoc()); 1808 ArgValue = 1809 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex); 1810 } 1811 1812 // If value is passed via pointer - do a load. 1813 if (VA.getLocInfo() == CCValAssign::Indirect && 1814 !(Ins[I].Flags.isByVal() && VA.isRegLoc())) { 1815 ArgValue = 1816 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo()); 1817 } 1818 1819 InVals.push_back(ArgValue); 1820 } 1821 1822 for (unsigned I = 0, E = Ins.size(); I != E; ++I) { 1823 if (Ins[I].Flags.isSwiftAsync()) { 1824 auto X86FI = MF.getInfo<X86MachineFunctionInfo>(); 1825 if (X86::isExtendedSwiftAsyncFrameSupported(Subtarget, MF)) 1826 X86FI->setHasSwiftAsyncContext(true); 1827 else { 1828 int PtrSize = Subtarget.is64Bit() ? 8 : 4; 1829 int FI = 1830 MF.getFrameInfo().CreateStackObject(PtrSize, Align(PtrSize), false); 1831 X86FI->setSwiftAsyncContextFrameIdx(FI); 1832 SDValue St = DAG.getStore( 1833 DAG.getEntryNode(), dl, InVals[I], 1834 DAG.getFrameIndex(FI, PtrSize == 8 ? MVT::i64 : MVT::i32), 1835 MachinePointerInfo::getFixedStack(MF, FI)); 1836 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, St, Chain); 1837 } 1838 } 1839 1840 // Swift calling convention does not require we copy the sret argument 1841 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift. 1842 if (CallConv == CallingConv::Swift || CallConv == CallingConv::SwiftTail) 1843 continue; 1844 1845 // All x86 ABIs require that for returning structs by value we copy the 1846 // sret argument into %rax/%eax (depending on ABI) for the return. Save 1847 // the argument into a virtual register so that we can access it from the 1848 // return points. 1849 if (Ins[I].Flags.isSRet()) { 1850 assert(!FuncInfo->getSRetReturnReg() && 1851 "SRet return has already been set"); 1852 MVT PtrTy = getPointerTy(DAG.getDataLayout()); 1853 Register Reg = 1854 MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); 1855 FuncInfo->setSRetReturnReg(Reg); 1856 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]); 1857 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); 1858 break; 1859 } 1860 } 1861 1862 unsigned StackSize = CCInfo.getStackSize(); 1863 // Align stack specially for tail calls. 1864 if (shouldGuaranteeTCO(CallConv, 1865 MF.getTarget().Options.GuaranteedTailCallOpt)) 1866 StackSize = GetAlignedArgumentStackSize(StackSize, DAG); 1867 1868 if (IsVarArg) 1869 VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo) 1870 .lowerVarArgsParameters(Chain, StackSize); 1871 1872 // Some CCs need callee pop. 1873 if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg, 1874 MF.getTarget().Options.GuaranteedTailCallOpt)) { 1875 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. 
1876 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) { 1877 // X86 interrupts must pop the error code (and the alignment padding) if 1878 // present. 1879 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4); 1880 } else { 1881 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. 1882 // If this is an sret function, the return should pop the hidden pointer. 1883 if (!canGuaranteeTCO(CallConv) && hasCalleePopSRet(Ins, Subtarget)) 1884 FuncInfo->setBytesToPopOnReturn(4); 1885 } 1886 1887 if (!Is64Bit) { 1888 // RegSaveFrameIndex is X86-64 only. 1889 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); 1890 } 1891 1892 FuncInfo->setArgumentStackSize(StackSize); 1893 1894 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) { 1895 EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn()); 1896 if (Personality == EHPersonality::CoreCLR) { 1897 assert(Is64Bit); 1898 // TODO: Add a mechanism to frame lowering that will allow us to indicate 1899 // that we'd prefer this slot be allocated towards the bottom of the frame 1900 // (i.e. near the stack pointer after allocating the frame). Every 1901 // funclet needs a copy of this slot in its (mostly empty) frame, and the 1902 // offset from the bottom of this and each funclet's frame must be the 1903 // same, so the size of funclets' (mostly empty) frames is dictated by 1904 // how far this slot is from the bottom (since they allocate just enough 1905 // space to accommodate holding this slot at the correct offset). 1906 int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false); 1907 EHInfo->PSPSymFrameIdx = PSPSymFI; 1908 } 1909 } 1910 1911 if (shouldDisableArgRegFromCSR(CallConv) || 1912 F.hasFnAttribute("no_caller_saved_registers")) { 1913 MachineRegisterInfo &MRI = MF.getRegInfo(); 1914 for (std::pair<MCRegister, Register> Pair : MRI.liveins()) 1915 MRI.disableCalleeSavedRegister(Pair.first); 1916 } 1917 1918 if (CallingConv::PreserveNone == CallConv) 1919 for (unsigned I = 0, E = Ins.size(); I != E; ++I) { 1920 if (Ins[I].Flags.isSwiftSelf() || Ins[I].Flags.isSwiftAsync() || 1921 Ins[I].Flags.isSwiftError()) { 1922 errorUnsupported(DAG, dl, 1923 "Swift attributes can't be used with preserve_none"); 1924 break; 1925 } 1926 } 1927 1928 return Chain; 1929 } 1930 1931 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, 1932 SDValue Arg, const SDLoc &dl, 1933 SelectionDAG &DAG, 1934 const CCValAssign &VA, 1935 ISD::ArgFlagsTy Flags, 1936 bool isByVal) const { 1937 unsigned LocMemOffset = VA.getLocMemOffset(); 1938 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); 1939 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), 1940 StackPtr, PtrOff); 1941 if (isByVal) 1942 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); 1943 1944 MaybeAlign Alignment; 1945 if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() && 1946 Arg.getSimpleValueType() != MVT::f80) 1947 Alignment = MaybeAlign(4); 1948 return DAG.getStore( 1949 Chain, dl, Arg, PtrOff, 1950 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset), 1951 Alignment); 1952 } 1953 1954 /// Emit a load of return address if tail call 1955 /// optimization is performed and it is required. 1956 SDValue X86TargetLowering::EmitTailCallLoadRetAddr( 1957 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall, 1958 bool Is64Bit, int FPDiff, const SDLoc &dl) const { 1959 // Adjust the Return address stack slot. 
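  // The old return address is reloaded here so that EmitTailCallStoreRetAddr
  // can later write it to the slot shifted by FPDiff, keeping the return path
  // to our caller intact.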
1960 EVT VT = getPointerTy(DAG.getDataLayout()); 1961 OutRetAddr = getReturnAddressFrameIndex(DAG); 1962 1963 // Load the "old" Return address. 1964 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo()); 1965 return SDValue(OutRetAddr.getNode(), 1); 1966 } 1967 1968 /// Emit a store of the return address if tail call 1969 /// optimization is performed and it is required (FPDiff!=0). 1970 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF, 1971 SDValue Chain, SDValue RetAddrFrIdx, 1972 EVT PtrVT, unsigned SlotSize, 1973 int FPDiff, const SDLoc &dl) { 1974 // Store the return address to the appropriate stack slot. 1975 if (!FPDiff) return Chain; 1976 // Calculate the new stack slot for the return address. 1977 int NewReturnAddrFI = 1978 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize, 1979 false); 1980 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT); 1981 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, 1982 MachinePointerInfo::getFixedStack( 1983 DAG.getMachineFunction(), NewReturnAddrFI)); 1984 return Chain; 1985 } 1986 1987 /// Returns a vector_shuffle mask for an movs{s|d}, movd 1988 /// operation of specified width. 1989 SDValue X86TargetLowering::getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, 1990 SDValue V1, SDValue V2) const { 1991 unsigned NumElems = VT.getVectorNumElements(); 1992 SmallVector<int, 8> Mask; 1993 Mask.push_back(NumElems); 1994 for (unsigned i = 1; i != NumElems; ++i) 1995 Mask.push_back(i); 1996 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask); 1997 } 1998 1999 SDValue 2000 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, 2001 SmallVectorImpl<SDValue> &InVals) const { 2002 SelectionDAG &DAG = CLI.DAG; 2003 SDLoc &dl = CLI.DL; 2004 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; 2005 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; 2006 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; 2007 SDValue Chain = CLI.Chain; 2008 SDValue Callee = CLI.Callee; 2009 CallingConv::ID CallConv = CLI.CallConv; 2010 bool &isTailCall = CLI.IsTailCall; 2011 bool isVarArg = CLI.IsVarArg; 2012 const auto *CB = CLI.CB; 2013 2014 MachineFunction &MF = DAG.getMachineFunction(); 2015 bool Is64Bit = Subtarget.is64Bit(); 2016 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv); 2017 bool IsSibcall = false; 2018 bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt || 2019 CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail; 2020 bool IsCalleePopSRet = !IsGuaranteeTCO && hasCalleePopSRet(Outs, Subtarget); 2021 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>(); 2022 bool HasNCSR = (CB && isa<CallInst>(CB) && 2023 CB->hasFnAttr("no_caller_saved_registers")); 2024 bool HasNoCfCheck = (CB && CB->doesNoCfCheck()); 2025 bool IsIndirectCall = (CB && isa<CallInst>(CB) && CB->isIndirectCall()); 2026 bool IsCFICall = IsIndirectCall && CLI.CFIType; 2027 const Module *M = MF.getFunction().getParent(); 2028 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch"); 2029 2030 MachineFunction::CallSiteInfo CSInfo; 2031 if (CallConv == CallingConv::X86_INTR) 2032 report_fatal_error("X86 interrupts may not be called directly"); 2033 2034 // Analyze operands of the call, assigning locations to each operand. 2035 SmallVector<CCValAssign, 16> ArgLocs; 2036 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); 2037 2038 // Allocate shadow area for Win64. 
2039   if (IsWin64)
2040     CCInfo.AllocateStack(32, Align(8));
2041 
2042   CCInfo.AnalyzeArguments(Outs, CC_X86);
2043 
2044   // In the vectorcall calling convention a second pass is required for the
2045   // HVA types.
2046   if (CallingConv::X86_VectorCall == CallConv) {
2047     CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
2048   }
2049 
2050   bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
2051   if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO && !IsMustTail) {
2052     // If we are using a GOT, disable tail calls to external symbols with
2053     // default visibility. Tail calling such a symbol requires using a GOT
2054     // relocation, which forces early binding of the symbol. This breaks code
2055     // that requires lazy function symbol resolution. Using musttail or
2056     // GuaranteedTailCallOpt will override this.
2057     GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2058     if (!G || (!G->getGlobal()->hasLocalLinkage() &&
2059                G->getGlobal()->hasDefaultVisibility()))
2060       isTailCall = false;
2061   }
2062 
2063   if (isTailCall && !IsMustTail) {
2064     // Check if it's really possible to do a tail call.
2065     isTailCall = IsEligibleForTailCallOptimization(CLI, CCInfo, ArgLocs,
2066                                                    IsCalleePopSRet);
2067 
2068     // Sibcalls are automatically detected tailcalls which do not require
2069     // ABI changes.
2070     if (!IsGuaranteeTCO && isTailCall)
2071       IsSibcall = true;
2072 
2073     if (isTailCall)
2074       ++NumTailCalls;
2075   }
2076 
2077   if (IsMustTail && !isTailCall)
2078     report_fatal_error("failed to perform tail call elimination on a call "
2079                        "site marked musttail");
2080 
2081   assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
2082          "Var args not supported with calling convention fastcc, ghc or hipe");
2083 
2084   // Get a count of how many bytes are to be pushed on the stack.
2085   unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
2086   if (IsSibcall)
2087     // This is a sibcall. The memory operands are already available in our own
2088     // incoming argument area on the stack, so nothing needs to be pushed.
2089     NumBytes = 0;
2090   else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
2091     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
2092 
2093   int FPDiff = 0;
2094   if (isTailCall &&
2095       shouldGuaranteeTCO(CallConv,
2096                          MF.getTarget().Options.GuaranteedTailCallOpt)) {
2097     // Lower arguments at fp - stackoffset + fpdiff.
2098     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
2099 
2100     FPDiff = NumBytesCallerPushed - NumBytes;
2101 
2102     // Record how far the return-address stack slot moves, but only if this
2103     // delta moves it further than any previously recorded delta.
2104     if (FPDiff < X86Info->getTCReturnAddrDelta())
2105       X86Info->setTCReturnAddrDelta(FPDiff);
2106   }
2107 
2108   unsigned NumBytesToPush = NumBytes;
2109   unsigned NumBytesToPop = NumBytes;
2110 
2111   // If we have an inalloca argument, all stack space has already been allocated
2112   // for us and sits right at the top of the stack. We don't support multiple
2113   // arguments passed in memory when using inalloca.
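  // With inalloca the caller has already carved out the argument block with a
  // separate stack allocation, so nothing extra is pushed for the call itself
  // (NumBytesToPush is forced to 0 below).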
2114   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
2115     NumBytesToPush = 0;
2116     if (!ArgLocs.back().isMemLoc())
2117       report_fatal_error("cannot use inalloca attribute on a register "
2118                          "parameter");
2119     if (ArgLocs.back().getLocMemOffset() != 0)
2120       report_fatal_error("any parameter with the inalloca attribute must be "
2121                          "the only memory argument");
2122   } else if (CLI.IsPreallocated) {
2123     assert(ArgLocs.back().isMemLoc() &&
2124            "cannot use preallocated attribute on a register "
2125            "parameter");
2126     SmallVector<size_t, 4> PreallocatedOffsets;
2127     for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
2128       if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {
2129         PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset());
2130       }
2131     }
2132     auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
2133     size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB);
2134     MFI->setPreallocatedStackSize(PreallocatedId, NumBytes);
2135     MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);
2136     NumBytesToPush = 0;
2137   }
2138 
2139   if (!IsSibcall && !IsMustTail)
2140     Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
2141                                  NumBytes - NumBytesToPush, dl);
2142 
2143   SDValue RetAddrFrIdx;
2144   // Load return address for tail calls.
2145   if (isTailCall && FPDiff)
2146     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
2147                                     Is64Bit, FPDiff, dl);
2148 
2149   SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
2150   SmallVector<SDValue, 8> MemOpChains;
2151   SDValue StackPtr;
2152 
2153   // The next loop assumes that the locations are in the same order as the
2154   // input arguments.
2155   assert(isSortedByValueNo(ArgLocs) &&
2156          "Argument Location list must be sorted before lowering");
2157 
2158   // Walk the register/memloc assignments, inserting copies/loads. In the case
2159   // of tail call optimization, arguments are handled later.
2160   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2161   for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
2162        ++I, ++OutIndex) {
2163     assert(OutIndex < Outs.size() && "Invalid Out index");
2164     // Skip inalloca/preallocated arguments; they have already been written.
2165     ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
2166     if (Flags.isInAlloca() || Flags.isPreallocated())
2167       continue;
2168 
2169     CCValAssign &VA = ArgLocs[I];
2170     EVT RegVT = VA.getLocVT();
2171     SDValue Arg = OutVals[OutIndex];
2172     bool isByVal = Flags.isByVal();
2173 
2174     // Promote the value if needed.
2175     switch (VA.getLocInfo()) {
2176     default: llvm_unreachable("Unknown loc info!");
2177     case CCValAssign::Full: break;
2178     case CCValAssign::SExt:
2179       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
2180       break;
2181     case CCValAssign::ZExt:
2182       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
2183       break;
2184     case CCValAssign::AExt:
2185       if (Arg.getValueType().isVector() &&
2186           Arg.getValueType().getVectorElementType() == MVT::i1)
2187         Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
2188       else if (RegVT.is128BitVector()) {
2189         // Special case: passing MMX values in XMM registers.
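        // The MMX value is widened by bitcasting it to i64, inserting it into
        // lane 0 of a v2i64, and then using a MOVL-style shuffle that keeps
        // lane 0 from the value and leaves the upper lane undefined.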
2190         Arg = DAG.getBitcast(MVT::i64, Arg);
2191         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
2192         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
2193       } else
2194         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
2195       break;
2196     case CCValAssign::BCvt:
2197       Arg = DAG.getBitcast(RegVT, Arg);
2198       break;
2199     case CCValAssign::Indirect: {
2200       if (isByVal) {
2201         // Memcpy the argument to a temporary stack slot to prevent
2202         // the caller from seeing any modifications the callee may make
2203         // as guaranteed by the `byval` attribute.
2204         int FrameIdx = MF.getFrameInfo().CreateStackObject(
2205             Flags.getByValSize(),
2206             std::max(Align(16), Flags.getNonZeroByValAlign()), false);
2207         SDValue StackSlot =
2208             DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
2209         Chain =
2210             CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
2211         // From now on treat this as a regular pointer.
2212         Arg = StackSlot;
2213         isByVal = false;
2214       } else {
2215         // Store the argument.
2216         SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
2217         int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
2218         Chain = DAG.getStore(
2219             Chain, dl, Arg, SpillSlot,
2220             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2221         Arg = SpillSlot;
2222       }
2223       break;
2224     }
2225     }
2226 
2227     if (VA.needsCustom()) {
2228       assert(VA.getValVT() == MVT::v64i1 &&
2229              "Currently the only custom case is when we split v64i1 to 2 regs");
2230       // Split v64i1 value into two registers.
2231       Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
2232     } else if (VA.isRegLoc()) {
2233       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2234       const TargetOptions &Options = DAG.getTarget().Options;
2235       if (Options.EmitCallSiteInfo)
2236         CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), I);
2237       if (isVarArg && IsWin64) {
2238         // The Win64 ABI requires an argument XMM register to be copied to the
2239         // corresponding shadow GPR if the callee is a varargs function.
2240         Register ShadowReg;
2241         switch (VA.getLocReg()) {
2242         case X86::XMM0: ShadowReg = X86::RCX; break;
2243         case X86::XMM1: ShadowReg = X86::RDX; break;
2244         case X86::XMM2: ShadowReg = X86::R8; break;
2245         case X86::XMM3: ShadowReg = X86::R9; break;
2246         }
2247         if (ShadowReg)
2248           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
2249       }
2250     } else if (!IsSibcall && (!isTailCall || isByVal)) {
2251       assert(VA.isMemLoc());
2252       if (!StackPtr.getNode())
2253         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
2254                                       getPointerTy(DAG.getDataLayout()));
2255       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
2256                                              dl, DAG, VA, Flags, isByVal));
2257     }
2258   }
2259 
2260   if (!MemOpChains.empty())
2261     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2262 
2263   if (Subtarget.isPICStyleGOT()) {
2264     // ELF / PIC requires the GOT address to be in the EBX register before
2265     // function calls made via the PLT (except for regcall).
2266     if (!isTailCall) {
2267       // An indirect call with the RegCall calling convention may use up all
2268       // the general-purpose registers, so it is not suitable to pin EBX to
2269       // the GOT address; just let the register allocator handle it.
2270 if (CallConv != CallingConv::X86_RegCall) 2271 RegsToPass.push_back(std::make_pair( 2272 Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), 2273 getPointerTy(DAG.getDataLayout())))); 2274 } else { 2275 // If we are tail calling and generating PIC/GOT style code load the 2276 // address of the callee into ECX. The value in ecx is used as target of 2277 // the tail jump. This is done to circumvent the ebx/callee-saved problem 2278 // for tail calls on PIC/GOT architectures. Normally we would just put the 2279 // address of GOT into ebx and then call target@PLT. But for tail calls 2280 // ebx would be restored (since ebx is callee saved) before jumping to the 2281 // target@PLT. 2282 2283 // Note: The actual moving to ECX is done further down. 2284 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); 2285 if (G && !G->getGlobal()->hasLocalLinkage() && 2286 G->getGlobal()->hasDefaultVisibility()) 2287 Callee = LowerGlobalAddress(Callee, DAG); 2288 else if (isa<ExternalSymbolSDNode>(Callee)) 2289 Callee = LowerExternalSymbol(Callee, DAG); 2290 } 2291 } 2292 2293 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail && 2294 (Subtarget.hasSSE1() || !M->getModuleFlag("SkipRaxSetup"))) { 2295 // From AMD64 ABI document: 2296 // For calls that may call functions that use varargs or stdargs 2297 // (prototype-less calls or calls to functions containing ellipsis (...) in 2298 // the declaration) %al is used as hidden argument to specify the number 2299 // of SSE registers used. The contents of %al do not need to match exactly 2300 // the number of registers, but must be an ubound on the number of SSE 2301 // registers used and is in the range 0 - 8 inclusive. 2302 2303 // Count the number of XMM registers allocated. 2304 static const MCPhysReg XMMArgRegs[] = { 2305 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 2306 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 2307 }; 2308 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs); 2309 assert((Subtarget.hasSSE1() || !NumXMMRegs) 2310 && "SSE registers cannot be used when SSE is disabled"); 2311 RegsToPass.push_back(std::make_pair(Register(X86::AL), 2312 DAG.getConstant(NumXMMRegs, dl, 2313 MVT::i8))); 2314 } 2315 2316 if (isVarArg && IsMustTail) { 2317 const auto &Forwards = X86Info->getForwardedMustTailRegParms(); 2318 for (const auto &F : Forwards) { 2319 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); 2320 RegsToPass.push_back(std::make_pair(F.PReg, Val)); 2321 } 2322 } 2323 2324 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls 2325 // don't need this because the eligibility check rejects calls that require 2326 // shuffling arguments passed in memory. 2327 if (!IsSibcall && isTailCall) { 2328 // Force all the incoming stack arguments to be loaded from the stack 2329 // before any new outgoing arguments are stored to the stack, because the 2330 // outgoing stack slots may alias the incoming argument stack slots, and 2331 // the alias isn't otherwise explicit. This is slightly more conservative 2332 // than necessary, because it means that each store effectively depends 2333 // on every argument instead of just those arguments it would clobber. 
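    // (The token factor returned by getStackArgumentTokenFactor is intended to
    // chain the existing loads from the incoming-argument area ahead of the
    // stores emitted below, per the comment above.)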
2334 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain); 2335 2336 SmallVector<SDValue, 8> MemOpChains2; 2337 SDValue FIN; 2338 int FI = 0; 2339 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E; 2340 ++I, ++OutsIndex) { 2341 CCValAssign &VA = ArgLocs[I]; 2342 2343 if (VA.isRegLoc()) { 2344 if (VA.needsCustom()) { 2345 assert((CallConv == CallingConv::X86_RegCall) && 2346 "Expecting custom case only in regcall calling convention"); 2347 // This means that we are in special case where one argument was 2348 // passed through two register locations - Skip the next location 2349 ++I; 2350 } 2351 2352 continue; 2353 } 2354 2355 assert(VA.isMemLoc()); 2356 SDValue Arg = OutVals[OutsIndex]; 2357 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags; 2358 // Skip inalloca/preallocated arguments. They don't require any work. 2359 if (Flags.isInAlloca() || Flags.isPreallocated()) 2360 continue; 2361 // Create frame index. 2362 int32_t Offset = VA.getLocMemOffset()+FPDiff; 2363 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; 2364 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true); 2365 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); 2366 2367 if (Flags.isByVal()) { 2368 // Copy relative to framepointer. 2369 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl); 2370 if (!StackPtr.getNode()) 2371 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), 2372 getPointerTy(DAG.getDataLayout())); 2373 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), 2374 StackPtr, Source); 2375 2376 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, 2377 ArgChain, 2378 Flags, DAG, dl)); 2379 } else { 2380 // Store relative to framepointer. 2381 MemOpChains2.push_back(DAG.getStore( 2382 ArgChain, dl, Arg, FIN, 2383 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI))); 2384 } 2385 } 2386 2387 if (!MemOpChains2.empty()) 2388 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2); 2389 2390 // Store the return address to the appropriate stack slot. 2391 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, 2392 getPointerTy(DAG.getDataLayout()), 2393 RegInfo->getSlotSize(), FPDiff, dl); 2394 } 2395 2396 // Build a sequence of copy-to-reg nodes chained together with token chain 2397 // and glue operands which copy the outgoing args into registers. 2398 SDValue InGlue; 2399 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2400 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2401 RegsToPass[i].second, InGlue); 2402 InGlue = Chain.getValue(1); 2403 } 2404 2405 if (DAG.getTarget().getCodeModel() == CodeModel::Large) { 2406 assert(Is64Bit && "Large code model is only legal in 64-bit mode."); 2407 // In the 64-bit large code model, we have to make all calls 2408 // through a register, since the call instruction's 32-bit 2409 // pc-relative offset may not be large enough to hold the whole 2410 // address. 2411 } else if (Callee->getOpcode() == ISD::GlobalAddress || 2412 Callee->getOpcode() == ISD::ExternalSymbol) { 2413 // Lower direct calls to global addresses and external symbols. Setting 2414 // ForCall to true here has the effect of removing WrapperRIP when possible 2415 // to allow direct calls to be selected without first materializing the 2416 // address into a register. 
2417 Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true); 2418 } else if (Subtarget.isTarget64BitILP32() && 2419 Callee.getValueType() == MVT::i32) { 2420 // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI 2421 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee); 2422 } 2423 2424 SmallVector<SDValue, 8> Ops; 2425 2426 if (!IsSibcall && isTailCall && !IsMustTail) { 2427 Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, 0, InGlue, dl); 2428 InGlue = Chain.getValue(1); 2429 } 2430 2431 Ops.push_back(Chain); 2432 Ops.push_back(Callee); 2433 2434 if (isTailCall) 2435 Ops.push_back(DAG.getSignedTargetConstant(FPDiff, dl, MVT::i32)); 2436 2437 // Add argument registers to the end of the list so that they are known live 2438 // into the call. 2439 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 2440 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 2441 RegsToPass[i].second.getValueType())); 2442 2443 // Add a register mask operand representing the call-preserved registers. 2444 const uint32_t *Mask = [&]() { 2445 auto AdaptedCC = CallConv; 2446 // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists), 2447 // use X86_INTR calling convention because it has the same CSR mask 2448 // (same preserved registers). 2449 if (HasNCSR) 2450 AdaptedCC = (CallingConv::ID)CallingConv::X86_INTR; 2451 // If NoCalleeSavedRegisters is requested, than use GHC since it happens 2452 // to use the CSR_NoRegs_RegMask. 2453 if (CB && CB->hasFnAttr("no_callee_saved_registers")) 2454 AdaptedCC = (CallingConv::ID)CallingConv::GHC; 2455 return RegInfo->getCallPreservedMask(MF, AdaptedCC); 2456 }(); 2457 assert(Mask && "Missing call preserved mask for calling convention"); 2458 2459 if (MachineOperand::clobbersPhysReg(Mask, RegInfo->getFramePtr())) { 2460 X86Info->setFPClobberedByCall(true); 2461 if (CLI.CB && isa<InvokeInst>(CLI.CB)) 2462 X86Info->setFPClobberedByInvoke(true); 2463 } 2464 if (MachineOperand::clobbersPhysReg(Mask, RegInfo->getBaseRegister())) { 2465 X86Info->setBPClobberedByCall(true); 2466 if (CLI.CB && isa<InvokeInst>(CLI.CB)) 2467 X86Info->setBPClobberedByInvoke(true); 2468 } 2469 2470 // If this is an invoke in a 32-bit function using a funclet-based 2471 // personality, assume the function clobbers all registers. If an exception 2472 // is thrown, the runtime will not restore CSRs. 2473 // FIXME: Model this more precisely so that we can register allocate across 2474 // the normal edge and spill and fill across the exceptional edge. 2475 if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) { 2476 const Function &CallerFn = MF.getFunction(); 2477 EHPersonality Pers = 2478 CallerFn.hasPersonalityFn() 2479 ? classifyEHPersonality(CallerFn.getPersonalityFn()) 2480 : EHPersonality::Unknown; 2481 if (isFuncletEHPersonality(Pers)) 2482 Mask = RegInfo->getNoPreservedMask(); 2483 } 2484 2485 // Define a new register mask from the existing mask. 2486 uint32_t *RegMask = nullptr; 2487 2488 // In some calling conventions we need to remove the used physical registers 2489 // from the reg mask. Create a new RegMask for such calling conventions. 2490 // RegMask for calling conventions that disable only return registers (e.g. 2491 // preserve_most) will be modified later in LowerCallResult. 2492 bool ShouldDisableArgRegs = shouldDisableArgRegFromCSR(CallConv) || HasNCSR; 2493 if (ShouldDisableArgRegs || shouldDisableRetRegFromCSR(CallConv)) { 2494 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); 2495 2496 // Allocate a new Reg Mask and copy Mask. 
2497 RegMask = MF.allocateRegMask(); 2498 unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs()); 2499 memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize); 2500 2501 // Make sure all sub registers of the argument registers are reset 2502 // in the RegMask. 2503 if (ShouldDisableArgRegs) { 2504 for (auto const &RegPair : RegsToPass) 2505 for (MCPhysReg SubReg : TRI->subregs_inclusive(RegPair.first)) 2506 RegMask[SubReg / 32] &= ~(1u << (SubReg % 32)); 2507 } 2508 2509 // Create the RegMask Operand according to our updated mask. 2510 Ops.push_back(DAG.getRegisterMask(RegMask)); 2511 } else { 2512 // Create the RegMask Operand according to the static mask. 2513 Ops.push_back(DAG.getRegisterMask(Mask)); 2514 } 2515 2516 if (InGlue.getNode()) 2517 Ops.push_back(InGlue); 2518 2519 if (isTailCall) { 2520 // We used to do: 2521 //// If this is the first return lowered for this function, add the regs 2522 //// to the liveout set for the function. 2523 // This isn't right, although it's probably harmless on x86; liveouts 2524 // should be computed from returns not tail calls. Consider a void 2525 // function making a tail call to a function returning int. 2526 MF.getFrameInfo().setHasTailCall(); 2527 SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, MVT::Other, Ops); 2528 2529 if (IsCFICall) 2530 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue()); 2531 2532 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge); 2533 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo)); 2534 return Ret; 2535 } 2536 2537 // Returns a chain & a glue for retval copy to use. 2538 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 2539 if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) { 2540 Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops); 2541 } else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) { 2542 // Calls with a "clang.arc.attachedcall" bundle are special. They should be 2543 // expanded to the call, directly followed by a special marker sequence and 2544 // a call to a ObjC library function. Use the CALL_RVMARKER to do that. 2545 assert(!isTailCall && 2546 "tail calls cannot be marked with clang.arc.attachedcall"); 2547 assert(Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode"); 2548 2549 // Add a target global address for the retainRV/claimRV runtime function 2550 // just before the call target. 2551 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB); 2552 auto PtrVT = getPointerTy(DAG.getDataLayout()); 2553 auto GA = DAG.getTargetGlobalAddress(ARCFn, dl, PtrVT); 2554 Ops.insert(Ops.begin() + 1, GA); 2555 Chain = DAG.getNode(X86ISD::CALL_RVMARKER, dl, NodeTys, Ops); 2556 } else { 2557 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops); 2558 } 2559 2560 if (IsCFICall) 2561 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue()); 2562 2563 InGlue = Chain.getValue(1); 2564 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge); 2565 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); 2566 2567 // Save heapallocsite metadata. 2568 if (CLI.CB) 2569 if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite")) 2570 DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc); 2571 2572 // Create the CALLSEQ_END node. 2573 unsigned NumBytesForCalleeToPop = 0; // Callee pops nothing. 
2574   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2575                        DAG.getTarget().Options.GuaranteedTailCallOpt))
2576     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
2577   else if (!canGuaranteeTCO(CallConv) && IsCalleePopSRet)
2578     // If this call passes a struct-return pointer, the callee
2579     // pops that struct pointer.
2580     NumBytesForCalleeToPop = 4;
2581 
2582   // Returns a glue for retval copy to use.
2583   if (!IsSibcall) {
2584     Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, NumBytesForCalleeToPop,
2585                                InGlue, dl);
2586     InGlue = Chain.getValue(1);
2587   }
2588 
2589   if (CallingConv::PreserveNone == CallConv)
2590     for (unsigned I = 0, E = Outs.size(); I != E; ++I) {
2591       if (Outs[I].Flags.isSwiftSelf() || Outs[I].Flags.isSwiftAsync() ||
2592           Outs[I].Flags.isSwiftError()) {
2593         errorUnsupported(DAG, dl,
2594                          "Swift attributes can't be used with preserve_none");
2595         break;
2596       }
2597     }
2598 
2599   // Handle result values, copying them out of physregs into vregs that we
2600   // return.
2601   return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG,
2602                          InVals, RegMask);
2603 }
2604 
2605 //===----------------------------------------------------------------------===//
2606 //                Fast Calling Convention (tail call) implementation
2607 //===----------------------------------------------------------------------===//
2608 
2609 // Like stdcall, the callee cleans up the arguments, except that ECX is
2610 // reserved for storing the address of the tail-called function. Only 2
2611 // registers are free for argument passing (inreg). Tail call optimization is
2612 // performed provided:
2613 //  * tailcallopt is enabled
2614 //  * caller/callee are fastcc
2615 // On X86_64 architecture with GOT-style position independent code only local
2616 // (within module) calls are supported at the moment.
2617 // To keep the stack aligned according to the platform ABI, the function
2618 // GetAlignedArgumentStackSize ensures that the argument delta is always a
2619 // multiple of the stack alignment. (Dynamic linkers such as Darwin's dyld need this.)
2620 // If a tail-called callee has more arguments than the caller, the caller
2621 // needs to make sure that there is room to move the RETADDR to. This is
2622 // achieved by reserving an area the size of the argument delta right after
2623 // the original RETADDR, but before the saved frame pointer or the spilled
2624 // registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4).
2625 // Stack layout:
2626 //   arg1
2627 //   arg2
2628 //   RETADDR
2629 //   [ new RETADDR
2630 //     move area ]
2631 //   (possible EBP)
2632 //   ESI
2633 //   EDI
2634 //   local1 ..
2635 
2636 /// Round StackSize up so that, together with the return-address slot, it is a
2637 /// multiple of the stack alignment, e.g. 16n + 12 for a 16-byte requirement.
2638 unsigned
2639 X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
2640                                                SelectionDAG &DAG) const {
2641   const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();
2642   const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
2643   assert(StackSize % SlotSize == 0 &&
2644          "StackSize must be a multiple of SlotSize");
2645   return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
2646 }
2647 
2648 /// Return true if the given stack call argument is already available at the
2649 /// same (relative) position in the caller's incoming argument stack.
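/// If it is, a sibcall can simply reuse the caller's incoming stack slot
/// instead of storing the value again, so no stack adjustment is needed.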
2650 static 2651 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, 2652 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI, 2653 const X86InstrInfo *TII, const CCValAssign &VA) { 2654 unsigned Bytes = Arg.getValueSizeInBits() / 8; 2655 2656 for (;;) { 2657 // Look through nodes that don't alter the bits of the incoming value. 2658 unsigned Op = Arg.getOpcode(); 2659 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST || 2660 Op == ISD::AssertZext) { 2661 Arg = Arg.getOperand(0); 2662 continue; 2663 } 2664 if (Op == ISD::TRUNCATE) { 2665 const SDValue &TruncInput = Arg.getOperand(0); 2666 if (TruncInput.getOpcode() == ISD::AssertZext && 2667 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() == 2668 Arg.getValueType()) { 2669 Arg = TruncInput.getOperand(0); 2670 continue; 2671 } 2672 } 2673 break; 2674 } 2675 2676 int FI = INT_MAX; 2677 if (Arg.getOpcode() == ISD::CopyFromReg) { 2678 Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); 2679 if (!VR.isVirtual()) 2680 return false; 2681 MachineInstr *Def = MRI->getVRegDef(VR); 2682 if (!Def) 2683 return false; 2684 if (!Flags.isByVal()) { 2685 if (!TII->isLoadFromStackSlot(*Def, FI)) 2686 return false; 2687 } else { 2688 unsigned Opcode = Def->getOpcode(); 2689 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r || 2690 Opcode == X86::LEA64_32r) && 2691 Def->getOperand(1).isFI()) { 2692 FI = Def->getOperand(1).getIndex(); 2693 Bytes = Flags.getByValSize(); 2694 } else 2695 return false; 2696 } 2697 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { 2698 if (Flags.isByVal()) 2699 // ByVal argument is passed in as a pointer but it's now being 2700 // dereferenced. e.g. 2701 // define @foo(%struct.X* %A) { 2702 // tail call @bar(%struct.X* byval %A) 2703 // } 2704 return false; 2705 SDValue Ptr = Ld->getBasePtr(); 2706 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 2707 if (!FINode) 2708 return false; 2709 FI = FINode->getIndex(); 2710 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) { 2711 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg); 2712 FI = FINode->getIndex(); 2713 Bytes = Flags.getByValSize(); 2714 } else 2715 return false; 2716 2717 assert(FI != INT_MAX); 2718 if (!MFI.isFixedObjectIndex(FI)) 2719 return false; 2720 2721 if (Offset != MFI.getObjectOffset(FI)) 2722 return false; 2723 2724 // If this is not byval, check that the argument stack object is immutable. 2725 // inalloca and argument copy elision can create mutable argument stack 2726 // objects. Byval objects can be mutated, but a byval call intends to pass the 2727 // mutated memory. 2728 if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI)) 2729 return false; 2730 2731 if (VA.getLocVT().getFixedSizeInBits() > 2732 Arg.getValueSizeInBits().getFixedValue()) { 2733 // If the argument location is wider than the argument type, check that any 2734 // extension flags match. 2735 if (Flags.isZExt() != MFI.isObjectZExt(FI) || 2736 Flags.isSExt() != MFI.isObjectSExt(FI)) { 2737 return false; 2738 } 2739 } 2740 2741 return Bytes == MFI.getObjectSize(FI); 2742 } 2743 2744 /// Check whether the call is eligible for tail call optimization. Targets 2745 /// that want to do tail call optimization should implement this function. 2746 /// Note that the x86 backend does not check musttail calls for eligibility! The 2747 /// rest of x86 tail call lowering must be prepared to forward arguments of any 2748 /// type. 
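/// In the sibcall case (GuaranteedTailCallOpt off) the checks below require,
/// roughly: compatible calling conventions and Win64-ness, no dynamic stack
/// realignment, no sret copy in the caller, matching callee-pop behaviour, and
/// every stack argument already located at its final offset in the caller's
/// frame.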
2749 bool X86TargetLowering::IsEligibleForTailCallOptimization( 2750 TargetLowering::CallLoweringInfo &CLI, CCState &CCInfo, 2751 SmallVectorImpl<CCValAssign> &ArgLocs, bool IsCalleePopSRet) const { 2752 SelectionDAG &DAG = CLI.DAG; 2753 const SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; 2754 const SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; 2755 const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; 2756 SDValue Callee = CLI.Callee; 2757 CallingConv::ID CalleeCC = CLI.CallConv; 2758 bool isVarArg = CLI.IsVarArg; 2759 2760 if (!mayTailCallThisCC(CalleeCC)) 2761 return false; 2762 2763 // If -tailcallopt is specified, make fastcc functions tail-callable. 2764 MachineFunction &MF = DAG.getMachineFunction(); 2765 const Function &CallerF = MF.getFunction(); 2766 2767 // If the function return type is x86_fp80 and the callee return type is not, 2768 // then the FP_EXTEND of the call result is not a nop. It's not safe to 2769 // perform a tailcall optimization here. 2770 if (CallerF.getReturnType()->isX86_FP80Ty() && !CLI.RetTy->isX86_FP80Ty()) 2771 return false; 2772 2773 CallingConv::ID CallerCC = CallerF.getCallingConv(); 2774 bool CCMatch = CallerCC == CalleeCC; 2775 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC); 2776 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC); 2777 bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt || 2778 CalleeCC == CallingConv::Tail || CalleeCC == CallingConv::SwiftTail; 2779 2780 // Win64 functions have extra shadow space for argument homing. Don't do the 2781 // sibcall if the caller and callee have mismatched expectations for this 2782 // space. 2783 if (IsCalleeWin64 != IsCallerWin64) 2784 return false; 2785 2786 if (IsGuaranteeTCO) { 2787 if (canGuaranteeTCO(CalleeCC) && CCMatch) 2788 return true; 2789 return false; 2790 } 2791 2792 // Look for obvious safe cases to perform tail call optimization that do not 2793 // require ABI changes. This is what gcc calls sibcall. 2794 2795 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to 2796 // emit a special epilogue. 2797 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); 2798 if (RegInfo->hasStackRealignment(MF)) 2799 return false; 2800 2801 // Also avoid sibcall optimization if we're an sret return fn and the callee 2802 // is incompatible. See comment in LowerReturn about why hasStructRetAttr is 2803 // insufficient. 2804 if (MF.getInfo<X86MachineFunctionInfo>()->getSRetReturnReg()) { 2805 // For a compatible tail call the callee must return our sret pointer. So it 2806 // needs to be (a) an sret function itself and (b) we pass our sret as its 2807 // sret. Condition #b is harder to determine. 2808 return false; 2809 } else if (IsCalleePopSRet) 2810 // The callee pops an sret, so we cannot tail-call, as our caller doesn't 2811 // expect that. 2812 return false; 2813 2814 // Do not sibcall optimize vararg calls unless all arguments are passed via 2815 // registers. 2816 LLVMContext &C = *DAG.getContext(); 2817 if (isVarArg && !Outs.empty()) { 2818 // Optimizing for varargs on Win64 is unlikely to be safe without 2819 // additional testing. 2820 if (IsCalleeWin64 || IsCallerWin64) 2821 return false; 2822 2823 for (const auto &VA : ArgLocs) 2824 if (!VA.isRegLoc()) 2825 return false; 2826 } 2827 2828 // If the call result is in ST0 / ST1, it needs to be popped off the x87 2829 // stack. Therefore, if it's not used by the call it is not safe to optimize 2830 // this into a sibcall. 
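  // (After a tail call our function has already returned, so nothing would pop
  // an ignored ST0/ST1 result and the x87 register stack would be left
  // unbalanced.)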
  bool Unused = false;
  for (const auto &In : Ins) {
    if (!In.Used) {
      Unused = true;
      break;
    }
  }
  if (Unused) {
    SmallVector<CCValAssign, 16> RVLocs;
    CCState RVCCInfo(CalleeCC, false, MF, RVLocs, C);
    RVCCInfo.AnalyzeCallResult(Ins, RetCC_X86);
    for (const auto &VA : RVLocs) {
      if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
        return false;
    }
  }

  // Check that the call results are passed in the same way.
  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, RetCC_X86,
                                  RetCC_X86))
    return false;
  // The callee has to preserve all registers the caller needs to preserve.
  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  if (!CCMatch) {
    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
      return false;
  }

  // The caller's stack frame cannot be replaced by the tail callee's if the
  // caller is required to preserve all registers. Conservatively prevent tail
  // call optimization even if hypothetically all the registers are used for
  // passing formal parameters or returning values.
  if (CallerF.hasFnAttribute("no_caller_saved_registers"))
    return false;

  unsigned StackArgsSize = CCInfo.getStackSize();

  // If the callee takes no arguments then go on to check the results of the
  // call.
  if (!Outs.empty()) {
    if (StackArgsSize > 0) {
      // Check if the arguments are already laid out in the right way as
      // the caller's fixed stack objects.
      MachineFrameInfo &MFI = MF.getFrameInfo();
      const MachineRegisterInfo *MRI = &MF.getRegInfo();
      const X86InstrInfo *TII = Subtarget.getInstrInfo();
      for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
        const CCValAssign &VA = ArgLocs[I];
        SDValue Arg = OutVals[I];
        ISD::ArgFlagsTy Flags = Outs[I].Flags;
        if (VA.getLocInfo() == CCValAssign::Indirect)
          return false;
        if (!VA.isRegLoc()) {
          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, MFI, MRI,
                                   TII, VA))
            return false;
        }
      }
    }

    bool PositionIndependent = isPositionIndependent();
    // If the tailcall address may be in a register, then make sure it's
    // possible to register allocate for it. In 32-bit, the call address can
    // only target EAX, EDX, or ECX since the tail call must be scheduled after
    // callee-saved registers are restored. These happen to be the same
    // registers used to pass 'inreg' arguments so watch out for those.
    if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
                                  !isa<ExternalSymbolSDNode>(Callee)) ||
                                 PositionIndependent)) {
      unsigned NumInRegs = 0;
      // In PIC we need an extra register to formulate the address computation
      // for the callee.
      unsigned MaxInRegs = PositionIndependent ? 2 : 3;
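      // E.g. (illustrative) under PIC, MaxInRegs is 2, so at most one of
      // EAX/ECX/EDX may carry an argument: an indirect call that passes two
      // inreg (fastcall-style) arguments in ECX and EDX would leave no
      // register for both the call target and the PIC base, so the loop below
      // refuses the tail call.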
      for (const auto &VA : ArgLocs) {
        if (!VA.isRegLoc())
          continue;
        Register Reg = VA.getLocReg();
        switch (Reg) {
        default: break;
        case X86::EAX: case X86::EDX: case X86::ECX:
          if (++NumInRegs == MaxInRegs)
            return false;
          break;
        }
      }
    }

    const MachineRegisterInfo &MRI = MF.getRegInfo();
    if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
      return false;
  }

  bool CalleeWillPop =
      X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
                       MF.getTarget().Options.GuaranteedTailCallOpt);

  if (unsigned BytesToPop =
          MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
    // If we have bytes to pop, the callee must pop them.
    bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
    if (!CalleePopMatches)
      return false;
  } else if (CalleeWillPop && StackArgsSize > 0) {
    // If we don't have bytes to pop, make sure the callee doesn't pop any.
    return false;
  }

  return true;
}

/// Determines whether the callee is required to pop its own arguments.
/// Callee pop is necessary to support tail calls.
bool X86::isCalleePop(CallingConv::ID CallingConv, bool is64Bit, bool IsVarArg,
                      bool GuaranteeTCO) {
  // If GuaranteeTCO is true, we force some calls to be callee pop so that we
  // can guarantee TCO.
  if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
    return true;

  switch (CallingConv) {
  default:
    return false;
  case CallingConv::X86_StdCall:
  case CallingConv::X86_FastCall:
  case CallingConv::X86_ThisCall:
  case CallingConv::X86_VectorCall:
    return !is64Bit;
  }
}
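
// Illustrative note (not from the source): on 32-bit x86 a __stdcall function
// taking two i32 arguments returns with 'ret 8', popping its own 8 bytes of
// stack arguments, which is what makes callee-pop tail calls possible. In
// 64-bit mode these conventions do not pop their arguments, hence the
// !is64Bit above.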