//===-- ARMSelectionDAGInfo.cpp - ARM SelectionDAG Info -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the ARMSelectionDAGInfo class.
//
//===----------------------------------------------------------------------===//
1209467b48Spatrick
1309467b48Spatrick #include "ARMTargetMachine.h"
1473471bf0Spatrick #include "ARMTargetTransformInfo.h"
1509467b48Spatrick #include "llvm/CodeGen/SelectionDAG.h"
1609467b48Spatrick #include "llvm/IR/DerivedTypes.h"
1773471bf0Spatrick #include "llvm/Support/CommandLine.h"
1809467b48Spatrick using namespace llvm;
1909467b48Spatrick
2009467b48Spatrick #define DEBUG_TYPE "arm-selectiondag-info"
2109467b48Spatrick
2273471bf0Spatrick cl::opt<TPLoop::MemTransfer> EnableMemtransferTPLoop(
2373471bf0Spatrick "arm-memtransfer-tploop", cl::Hidden,
2473471bf0Spatrick cl::desc("Control conversion of memcpy to "
2573471bf0Spatrick "Tail predicated loops (WLSTP)"),
2673471bf0Spatrick cl::init(TPLoop::ForceDisabled),
2773471bf0Spatrick cl::values(clEnumValN(TPLoop::ForceDisabled, "force-disabled",
2873471bf0Spatrick "Don't convert memcpy to TP loop."),
2973471bf0Spatrick clEnumValN(TPLoop::ForceEnabled, "force-enabled",
3073471bf0Spatrick "Always convert memcpy to TP loop."),
3173471bf0Spatrick clEnumValN(TPLoop::Allow, "allow",
3273471bf0Spatrick "Allow (may be subject to certain conditions) "
3373471bf0Spatrick "conversion of memcpy to TP loop.")));
3473471bf0Spatrick
3509467b48Spatrick // Emit, if possible, a specialized version of the given Libcall. Typically this
3609467b48Spatrick // means selecting the appropriately aligned version, but we also convert memset
3709467b48Spatrick // of 0 into memclr.
EmitSpecializedLibcall(SelectionDAG & DAG,const SDLoc & dl,SDValue Chain,SDValue Dst,SDValue Src,SDValue Size,unsigned Align,RTLIB::Libcall LC) const3809467b48Spatrick SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
3909467b48Spatrick SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
4009467b48Spatrick SDValue Size, unsigned Align, RTLIB::Libcall LC) const {
4109467b48Spatrick const ARMSubtarget &Subtarget =
4209467b48Spatrick DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
4309467b48Spatrick const ARMTargetLowering *TLI = Subtarget.getTargetLowering();
4409467b48Spatrick
4509467b48Spatrick // Only use a specialized AEABI function if the default version of this
4609467b48Spatrick // Libcall is an AEABI function.
4709467b48Spatrick if (std::strncmp(TLI->getLibcallName(LC), "__aeabi", 7) != 0)
4809467b48Spatrick return SDValue();
4909467b48Spatrick
5009467b48Spatrick // Translate RTLIB::Libcall to AEABILibcall. We only do this in order to be
5109467b48Spatrick // able to translate memset to memclr and use the value to index the function
5209467b48Spatrick // name array.
5309467b48Spatrick enum {
5409467b48Spatrick AEABI_MEMCPY = 0,
5509467b48Spatrick AEABI_MEMMOVE,
5609467b48Spatrick AEABI_MEMSET,
5709467b48Spatrick AEABI_MEMCLR
5809467b48Spatrick } AEABILibcall;
5909467b48Spatrick switch (LC) {
6009467b48Spatrick case RTLIB::MEMCPY:
6109467b48Spatrick AEABILibcall = AEABI_MEMCPY;
6209467b48Spatrick break;
6309467b48Spatrick case RTLIB::MEMMOVE:
6409467b48Spatrick AEABILibcall = AEABI_MEMMOVE;
6509467b48Spatrick break;
6609467b48Spatrick case RTLIB::MEMSET:
6709467b48Spatrick AEABILibcall = AEABI_MEMSET;
6809467b48Spatrick if (ConstantSDNode *ConstantSrc = dyn_cast<ConstantSDNode>(Src))
6909467b48Spatrick if (ConstantSrc->getZExtValue() == 0)
7009467b48Spatrick AEABILibcall = AEABI_MEMCLR;
7109467b48Spatrick break;
7209467b48Spatrick default:
7309467b48Spatrick return SDValue();
7409467b48Spatrick }
7509467b48Spatrick
7609467b48Spatrick // Choose the most-aligned libcall variant that we can
7709467b48Spatrick enum {
7809467b48Spatrick ALIGN1 = 0,
7909467b48Spatrick ALIGN4,
8009467b48Spatrick ALIGN8
8109467b48Spatrick } AlignVariant;
8209467b48Spatrick if ((Align & 7) == 0)
8309467b48Spatrick AlignVariant = ALIGN8;
8409467b48Spatrick else if ((Align & 3) == 0)
8509467b48Spatrick AlignVariant = ALIGN4;
8609467b48Spatrick else
8709467b48Spatrick AlignVariant = ALIGN1;
8809467b48Spatrick
8909467b48Spatrick TargetLowering::ArgListTy Args;
9009467b48Spatrick TargetLowering::ArgListEntry Entry;
9109467b48Spatrick Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
9209467b48Spatrick Entry.Node = Dst;
9309467b48Spatrick Args.push_back(Entry);
9409467b48Spatrick if (AEABILibcall == AEABI_MEMCLR) {
9509467b48Spatrick Entry.Node = Size;
9609467b48Spatrick Args.push_back(Entry);
9709467b48Spatrick } else if (AEABILibcall == AEABI_MEMSET) {
9809467b48Spatrick // Adjust parameters for memset, EABI uses format (ptr, size, value),
9909467b48Spatrick // GNU library uses (ptr, value, size)
10009467b48Spatrick // See RTABI section 4.3.4
10109467b48Spatrick Entry.Node = Size;
10209467b48Spatrick Args.push_back(Entry);
10309467b48Spatrick
10409467b48Spatrick // Extend or truncate the argument to be an i32 value for the call.
10509467b48Spatrick if (Src.getValueType().bitsGT(MVT::i32))
10609467b48Spatrick Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
10709467b48Spatrick else if (Src.getValueType().bitsLT(MVT::i32))
10809467b48Spatrick Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src);
10909467b48Spatrick
11009467b48Spatrick Entry.Node = Src;
11109467b48Spatrick Entry.Ty = Type::getInt32Ty(*DAG.getContext());
11209467b48Spatrick Entry.IsSExt = false;
11309467b48Spatrick Args.push_back(Entry);
11409467b48Spatrick } else {
11509467b48Spatrick Entry.Node = Src;
11609467b48Spatrick Args.push_back(Entry);
11709467b48Spatrick
11809467b48Spatrick Entry.Node = Size;
11909467b48Spatrick Args.push_back(Entry);
12009467b48Spatrick }
12109467b48Spatrick
12209467b48Spatrick char const *FunctionNames[4][3] = {
12309467b48Spatrick { "__aeabi_memcpy", "__aeabi_memcpy4", "__aeabi_memcpy8" },
12409467b48Spatrick { "__aeabi_memmove", "__aeabi_memmove4", "__aeabi_memmove8" },
12509467b48Spatrick { "__aeabi_memset", "__aeabi_memset4", "__aeabi_memset8" },
12609467b48Spatrick { "__aeabi_memclr", "__aeabi_memclr4", "__aeabi_memclr8" }
12709467b48Spatrick };
12809467b48Spatrick TargetLowering::CallLoweringInfo CLI(DAG);
12909467b48Spatrick CLI.setDebugLoc(dl)
13009467b48Spatrick .setChain(Chain)
13109467b48Spatrick .setLibCallee(
13209467b48Spatrick TLI->getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()),
13309467b48Spatrick DAG.getExternalSymbol(FunctionNames[AEABILibcall][AlignVariant],
13409467b48Spatrick TLI->getPointerTy(DAG.getDataLayout())),
13509467b48Spatrick std::move(Args))
13609467b48Spatrick .setDiscardResult();
13709467b48Spatrick std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
13809467b48Spatrick
13909467b48Spatrick return CallResult.second;
14009467b48Spatrick }
14109467b48Spatrick
shouldGenerateInlineTPLoop(const ARMSubtarget & Subtarget,const SelectionDAG & DAG,ConstantSDNode * ConstantSize,Align Alignment,bool IsMemcpy)14273471bf0Spatrick static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget,
14373471bf0Spatrick const SelectionDAG &DAG,
14473471bf0Spatrick ConstantSDNode *ConstantSize,
14573471bf0Spatrick Align Alignment, bool IsMemcpy) {
14673471bf0Spatrick auto &F = DAG.getMachineFunction().getFunction();
14773471bf0Spatrick if (!EnableMemtransferTPLoop)
14873471bf0Spatrick return false;
14973471bf0Spatrick if (EnableMemtransferTPLoop == TPLoop::ForceEnabled)
15073471bf0Spatrick return true;
15173471bf0Spatrick // Do not generate inline TP loop if optimizations is disabled,
15273471bf0Spatrick // or if optimization for size (-Os or -Oz) is on.
15373471bf0Spatrick if (F.hasOptNone() || F.hasOptSize())
15473471bf0Spatrick return false;
15573471bf0Spatrick // If cli option is unset, for memset always generate inline TP.
15673471bf0Spatrick // For memcpy, check some conditions
15773471bf0Spatrick if (!IsMemcpy)
15873471bf0Spatrick return true;
15973471bf0Spatrick if (!ConstantSize && Alignment >= Align(4))
16073471bf0Spatrick return true;
16173471bf0Spatrick if (ConstantSize &&
16273471bf0Spatrick ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold() &&
16373471bf0Spatrick ConstantSize->getZExtValue() <
16473471bf0Spatrick Subtarget.getMaxMemcpyTPInlineSizeThreshold())
16573471bf0Spatrick return true;
16673471bf0Spatrick return false;
16773471bf0Spatrick }
16873471bf0Spatrick
EmitTargetCodeForMemcpy(SelectionDAG & DAG,const SDLoc & dl,SDValue Chain,SDValue Dst,SDValue Src,SDValue Size,Align Alignment,bool isVolatile,bool AlwaysInline,MachinePointerInfo DstPtrInfo,MachinePointerInfo SrcPtrInfo) const16909467b48Spatrick SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
17009467b48Spatrick SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
171097a140dSpatrick SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
17209467b48Spatrick MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
17309467b48Spatrick const ARMSubtarget &Subtarget =
17409467b48Spatrick DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
17573471bf0Spatrick ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
17673471bf0Spatrick
17773471bf0Spatrick if (Subtarget.hasMVEIntegerOps() &&
17873471bf0Spatrick shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment, true))
17973471bf0Spatrick return DAG.getNode(ARMISD::MEMCPYLOOP, dl, MVT::Other, Chain, Dst, Src,
18073471bf0Spatrick DAG.getZExtOrTrunc(Size, dl, MVT::i32));
18173471bf0Spatrick
18209467b48Spatrick // Do repeated 4-byte loads and stores. To be improved.
18309467b48Spatrick // This requires 4-byte alignment.
184097a140dSpatrick if (Alignment < Align(4))
18509467b48Spatrick return SDValue();
18609467b48Spatrick // This requires the copy size to be a constant, preferably
18709467b48Spatrick // within a subtarget-specific limit.
18809467b48Spatrick if (!ConstantSize)
189097a140dSpatrick return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
190097a140dSpatrick Alignment.value(), RTLIB::MEMCPY);
19109467b48Spatrick uint64_t SizeVal = ConstantSize->getZExtValue();
19209467b48Spatrick if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold())
193097a140dSpatrick return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
194097a140dSpatrick Alignment.value(), RTLIB::MEMCPY);
19509467b48Spatrick
19609467b48Spatrick unsigned BytesLeft = SizeVal & 3;
19709467b48Spatrick unsigned NumMemOps = SizeVal >> 2;
19809467b48Spatrick unsigned EmittedNumMemOps = 0;
19909467b48Spatrick EVT VT = MVT::i32;
20009467b48Spatrick unsigned VTSize = 4;
20109467b48Spatrick unsigned i = 0;
20209467b48Spatrick // Emit a maximum of 4 loads in Thumb1 since we have fewer registers
20309467b48Spatrick const unsigned MaxLoadsInLDM = Subtarget.isThumb1Only() ? 4 : 6;
20409467b48Spatrick SDValue TFOps[6];
20509467b48Spatrick SDValue Loads[6];
20609467b48Spatrick uint64_t SrcOff = 0, DstOff = 0;
20709467b48Spatrick
20809467b48Spatrick // FIXME: We should invent a VMEMCPY pseudo-instruction that lowers to
20909467b48Spatrick // VLDM/VSTM and make this code emit it when appropriate. This would reduce
21009467b48Spatrick // pressure on the general purpose registers. However this seems harder to map
21109467b48Spatrick // onto the register allocator's view of the world.
21209467b48Spatrick
21309467b48Spatrick // The number of MEMCPY pseudo-instructions to emit. We use up to
21409467b48Spatrick // MaxLoadsInLDM registers per mcopy, which will get lowered into ldm/stm
21509467b48Spatrick // later on. This is a lower bound on the number of MEMCPY operations we must
21609467b48Spatrick // emit.
21709467b48Spatrick unsigned NumMEMCPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM;
21809467b48Spatrick
21909467b48Spatrick // Code size optimisation: do not inline memcpy if expansion results in
22009467b48Spatrick // more instructions than the libary call.
22109467b48Spatrick if (NumMEMCPYs > 1 && Subtarget.hasMinSize()) {
22209467b48Spatrick return SDValue();
22309467b48Spatrick }
22409467b48Spatrick
22509467b48Spatrick SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other, MVT::Glue);
22609467b48Spatrick
22709467b48Spatrick for (unsigned I = 0; I != NumMEMCPYs; ++I) {
22809467b48Spatrick // Evenly distribute registers among MEMCPY operations to reduce register
22909467b48Spatrick // pressure.
23009467b48Spatrick unsigned NextEmittedNumMemOps = NumMemOps * (I + 1) / NumMEMCPYs;
23109467b48Spatrick unsigned NumRegs = NextEmittedNumMemOps - EmittedNumMemOps;
23209467b48Spatrick
23309467b48Spatrick Dst = DAG.getNode(ARMISD::MEMCPY, dl, VTs, Chain, Dst, Src,
23409467b48Spatrick DAG.getConstant(NumRegs, dl, MVT::i32));
23509467b48Spatrick Src = Dst.getValue(1);
23609467b48Spatrick Chain = Dst.getValue(2);
23709467b48Spatrick
23809467b48Spatrick DstPtrInfo = DstPtrInfo.getWithOffset(NumRegs * VTSize);
23909467b48Spatrick SrcPtrInfo = SrcPtrInfo.getWithOffset(NumRegs * VTSize);
24009467b48Spatrick
24109467b48Spatrick EmittedNumMemOps = NextEmittedNumMemOps;
24209467b48Spatrick }
24309467b48Spatrick
24409467b48Spatrick if (BytesLeft == 0)
24509467b48Spatrick return Chain;
24609467b48Spatrick
24709467b48Spatrick // Issue loads / stores for the trailing (1 - 3) bytes.
24809467b48Spatrick auto getRemainingValueType = [](unsigned BytesLeft) {
24909467b48Spatrick return (BytesLeft >= 2) ? MVT::i16 : MVT::i8;
25009467b48Spatrick };
25109467b48Spatrick auto getRemainingSize = [](unsigned BytesLeft) {
25209467b48Spatrick return (BytesLeft >= 2) ? 2 : 1;
25309467b48Spatrick };
25409467b48Spatrick
25509467b48Spatrick unsigned BytesLeftSave = BytesLeft;
25609467b48Spatrick i = 0;
25709467b48Spatrick while (BytesLeft) {
25809467b48Spatrick VT = getRemainingValueType(BytesLeft);
25909467b48Spatrick VTSize = getRemainingSize(BytesLeft);
26009467b48Spatrick Loads[i] = DAG.getLoad(VT, dl, Chain,
26109467b48Spatrick DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
26209467b48Spatrick DAG.getConstant(SrcOff, dl, MVT::i32)),
26309467b48Spatrick SrcPtrInfo.getWithOffset(SrcOff));
26409467b48Spatrick TFOps[i] = Loads[i].getValue(1);
26509467b48Spatrick ++i;
26609467b48Spatrick SrcOff += VTSize;
26709467b48Spatrick BytesLeft -= VTSize;
26809467b48Spatrick }
269*d415bd75Srobert Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, i));
27009467b48Spatrick
27109467b48Spatrick i = 0;
27209467b48Spatrick BytesLeft = BytesLeftSave;
27309467b48Spatrick while (BytesLeft) {
27409467b48Spatrick VT = getRemainingValueType(BytesLeft);
27509467b48Spatrick VTSize = getRemainingSize(BytesLeft);
27609467b48Spatrick TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
27709467b48Spatrick DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
27809467b48Spatrick DAG.getConstant(DstOff, dl, MVT::i32)),
27909467b48Spatrick DstPtrInfo.getWithOffset(DstOff));
28009467b48Spatrick ++i;
28109467b48Spatrick DstOff += VTSize;
28209467b48Spatrick BytesLeft -= VTSize;
28309467b48Spatrick }
284*d415bd75Srobert return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, i));
28509467b48Spatrick }
28609467b48Spatrick
EmitTargetCodeForMemmove(SelectionDAG & DAG,const SDLoc & dl,SDValue Chain,SDValue Dst,SDValue Src,SDValue Size,Align Alignment,bool isVolatile,MachinePointerInfo DstPtrInfo,MachinePointerInfo SrcPtrInfo) const28709467b48Spatrick SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemmove(
28809467b48Spatrick SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
289097a140dSpatrick SDValue Size, Align Alignment, bool isVolatile,
29009467b48Spatrick MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
291097a140dSpatrick return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
292097a140dSpatrick Alignment.value(), RTLIB::MEMMOVE);
29309467b48Spatrick }
29409467b48Spatrick
EmitTargetCodeForMemset(SelectionDAG & DAG,const SDLoc & dl,SDValue Chain,SDValue Dst,SDValue Src,SDValue Size,Align Alignment,bool isVolatile,bool AlwaysInline,MachinePointerInfo DstPtrInfo) const29509467b48Spatrick SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset(
29609467b48Spatrick SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
297*d415bd75Srobert SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
29809467b48Spatrick MachinePointerInfo DstPtrInfo) const {
29973471bf0Spatrick
30073471bf0Spatrick const ARMSubtarget &Subtarget =
30173471bf0Spatrick DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
30273471bf0Spatrick
30373471bf0Spatrick ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
30473471bf0Spatrick
30573471bf0Spatrick // Generate TP loop for llvm.memset
30673471bf0Spatrick if (Subtarget.hasMVEIntegerOps() &&
30773471bf0Spatrick shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment,
30873471bf0Spatrick false)) {
30973471bf0Spatrick Src = DAG.getSplatBuildVector(MVT::v16i8, dl,
31073471bf0Spatrick DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src));
31173471bf0Spatrick return DAG.getNode(ARMISD::MEMSETLOOP, dl, MVT::Other, Chain, Dst, Src,
31273471bf0Spatrick DAG.getZExtOrTrunc(Size, dl, MVT::i32));
31373471bf0Spatrick }
31473471bf0Spatrick
315*d415bd75Srobert if (!AlwaysInline)
316097a140dSpatrick return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
317097a140dSpatrick Alignment.value(), RTLIB::MEMSET);
318*d415bd75Srobert
319*d415bd75Srobert return SDValue();
32009467b48Spatrick }
321