//===- LowerMemIntrinsics.cpp ----------------------------------*- C++ -*--===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include <optional>

#define DEBUG_TYPE "lower-mem-intrinsics"

using namespace llvm;

void llvm::createMemCpyLoopKnownSize(
    Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr,
    ConstantInt *CopyLen, Align SrcAlign, Align DstAlign, bool SrcIsVolatile,
    bool DstIsVolatile, bool CanOverlap, const TargetTransformInfo &TTI,
    std::optional<uint32_t> AtomicElementSize) {
  // No need to expand zero length copies.
  if (CopyLen->isZero())
    return;

  BasicBlock *PreLoopBB = InsertBefore->getParent();
  BasicBlock *PostLoopBB = nullptr;
  Function *ParentFunc = PreLoopBB->getParent();
  LLVMContext &Ctx = PreLoopBB->getContext();
  const DataLayout &DL = ParentFunc->getDataLayout();
  MDBuilder MDB(Ctx);
  MDNode *NewDomain = MDB.createAnonymousAliasScopeDomain("MemCopyDomain");
  StringRef Name = "MemCopyAliasScope";
  MDNode *NewScope = MDB.createAnonymousAliasScope(NewDomain, Name);

  unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
  unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();

  Type *TypeOfCopyLen = CopyLen->getType();
  Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
      Ctx, CopyLen, SrcAS, DstAS, SrcAlign, DstAlign, AtomicElementSize);
  assert((!AtomicElementSize || !LoopOpType->isVectorTy()) &&
         "Atomic memcpy lowering is not supported for vector operand type");

  Type *Int8Type = Type::getInt8Ty(Ctx);
  unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
  assert((!AtomicElementSize || LoopOpSize % *AtomicElementSize == 0) &&
         "Atomic memcpy lowering is not supported for selected operand size");

  uint64_t LoopEndCount = alignDown(CopyLen->getZExtValue(), LoopOpSize);
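  // For example (illustrative values): CopyLen = 53 with LoopOpSize = 16 gives
  // LoopEndCount = alignDown(53, 16) = 48, so the loop below copies 48 bytes
  // and the residual code at the end handles the remaining 5 bytes.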

  if (LoopEndCount != 0) {
    // Split
    PostLoopBB = PreLoopBB->splitBasicBlock(InsertBefore, "memcpy-split");
    BasicBlock *LoopBB =
        BasicBlock::Create(Ctx, "load-store-loop", ParentFunc, PostLoopBB);
    PreLoopBB->getTerminator()->setSuccessor(0, LoopBB);

    IRBuilder<> PLBuilder(PreLoopBB->getTerminator());

    Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));
    Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize));
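    // (commonAlignment yields the alignment every per-iteration access is
    // guaranteed to have; e.g., a 16-byte-aligned base accessed in 4-byte
    // strides gives 4-byte-aligned accesses.)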

    IRBuilder<> LoopBuilder(LoopBB);
    PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 2, "loop-index");
    LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0U), PreLoopBB);
    // Loop Body

    // If we used LoopOpType as GEP element type, we would iterate over the
    // buffers in TypeStoreSize strides while copying TypeAllocSize bytes, i.e.,
    // we would miss bytes if TypeStoreSize != TypeAllocSize. Therefore, use
    // byte offsets computed from the TypeStoreSize.
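    // (For example, on x86 targets x86_fp80 has a store size of 10 bytes but
    // an alloc size of 12 or 16 bytes, depending on the ABI alignment.)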
    Value *SrcGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, LoopIndex);
    LoadInst *Load = LoopBuilder.CreateAlignedLoad(LoopOpType, SrcGEP,
                                                   PartSrcAlign, SrcIsVolatile);
    if (!CanOverlap) {
      // Set alias scope for loads.
      Load->setMetadata(LLVMContext::MD_alias_scope,
                        MDNode::get(Ctx, NewScope));
    }
    Value *DstGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LoopIndex);
    StoreInst *Store = LoopBuilder.CreateAlignedStore(
        Load, DstGEP, PartDstAlign, DstIsVolatile);
    if (!CanOverlap) {
      // Indicate that stores don't overlap loads.
      Store->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope));
    }
    if (AtomicElementSize) {
      Load->setAtomic(AtomicOrdering::Unordered);
      Store->setAtomic(AtomicOrdering::Unordered);
    }
    Value *NewIndex = LoopBuilder.CreateAdd(
        LoopIndex, ConstantInt::get(TypeOfCopyLen, LoopOpSize));
    LoopIndex->addIncoming(NewIndex, LoopBB);

    // Create the loop branch condition.
    Constant *LoopEndCI = ConstantInt::get(TypeOfCopyLen, LoopEndCount);
    LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, LoopEndCI),
                             LoopBB, PostLoopBB);
  }

  uint64_t BytesCopied = LoopEndCount;
  uint64_t RemainingBytes = CopyLen->getZExtValue() - BytesCopied;
  if (RemainingBytes) {
    BasicBlock::iterator InsertIt = PostLoopBB ? PostLoopBB->getFirstNonPHIIt()
                                               : InsertBefore->getIterator();
    IRBuilder<> RBuilder(InsertIt->getParent(), InsertIt);

    SmallVector<Type *, 5> RemainingOps;
    TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
                                          SrcAS, DstAS, SrcAlign, DstAlign,
                                          AtomicElementSize);

    for (auto *OpTy : RemainingOps) {
      Align PartSrcAlign(commonAlignment(SrcAlign, BytesCopied));
      Align PartDstAlign(commonAlignment(DstAlign, BytesCopied));

      unsigned OperandSize = DL.getTypeStoreSize(OpTy);
      assert(
          (!AtomicElementSize || OperandSize % *AtomicElementSize == 0) &&
          "Atomic memcpy lowering is not supported for selected operand size");

      Value *SrcGEP = RBuilder.CreateInBoundsGEP(
          Int8Type, SrcAddr, ConstantInt::get(TypeOfCopyLen, BytesCopied));
      LoadInst *Load =
          RBuilder.CreateAlignedLoad(OpTy, SrcGEP, PartSrcAlign, SrcIsVolatile);
      if (!CanOverlap) {
        // Set alias scope for loads.
        Load->setMetadata(LLVMContext::MD_alias_scope,
                          MDNode::get(Ctx, NewScope));
      }
      Value *DstGEP = RBuilder.CreateInBoundsGEP(
          Int8Type, DstAddr, ConstantInt::get(TypeOfCopyLen, BytesCopied));
      StoreInst *Store = RBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign,
                                                     DstIsVolatile);
      if (!CanOverlap) {
        // Indicate that stores don't overlap loads.
        Store->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope));
      }
      if (AtomicElementSize) {
        Load->setAtomic(AtomicOrdering::Unordered);
        Store->setAtomic(AtomicOrdering::Unordered);
      }
      BytesCopied += OperandSize;
    }
  }
  assert(BytesCopied == CopyLen->getZExtValue() &&
         "Bytes copied should match size in the call!");
}

// \returns \p Len urem \p OpSize, checking for optimization opportunities.
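// For example (illustrative values): Len = 25 with OpSizeVal = 8 lowers to
// "and Len, 7", which equals 25 urem 8 = 1.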
static Value *getRuntimeLoopRemainder(const DataLayout &DL, IRBuilderBase &B,
                                      Value *Len, Value *OpSize,
                                      unsigned OpSizeVal) {
  // For powers of 2, we can and by (OpSizeVal - 1) instead of using urem.
  if (isPowerOf2_32(OpSizeVal))
    return B.CreateAnd(Len, OpSizeVal - 1);
  return B.CreateURem(Len, OpSize);
}

// \returns (\p Len udiv \p OpSize) mul \p OpSize, checking for optimization
// opportunities.
// If RTLoopRemainder is provided, it must be the result of
// getRuntimeLoopRemainder() with the same arguments.
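// Continuing the example above: Len = 25 with OpSize = 8 yields
// 25 - (25 urem 8) = 24, the number of bytes covered by the main loop.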
static Value *getRuntimeLoopBytes(const DataLayout &DL, IRBuilderBase &B,
                                  Value *Len, Value *OpSize, unsigned OpSizeVal,
                                  Value *RTLoopRemainder = nullptr) {
  if (!RTLoopRemainder)
    RTLoopRemainder = getRuntimeLoopRemainder(DL, B, Len, OpSize, OpSizeVal);
  return B.CreateSub(Len, RTLoopRemainder);
}

void llvm::createMemCpyLoopUnknownSize(
    Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr, Value *CopyLen,
    Align SrcAlign, Align DstAlign, bool SrcIsVolatile, bool DstIsVolatile,
    bool CanOverlap, const TargetTransformInfo &TTI,
    std::optional<uint32_t> AtomicElementSize) {
  BasicBlock *PreLoopBB = InsertBefore->getParent();
  BasicBlock *PostLoopBB =
      PreLoopBB->splitBasicBlock(InsertBefore, "post-loop-memcpy-expansion");

  Function *ParentFunc = PreLoopBB->getParent();
  const DataLayout &DL = ParentFunc->getDataLayout();
  LLVMContext &Ctx = PreLoopBB->getContext();
  MDBuilder MDB(Ctx);
  MDNode *NewDomain = MDB.createAnonymousAliasScopeDomain("MemCopyDomain");
  StringRef Name = "MemCopyAliasScope";
  MDNode *NewScope = MDB.createAnonymousAliasScope(NewDomain, Name);

  unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
  unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();

  Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
      Ctx, CopyLen, SrcAS, DstAS, SrcAlign, DstAlign, AtomicElementSize);
  assert((!AtomicElementSize || !LoopOpType->isVectorTy()) &&
         "Atomic memcpy lowering is not supported for vector operand type");
  unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
  assert((!AtomicElementSize || LoopOpSize % *AtomicElementSize == 0) &&
         "Atomic memcpy lowering is not supported for selected operand size");

  IRBuilder<> PLBuilder(PreLoopBB->getTerminator());

  // Calculate the loop trip count, and remaining bytes to copy after the loop.
  Type *CopyLenType = CopyLen->getType();
  IntegerType *ILengthType = dyn_cast<IntegerType>(CopyLenType);
  assert(ILengthType &&
         "expected size argument to memcpy to be an integer type!");
  Type *Int8Type = Type::getInt8Ty(Ctx);
  bool LoopOpIsInt8 = LoopOpType == Int8Type;
  ConstantInt *CILoopOpSize = ConstantInt::get(ILengthType, LoopOpSize);

  Value *RuntimeLoopBytes = CopyLen;
  Value *RuntimeResidualBytes = nullptr;
  if (!LoopOpIsInt8) {
    RuntimeResidualBytes = getRuntimeLoopRemainder(DL, PLBuilder, CopyLen,
                                                   CILoopOpSize, LoopOpSize);
    RuntimeLoopBytes = getRuntimeLoopBytes(DL, PLBuilder, CopyLen, CILoopOpSize,
                                           LoopOpSize, RuntimeResidualBytes);
  }

  BasicBlock *LoopBB =
      BasicBlock::Create(Ctx, "loop-memcpy-expansion", ParentFunc, PostLoopBB);
  IRBuilder<> LoopBuilder(LoopBB);

  Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize));
  Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));

  PHINode *LoopIndex = LoopBuilder.CreatePHI(CopyLenType, 2, "loop-index");
  LoopIndex->addIncoming(ConstantInt::get(CopyLenType, 0U), PreLoopBB);

  // If we used LoopOpType as GEP element type, we would iterate over the
  // buffers in TypeStoreSize strides while copying TypeAllocSize bytes, i.e.,
  // we would miss bytes if TypeStoreSize != TypeAllocSize. Therefore, use byte
  // offsets computed from the TypeStoreSize.
  Value *SrcGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, LoopIndex);
  LoadInst *Load = LoopBuilder.CreateAlignedLoad(LoopOpType, SrcGEP,
                                                 PartSrcAlign, SrcIsVolatile);
  if (!CanOverlap) {
    // Set alias scope for loads.
    Load->setMetadata(LLVMContext::MD_alias_scope, MDNode::get(Ctx, NewScope));
  }
  Value *DstGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LoopIndex);
  StoreInst *Store =
      LoopBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile);
  if (!CanOverlap) {
    // Indicate that stores don't overlap loads.
    Store->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope));
  }
  if (AtomicElementSize) {
    Load->setAtomic(AtomicOrdering::Unordered);
    Store->setAtomic(AtomicOrdering::Unordered);
  }
  Value *NewIndex = LoopBuilder.CreateAdd(
      LoopIndex, ConstantInt::get(CopyLenType, LoopOpSize));
  LoopIndex->addIncoming(NewIndex, LoopBB);

  bool RequiresResidual =
      !LoopOpIsInt8 && !(AtomicElementSize && LoopOpSize == AtomicElementSize);
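  // For example (illustrative values): a 16-byte LoopOpType with
  // AtomicElementSize = 4 still requires a residual loop; the residual then
  // uses an i32 element type instead of i8 so its accesses stay element-atomic.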
  if (RequiresResidual) {
    Type *ResLoopOpType = AtomicElementSize
                              ? Type::getIntNTy(Ctx, *AtomicElementSize * 8)
                              : Int8Type;
    unsigned ResLoopOpSize = DL.getTypeStoreSize(ResLoopOpType);
    assert(ResLoopOpSize == (AtomicElementSize ? *AtomicElementSize : 1) &&
           "Store size is expected to match type size");

    Align ResSrcAlign(commonAlignment(PartSrcAlign, ResLoopOpSize));
    Align ResDstAlign(commonAlignment(PartDstAlign, ResLoopOpSize));

    // Loop body for the residual copy.
    BasicBlock *ResLoopBB = BasicBlock::Create(
        Ctx, "loop-memcpy-residual", PreLoopBB->getParent(), PostLoopBB);
    // Residual loop header.
    BasicBlock *ResHeaderBB = BasicBlock::Create(
        Ctx, "loop-memcpy-residual-header", PreLoopBB->getParent(), nullptr);

    // Need to update the pre-loop basic block to branch to the correct place:
    // branch to the main loop if the count is non-zero, branch to the residual
    // loop if the copy size is smaller than one iteration of the main loop but
    // non-zero, and finally branch past the residual loop if the memcpy size
    // is zero.
    ConstantInt *Zero = ConstantInt::get(ILengthType, 0U);
    PLBuilder.CreateCondBr(PLBuilder.CreateICmpNE(RuntimeLoopBytes, Zero),
                           LoopBB, ResHeaderBB);
    PreLoopBB->getTerminator()->eraseFromParent();

    LoopBuilder.CreateCondBr(
        LoopBuilder.CreateICmpULT(NewIndex, RuntimeLoopBytes), LoopBB,
        ResHeaderBB);

    // Determine if we need to branch to the residual loop or bypass it.
    IRBuilder<> RHBuilder(ResHeaderBB);
    RHBuilder.CreateCondBr(RHBuilder.CreateICmpNE(RuntimeResidualBytes, Zero),
                           ResLoopBB, PostLoopBB);

    // Copy the residual with a load/store loop of ResLoopOpType elements.
    IRBuilder<> ResBuilder(ResLoopBB);
    PHINode *ResidualIndex =
        ResBuilder.CreatePHI(CopyLenType, 2, "residual-loop-index");
    ResidualIndex->addIncoming(Zero, ResHeaderBB);

    Value *FullOffset = ResBuilder.CreateAdd(RuntimeLoopBytes, ResidualIndex);
    Value *SrcGEP = ResBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, FullOffset);
    LoadInst *Load = ResBuilder.CreateAlignedLoad(ResLoopOpType, SrcGEP,
                                                  ResSrcAlign, SrcIsVolatile);
    if (!CanOverlap) {
      // Set alias scope for loads.
      Load->setMetadata(LLVMContext::MD_alias_scope,
                        MDNode::get(Ctx, NewScope));
    }
    Value *DstGEP = ResBuilder.CreateInBoundsGEP(Int8Type, DstAddr, FullOffset);
    StoreInst *Store =
        ResBuilder.CreateAlignedStore(Load, DstGEP, ResDstAlign, DstIsVolatile);
    if (!CanOverlap) {
      // Indicate that stores don't overlap loads.
      Store->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope));
    }
    if (AtomicElementSize) {
      Load->setAtomic(AtomicOrdering::Unordered);
      Store->setAtomic(AtomicOrdering::Unordered);
    }
    Value *ResNewIndex = ResBuilder.CreateAdd(
        ResidualIndex, ConstantInt::get(CopyLenType, ResLoopOpSize));
    ResidualIndex->addIncoming(ResNewIndex, ResLoopBB);

    // Create the loop branch condition.
    ResBuilder.CreateCondBr(
        ResBuilder.CreateICmpULT(ResNewIndex, RuntimeResidualBytes), ResLoopBB,
        PostLoopBB);
  } else {
    // In this case the loop operand type is a byte, or the atomic element size
    // matches the loop operand size, so there is no need for a residual loop
    // to copy the remaining memory after the main loop.
    // We do, however, need to patch up the control flow by creating the
    // terminators for the pre-loop block and the memcpy loop.
    ConstantInt *Zero = ConstantInt::get(ILengthType, 0U);
    PLBuilder.CreateCondBr(PLBuilder.CreateICmpNE(RuntimeLoopBytes, Zero),
                           LoopBB, PostLoopBB);
    PreLoopBB->getTerminator()->eraseFromParent();
    LoopBuilder.CreateCondBr(
        LoopBuilder.CreateICmpULT(NewIndex, RuntimeLoopBytes), LoopBB,
        PostLoopBB);
  }
}

// If \p Addr1 and \p Addr2 are pointers to different address spaces, create an
// addrspacecast to obtain a pair of pointers in the same address space. The
// caller needs to ensure that addrspacecasting is possible.
// No-op if the pointers are in the same address space.
static std::pair<Value *, Value *>
tryInsertCastToCommonAddrSpace(IRBuilderBase &B, Value *Addr1, Value *Addr2,
                               const TargetTransformInfo &TTI) {
  Value *ResAddr1 = Addr1;
  Value *ResAddr2 = Addr2;

  unsigned AS1 = cast<PointerType>(Addr1->getType())->getAddressSpace();
  unsigned AS2 = cast<PointerType>(Addr2->getType())->getAddressSpace();
  if (AS1 != AS2) {
    if (TTI.isValidAddrSpaceCast(AS2, AS1))
      ResAddr2 = B.CreateAddrSpaceCast(Addr2, Addr1->getType());
    else if (TTI.isValidAddrSpaceCast(AS1, AS2))
      ResAddr1 = B.CreateAddrSpaceCast(Addr1, Addr2->getType());
    else
      llvm_unreachable("Can only lower memmove between address spaces if they "
                       "support addrspacecast");
  }
  return {ResAddr1, ResAddr2};
}

// Lower memmove to IR. memmove is required to correctly copy overlapping memory
// regions; therefore, it has to check the relative positions of the source and
// destination pointers and choose the copy direction accordingly.
//
// The code below is an IR rendition of this C function:
//
// void* memmove(void* dst, const void* src, size_t n) {
//   unsigned char* d = dst;
//   const unsigned char* s = src;
//   if (s < d) {
//     // copy backwards
//     while (n--) {
//       d[n] = s[n];
//     }
//   } else {
//     // copy forward
//     for (size_t i = 0; i < n; ++i) {
//       d[i] = s[i];
//     }
//   }
//   return dst;
// }
//
// If the TargetTransformInfo specifies a wider MemcpyLoopLoweringType, it is
// used for the memory accesses in the loops. Then, additional loops with
// byte-wise accesses are added for the remaining bytes.
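// For example (illustrative values): with a 16-byte lowering type, a move of n
// bytes copies alignDown(n, 16) bytes in the wide main loops and the remaining
// n % 16 bytes in the byte-wise residual loops.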
static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
                                         Value *SrcAddr, Value *DstAddr,
                                         Value *CopyLen, Align SrcAlign,
                                         Align DstAlign, bool SrcIsVolatile,
                                         bool DstIsVolatile,
                                         const TargetTransformInfo &TTI) {
  Type *TypeOfCopyLen = CopyLen->getType();
  BasicBlock *OrigBB = InsertBefore->getParent();
  Function *F = OrigBB->getParent();
  const DataLayout &DL = F->getDataLayout();
  LLVMContext &Ctx = OrigBB->getContext();
  unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
  unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();

  Type *LoopOpType = TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAS, DstAS,
                                                   SrcAlign, DstAlign);
  unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
  Type *Int8Type = Type::getInt8Ty(Ctx);
  bool LoopOpIsInt8 = LoopOpType == Int8Type;

  // If the memory accesses are wider than one byte, residual loops with
  // i8-accesses are required to move remaining bytes.
  bool RequiresResidual = !LoopOpIsInt8;

  Type *ResidualLoopOpType = Int8Type;
  unsigned ResidualLoopOpSize = DL.getTypeStoreSize(ResidualLoopOpType);

  // Calculate the loop trip count and remaining bytes to copy after the loop.
  IntegerType *ILengthType = cast<IntegerType>(TypeOfCopyLen);
  ConstantInt *CILoopOpSize = ConstantInt::get(ILengthType, LoopOpSize);
  ConstantInt *CIResidualLoopOpSize =
      ConstantInt::get(ILengthType, ResidualLoopOpSize);
  ConstantInt *Zero = ConstantInt::get(ILengthType, 0);

  IRBuilder<> PLBuilder(InsertBefore);

  Value *RuntimeLoopBytes = CopyLen;
  Value *RuntimeLoopRemainder = nullptr;
  Value *SkipResidualCondition = nullptr;
  if (RequiresResidual) {
    RuntimeLoopRemainder = getRuntimeLoopRemainder(DL, PLBuilder, CopyLen,
                                                   CILoopOpSize, LoopOpSize);
    RuntimeLoopBytes = getRuntimeLoopBytes(DL, PLBuilder, CopyLen, CILoopOpSize,
                                           LoopOpSize, RuntimeLoopRemainder);
    SkipResidualCondition =
        PLBuilder.CreateICmpEQ(RuntimeLoopRemainder, Zero, "skip_residual");
  }
  Value *SkipMainCondition =
      PLBuilder.CreateICmpEQ(RuntimeLoopBytes, Zero, "skip_main");

  // Create a comparison of src and dst, based on which we jump to either
  // the forward-copy part of the function (if src >= dst) or the backwards-copy
  // part (if src < dst).
  // SplitBlockAndInsertIfThenElse conveniently creates the basic if-then-else
  // structure. Its block terminators (unconditional branches) are replaced by
  // the appropriate conditional branches when the loop is built.
  // If the pointers are in different address spaces, they need to be converted
  // to a compatible one. Cases where memory ranges in the different address
  // spaces cannot overlap are lowered as memcpy and not handled here.
  auto [CmpSrcAddr, CmpDstAddr] =
      tryInsertCastToCommonAddrSpace(PLBuilder, SrcAddr, DstAddr, TTI);
  Value *PtrCompare =
      PLBuilder.CreateICmpULT(CmpSrcAddr, CmpDstAddr, "compare_src_dst");
  Instruction *ThenTerm, *ElseTerm;
  SplitBlockAndInsertIfThenElse(PtrCompare, InsertBefore->getIterator(),
                                &ThenTerm, &ElseTerm);

  // If the LoopOpSize is greater than 1, each part of the function consists of
  // four blocks:
  //   memmove_copy_backwards:
  //       skip the residual loop when 0 iterations are required
  //   memmove_bwd_residual_loop:
  //       copy the last few bytes individually so that the remaining length is
  //       a multiple of the LoopOpSize
  //   memmove_bwd_middle: skip the main loop when 0 iterations are required
  //   memmove_bwd_main_loop: the actual backwards loop BB with wide accesses
  //   memmove_copy_forward: skip the main loop when 0 iterations are required
  //   memmove_fwd_main_loop: the actual forward loop BB with wide accesses
  //   memmove_fwd_middle: skip the residual loop when 0 iterations are required
  //   memmove_fwd_residual_loop: copy the last few bytes individually
  //
  // The main and residual loop are switched between copying forward and
  // backward so that the residual loop always operates on the end of the moved
  // range. This is based on the assumption that buffers whose start is aligned
  // with the LoopOpSize are more common than buffers whose end is.
  //
  // If the LoopOpSize is 1, each part of the function consists of two blocks:
  //   memmove_copy_backwards: skip the loop when 0 iterations are required
  //   memmove_bwd_main_loop: the actual backwards loop BB
  //   memmove_copy_forward: skip the loop when 0 iterations are required
  //   memmove_fwd_main_loop: the actual forward loop BB
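  //
  // Illustrative CFG sketch of the backwards half when LoopOpSize > 1 (the
  // forward half mirrors it, with the main loop placed first):
  //
  //   memmove_copy_backwards --(skip_residual)--> memmove_bwd_middle
  //            |                                        ^
  //            v                                        |
  //   memmove_bwd_residual_loop (repeats until index == RuntimeLoopBytes)
  //
  //   memmove_bwd_middle --(skip_main)--> memmove_done
  //            |                               ^
  //            v                               |
  //   memmove_bwd_main_loop (repeats until index == 0)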
  BasicBlock *CopyBackwardsBB = ThenTerm->getParent();
  CopyBackwardsBB->setName("memmove_copy_backwards");
  BasicBlock *CopyForwardBB = ElseTerm->getParent();
  CopyForwardBB->setName("memmove_copy_forward");
  BasicBlock *ExitBB = InsertBefore->getParent();
  ExitBB->setName("memmove_done");

  Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize));
  Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));

  // Accesses in the residual loops do not share the same alignment as those in
  // the main loops.
  Align ResidualSrcAlign(commonAlignment(PartSrcAlign, ResidualLoopOpSize));
  Align ResidualDstAlign(commonAlignment(PartDstAlign, ResidualLoopOpSize));

  // Copying backwards.
  {
    BasicBlock *MainLoopBB = BasicBlock::Create(
        F->getContext(), "memmove_bwd_main_loop", F, CopyForwardBB);

    // The predecessor of the memmove_bwd_main_loop. Updated below if a
    // residual loop is emitted first.
    BasicBlock *PredBB = CopyBackwardsBB;

    if (RequiresResidual) {
      // backwards residual loop
      BasicBlock *ResidualLoopBB = BasicBlock::Create(
          F->getContext(), "memmove_bwd_residual_loop", F, MainLoopBB);
      IRBuilder<> ResidualLoopBuilder(ResidualLoopBB);
      PHINode *ResidualLoopPhi = ResidualLoopBuilder.CreatePHI(ILengthType, 0);
      Value *ResidualIndex = ResidualLoopBuilder.CreateSub(
          ResidualLoopPhi, CIResidualLoopOpSize, "bwd_residual_index");
      // If we used LoopOpType as GEP element type, we would iterate over the
      // buffers in TypeStoreSize strides while copying TypeAllocSize bytes,
      // i.e., we would miss bytes if TypeStoreSize != TypeAllocSize. Therefore,
      // use byte offsets computed from the TypeStoreSize.
      Value *LoadGEP = ResidualLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr,
                                                             ResidualIndex);
      Value *Element = ResidualLoopBuilder.CreateAlignedLoad(
          ResidualLoopOpType, LoadGEP, ResidualSrcAlign, SrcIsVolatile,
          "element");
      Value *StoreGEP = ResidualLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr,
                                                              ResidualIndex);
      ResidualLoopBuilder.CreateAlignedStore(Element, StoreGEP,
                                             ResidualDstAlign, DstIsVolatile);

      // After the residual loop, go to an intermediate block.
      BasicBlock *IntermediateBB = BasicBlock::Create(
          F->getContext(), "memmove_bwd_middle", F, MainLoopBB);
      // Later code expects a terminator in the PredBB.
      IRBuilder<> IntermediateBuilder(IntermediateBB);
      IntermediateBuilder.CreateUnreachable();
      ResidualLoopBuilder.CreateCondBr(
          ResidualLoopBuilder.CreateICmpEQ(ResidualIndex, RuntimeLoopBytes),
          IntermediateBB, ResidualLoopBB);

      ResidualLoopPhi->addIncoming(ResidualIndex, ResidualLoopBB);
      ResidualLoopPhi->addIncoming(CopyLen, CopyBackwardsBB);

      // How to get to the residual:
      BranchInst::Create(IntermediateBB, ResidualLoopBB, SkipResidualCondition,
                         ThenTerm->getIterator());
      ThenTerm->eraseFromParent();

      PredBB = IntermediateBB;
    }

    // main loop
    IRBuilder<> MainLoopBuilder(MainLoopBB);
    PHINode *MainLoopPhi = MainLoopBuilder.CreatePHI(ILengthType, 0);
    Value *MainIndex =
        MainLoopBuilder.CreateSub(MainLoopPhi, CILoopOpSize, "bwd_main_index");
    Value *LoadGEP =
        MainLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, MainIndex);
    Value *Element = MainLoopBuilder.CreateAlignedLoad(
        LoopOpType, LoadGEP, PartSrcAlign, SrcIsVolatile, "element");
    Value *StoreGEP =
        MainLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, MainIndex);
    MainLoopBuilder.CreateAlignedStore(Element, StoreGEP, PartDstAlign,
                                       DstIsVolatile);
    MainLoopBuilder.CreateCondBr(MainLoopBuilder.CreateICmpEQ(MainIndex, Zero),
                                 ExitBB, MainLoopBB);
    MainLoopPhi->addIncoming(MainIndex, MainLoopBB);
    MainLoopPhi->addIncoming(RuntimeLoopBytes, PredBB);

    // How to get to the main loop:
    Instruction *PredBBTerm = PredBB->getTerminator();
    BranchInst::Create(ExitBB, MainLoopBB, SkipMainCondition,
                       PredBBTerm->getIterator());
    PredBBTerm->eraseFromParent();
  }

  // Copying forward.
  // main loop
  {
    BasicBlock *MainLoopBB =
        BasicBlock::Create(F->getContext(), "memmove_fwd_main_loop", F, ExitBB);
    IRBuilder<> MainLoopBuilder(MainLoopBB);
    PHINode *MainLoopPhi =
        MainLoopBuilder.CreatePHI(ILengthType, 0, "fwd_main_index");
    Value *LoadGEP =
        MainLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, MainLoopPhi);
    Value *Element = MainLoopBuilder.CreateAlignedLoad(
        LoopOpType, LoadGEP, PartSrcAlign, SrcIsVolatile, "element");
    Value *StoreGEP =
        MainLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, MainLoopPhi);
    MainLoopBuilder.CreateAlignedStore(Element, StoreGEP, PartDstAlign,
                                       DstIsVolatile);
    Value *MainIndex = MainLoopBuilder.CreateAdd(MainLoopPhi, CILoopOpSize);
    MainLoopPhi->addIncoming(MainIndex, MainLoopBB);
    MainLoopPhi->addIncoming(Zero, CopyForwardBB);

    Instruction *CopyFwdBBTerm = CopyForwardBB->getTerminator();
    BasicBlock *SuccessorBB = ExitBB;
    if (RequiresResidual)
      SuccessorBB =
          BasicBlock::Create(F->getContext(), "memmove_fwd_middle", F, ExitBB);

    // leaving or staying in the main loop
    MainLoopBuilder.CreateCondBr(
        MainLoopBuilder.CreateICmpEQ(MainIndex, RuntimeLoopBytes), SuccessorBB,
        MainLoopBB);

    // getting in or skipping the main loop
    BranchInst::Create(SuccessorBB, MainLoopBB, SkipMainCondition,
                       CopyFwdBBTerm->getIterator());
    CopyFwdBBTerm->eraseFromParent();

    if (RequiresResidual) {
      BasicBlock *IntermediateBB = SuccessorBB;
      IRBuilder<> IntermediateBuilder(IntermediateBB);
      BasicBlock *ResidualLoopBB = BasicBlock::Create(
          F->getContext(), "memmove_fwd_residual_loop", F, ExitBB);
      IntermediateBuilder.CreateCondBr(SkipResidualCondition, ExitBB,
                                       ResidualLoopBB);

      // Residual loop
      IRBuilder<> ResidualLoopBuilder(ResidualLoopBB);
      PHINode *ResidualLoopPhi =
          ResidualLoopBuilder.CreatePHI(ILengthType, 0, "fwd_residual_index");
      Value *LoadGEP = ResidualLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr,
                                                             ResidualLoopPhi);
      Value *Element = ResidualLoopBuilder.CreateAlignedLoad(
          ResidualLoopOpType, LoadGEP, ResidualSrcAlign, SrcIsVolatile,
          "element");
      Value *StoreGEP = ResidualLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr,
                                                              ResidualLoopPhi);
      ResidualLoopBuilder.CreateAlignedStore(Element, StoreGEP,
                                             ResidualDstAlign, DstIsVolatile);
      Value *ResidualIndex =
          ResidualLoopBuilder.CreateAdd(ResidualLoopPhi, CIResidualLoopOpSize);
      ResidualLoopBuilder.CreateCondBr(
          ResidualLoopBuilder.CreateICmpEQ(ResidualIndex, CopyLen), ExitBB,
          ResidualLoopBB);
      ResidualLoopPhi->addIncoming(ResidualIndex, ResidualLoopBB);
      ResidualLoopPhi->addIncoming(RuntimeLoopBytes, IntermediateBB);
    }
  }
}

// Similar to createMemMoveLoopUnknownSize, only the trip counts are computed at
// compile time, obsolete loops and branches are omitted, and the residual code
// is straight-line code instead of a loop.
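// For example (illustrative values): CopyLen = 20 with a 16-byte LoopOpType
// gives BytesCopiedInLoop = 16 and RemainingBytes = 4, so each direction gets
// one wide loop plus straight-line residual accesses for the last 4 bytes.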
static void createMemMoveLoopKnownSize(Instruction *InsertBefore,
                                       Value *SrcAddr, Value *DstAddr,
                                       ConstantInt *CopyLen, Align SrcAlign,
                                       Align DstAlign, bool SrcIsVolatile,
                                       bool DstIsVolatile,
                                       const TargetTransformInfo &TTI) {
  // No need to expand zero length moves.
  if (CopyLen->isZero())
    return;

  Type *TypeOfCopyLen = CopyLen->getType();
  BasicBlock *OrigBB = InsertBefore->getParent();
  Function *F = OrigBB->getParent();
  const DataLayout &DL = F->getDataLayout();
  LLVMContext &Ctx = OrigBB->getContext();
  unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
  unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();

  Type *LoopOpType = TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAS, DstAS,
                                                   SrcAlign, DstAlign);
  unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
  Type *Int8Type = Type::getInt8Ty(Ctx);

  // Calculate the loop trip count and remaining bytes to copy after the loop.
  uint64_t BytesCopiedInLoop = alignDown(CopyLen->getZExtValue(), LoopOpSize);
  uint64_t RemainingBytes = CopyLen->getZExtValue() - BytesCopiedInLoop;

  IntegerType *ILengthType = cast<IntegerType>(TypeOfCopyLen);
  ConstantInt *Zero = ConstantInt::get(ILengthType, 0);
  ConstantInt *LoopBound = ConstantInt::get(ILengthType, BytesCopiedInLoop);
  ConstantInt *CILoopOpSize = ConstantInt::get(ILengthType, LoopOpSize);

  IRBuilder<> PLBuilder(InsertBefore);

  auto [CmpSrcAddr, CmpDstAddr] =
      tryInsertCastToCommonAddrSpace(PLBuilder, SrcAddr, DstAddr, TTI);
  Value *PtrCompare =
      PLBuilder.CreateICmpULT(CmpSrcAddr, CmpDstAddr, "compare_src_dst");
  Instruction *ThenTerm, *ElseTerm;
  SplitBlockAndInsertIfThenElse(PtrCompare, InsertBefore->getIterator(),
                                &ThenTerm, &ElseTerm);

  BasicBlock *CopyBackwardsBB = ThenTerm->getParent();
  BasicBlock *CopyForwardBB = ElseTerm->getParent();
  BasicBlock *ExitBB = InsertBefore->getParent();
  ExitBB->setName("memmove_done");

  Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize));
  Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));

  // Helper function to generate a load/store pair of a given type in the
  // residual. Used in the forward and backward branches.
  auto GenerateResidualLdStPair = [&](Type *OpTy, IRBuilderBase &Builder,
                                      uint64_t &BytesCopied) {
    Align ResSrcAlign(commonAlignment(SrcAlign, BytesCopied));
    Align ResDstAlign(commonAlignment(DstAlign, BytesCopied));

    unsigned OperandSize = DL.getTypeStoreSize(OpTy);

    // If we used LoopOpType as GEP element type, we would iterate over the
    // buffers in TypeStoreSize strides while copying TypeAllocSize bytes, i.e.,
    // we would miss bytes if TypeStoreSize != TypeAllocSize. Therefore, use
    // byte offsets computed from the TypeStoreSize.
    Value *SrcGEP = Builder.CreateInBoundsGEP(
        Int8Type, SrcAddr, ConstantInt::get(TypeOfCopyLen, BytesCopied));
    LoadInst *Load =
        Builder.CreateAlignedLoad(OpTy, SrcGEP, ResSrcAlign, SrcIsVolatile);
    Value *DstGEP = Builder.CreateInBoundsGEP(
        Int8Type, DstAddr, ConstantInt::get(TypeOfCopyLen, BytesCopied));
    Builder.CreateAlignedStore(Load, DstGEP, ResDstAlign, DstIsVolatile);
    BytesCopied += OperandSize;
  };

  // Copying backwards.
  if (RemainingBytes != 0) {
    CopyBackwardsBB->setName("memmove_bwd_residual");
    uint64_t BytesCopied = BytesCopiedInLoop;

    // Residual code is required to move the remaining bytes. We need the same
    // instructions as in the forward case, only in reverse. So we generate code
    // the same way, except that we change the IRBuilder insert point for each
    // load/store pair so that each one is inserted before the previous one
    // instead of after it.
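    // For example (illustrative values): RemainingBytes = 7 lowered as
    // {i32, i16, i8} ends up with the i8 pair (highest offset) first in IR
    // order, then the i16 pair, then the i32 pair, so the bytes nearest the
    // end of the region are moved first.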
    IRBuilder<> BwdResBuilder(CopyBackwardsBB,
                              CopyBackwardsBB->getFirstNonPHIIt());
    SmallVector<Type *, 5> RemainingOps;
    TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
                                          SrcAS, DstAS, PartSrcAlign,
                                          PartDstAlign);
    for (auto *OpTy : RemainingOps) {
      // reverse the order of the emitted operations
      BwdResBuilder.SetInsertPoint(CopyBackwardsBB,
                                   CopyBackwardsBB->getFirstNonPHIIt());
      GenerateResidualLdStPair(OpTy, BwdResBuilder, BytesCopied);
    }
  }
  if (BytesCopiedInLoop != 0) {
    BasicBlock *LoopBB = CopyBackwardsBB;
    BasicBlock *PredBB = OrigBB;
    if (RemainingBytes != 0) {
      // if we introduce residual code, it needs its own BB
      LoopBB = CopyBackwardsBB->splitBasicBlock(
          CopyBackwardsBB->getTerminator(), "memmove_bwd_loop");
      PredBB = CopyBackwardsBB;
    } else {
      CopyBackwardsBB->setName("memmove_bwd_loop");
    }
    IRBuilder<> LoopBuilder(LoopBB->getTerminator());
    PHINode *LoopPhi = LoopBuilder.CreatePHI(ILengthType, 0);
    Value *Index = LoopBuilder.CreateSub(LoopPhi, CILoopOpSize, "bwd_index");
    Value *LoadGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, Index);
    Value *Element = LoopBuilder.CreateAlignedLoad(
        LoopOpType, LoadGEP, PartSrcAlign, SrcIsVolatile, "element");
    Value *StoreGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, Index);
    LoopBuilder.CreateAlignedStore(Element, StoreGEP, PartDstAlign,
                                   DstIsVolatile);

    // Replace the unconditional branch introduced by
    // SplitBlockAndInsertIfThenElse to turn LoopBB into a loop.
    Instruction *UncondTerm = LoopBB->getTerminator();
    LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpEQ(Index, Zero), ExitBB,
                             LoopBB);
    UncondTerm->eraseFromParent();

    LoopPhi->addIncoming(Index, LoopBB);
    LoopPhi->addIncoming(LoopBound, PredBB);
  }

  // Copying forward.
  BasicBlock *FwdResidualBB = CopyForwardBB;
  if (BytesCopiedInLoop != 0) {
    CopyForwardBB->setName("memmove_fwd_loop");
    BasicBlock *LoopBB = CopyForwardBB;
    BasicBlock *SuccBB = ExitBB;
    if (RemainingBytes != 0) {
      // if we introduce residual code, it needs its own BB
      SuccBB = CopyForwardBB->splitBasicBlock(CopyForwardBB->getTerminator(),
                                              "memmove_fwd_residual");
      FwdResidualBB = SuccBB;
    }
    IRBuilder<> LoopBuilder(LoopBB->getTerminator());
    PHINode *LoopPhi = LoopBuilder.CreatePHI(ILengthType, 0, "fwd_index");
    Value *LoadGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, LoopPhi);
    Value *Element = LoopBuilder.CreateAlignedLoad(
        LoopOpType, LoadGEP, PartSrcAlign, SrcIsVolatile, "element");
    Value *StoreGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LoopPhi);
    LoopBuilder.CreateAlignedStore(Element, StoreGEP, PartDstAlign,
                                   DstIsVolatile);
    Value *Index = LoopBuilder.CreateAdd(LoopPhi, CILoopOpSize);
    LoopPhi->addIncoming(Index, LoopBB);
    LoopPhi->addIncoming(Zero, OrigBB);

    // Replace the unconditional branch to turn LoopBB into a loop.
    Instruction *UncondTerm = LoopBB->getTerminator();
    LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpEQ(Index, LoopBound), SuccBB,
                             LoopBB);
    UncondTerm->eraseFromParent();
  }

  if (RemainingBytes != 0) {
    uint64_t BytesCopied = BytesCopiedInLoop;

    // Residual code is required to move the remaining bytes. In the forward
    // case, we emit it in the normal order.
    IRBuilder<> FwdResBuilder(FwdResidualBB->getTerminator());
    SmallVector<Type *, 5> RemainingOps;
    TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
                                          SrcAS, DstAS, PartSrcAlign,
                                          PartDstAlign);
    for (auto *OpTy : RemainingOps)
      GenerateResidualLdStPair(OpTy, FwdResBuilder, BytesCopied);
  }
}

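// Lower a memset to a simple store loop: the pre-loop block branches straight
// to the continuation when CopyLen is zero; otherwise the "loadstoreloop"
// block stores one SetValue-sized element per iteration, CopyLen iterations
// in total.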
static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr,
                             Value *CopyLen, Value *SetValue, Align DstAlign,
                             bool IsVolatile) {
  Type *TypeOfCopyLen = CopyLen->getType();
  BasicBlock *OrigBB = InsertBefore->getParent();
  Function *F = OrigBB->getParent();
  const DataLayout &DL = F->getDataLayout();
  BasicBlock *NewBB = OrigBB->splitBasicBlock(InsertBefore, "split");
  BasicBlock *LoopBB =
      BasicBlock::Create(F->getContext(), "loadstoreloop", F, NewBB);

  IRBuilder<> Builder(OrigBB->getTerminator());

  Builder.CreateCondBr(
      Builder.CreateICmpEQ(ConstantInt::get(TypeOfCopyLen, 0), CopyLen), NewBB,
      LoopBB);
  OrigBB->getTerminator()->eraseFromParent();

  unsigned PartSize = DL.getTypeStoreSize(SetValue->getType());
  Align PartAlign(commonAlignment(DstAlign, PartSize));

  IRBuilder<> LoopBuilder(LoopBB);
  PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
  LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), OrigBB);

  LoopBuilder.CreateAlignedStore(
      SetValue,
      LoopBuilder.CreateInBoundsGEP(SetValue->getType(), DstAddr, LoopIndex),
      PartAlign, IsVolatile);

  Value *NewIndex =
      LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1));
  LoopIndex->addIncoming(NewIndex, LoopBB);

  LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, CopyLen), LoopBB,
                           NewBB);
}

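// \returns true if \p Memcpy's source and destination may overlap. Since the
// memcpy contract only permits the source and destination to alias when they
// are equal, proving the pointers distinct (via ScalarEvolution) proves the
// accesses disjoint.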
template <typename T>
static bool canOverlap(MemTransferBase<T> *Memcpy, ScalarEvolution *SE) {
  if (SE) {
    const SCEV *SrcSCEV = SE->getSCEV(Memcpy->getRawSource());
    const SCEV *DestSCEV = SE->getSCEV(Memcpy->getRawDest());
    if (SE->isKnownPredicateAt(CmpInst::ICMP_NE, SrcSCEV, DestSCEV, Memcpy))
      return false;
  }
  return true;
}

void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy,
                              const TargetTransformInfo &TTI,
                              ScalarEvolution *SE) {
  bool CanOverlap = canOverlap(Memcpy, SE);
  if (ConstantInt *CI = dyn_cast<ConstantInt>(Memcpy->getLength())) {
    createMemCpyLoopKnownSize(
        /* InsertBefore */ Memcpy,
        /* SrcAddr */ Memcpy->getRawSource(),
        /* DstAddr */ Memcpy->getRawDest(),
        /* CopyLen */ CI,
        /* SrcAlign */ Memcpy->getSourceAlign().valueOrOne(),
        /* DstAlign */ Memcpy->getDestAlign().valueOrOne(),
        /* SrcIsVolatile */ Memcpy->isVolatile(),
        /* DstIsVolatile */ Memcpy->isVolatile(),
        /* CanOverlap */ CanOverlap,
        /* TargetTransformInfo */ TTI);
  } else {
    createMemCpyLoopUnknownSize(
        /* InsertBefore */ Memcpy,
        /* SrcAddr */ Memcpy->getRawSource(),
        /* DstAddr */ Memcpy->getRawDest(),
        /* CopyLen */ Memcpy->getLength(),
        /* SrcAlign */ Memcpy->getSourceAlign().valueOrOne(),
        /* DstAlign */ Memcpy->getDestAlign().valueOrOne(),
        /* SrcIsVolatile */ Memcpy->isVolatile(),
        /* DstIsVolatile */ Memcpy->isVolatile(),
        /* CanOverlap */ CanOverlap,
        /* TargetTransformInfo */ TTI);
  }
}

bool llvm::expandMemMoveAsLoop(MemMoveInst *Memmove,
                               const TargetTransformInfo &TTI) {
  Value *CopyLen = Memmove->getLength();
  Value *SrcAddr = Memmove->getRawSource();
  Value *DstAddr = Memmove->getRawDest();
  Align SrcAlign = Memmove->getSourceAlign().valueOrOne();
  Align DstAlign = Memmove->getDestAlign().valueOrOne();
  bool SrcIsVolatile = Memmove->isVolatile();
  bool DstIsVolatile = SrcIsVolatile;
  IRBuilder<> CastBuilder(Memmove);

  unsigned SrcAS = SrcAddr->getType()->getPointerAddressSpace();
  unsigned DstAS = DstAddr->getType()->getPointerAddressSpace();
  if (SrcAS != DstAS) {
    if (!TTI.addrspacesMayAlias(SrcAS, DstAS)) {
      // We may not be able to emit a pointer comparison, but we don't have
      // to. Expand as memcpy.
      if (ConstantInt *CI = dyn_cast<ConstantInt>(CopyLen)) {
        createMemCpyLoopKnownSize(/*InsertBefore=*/Memmove, SrcAddr, DstAddr,
                                  CI, SrcAlign, DstAlign, SrcIsVolatile,
                                  DstIsVolatile,
                                  /*CanOverlap=*/false, TTI);
      } else {
        createMemCpyLoopUnknownSize(/*InsertBefore=*/Memmove, SrcAddr, DstAddr,
                                    CopyLen, SrcAlign, DstAlign, SrcIsVolatile,
                                    DstIsVolatile,
                                    /*CanOverlap=*/false, TTI);
      }

      return true;
    }

    if (!(TTI.isValidAddrSpaceCast(DstAS, SrcAS) ||
          TTI.isValidAddrSpaceCast(SrcAS, DstAS))) {
      // We don't know generically if it's legal to introduce an
      // addrspacecast. We need to know either if it's legal to insert an
      // addrspacecast, or if the address spaces cannot alias.
      LLVM_DEBUG(
          dbgs() << "Do not know how to expand memmove between different "
                    "address spaces\n");
      return false;
    }
  }

  if (ConstantInt *CI = dyn_cast<ConstantInt>(CopyLen)) {
    createMemMoveLoopKnownSize(
        /*InsertBefore=*/Memmove, SrcAddr, DstAddr, CI, SrcAlign, DstAlign,
        SrcIsVolatile, DstIsVolatile, TTI);
  } else {
    createMemMoveLoopUnknownSize(
        /*InsertBefore=*/Memmove, SrcAddr, DstAddr, CopyLen, SrcAlign, DstAlign,
        SrcIsVolatile, DstIsVolatile, TTI);
  }
  return true;
}

void llvm::expandMemSetAsLoop(MemSetInst *Memset) {
  createMemSetLoop(/* InsertBefore */ Memset,
                   /* DstAddr */ Memset->getRawDest(),
                   /* CopyLen */ Memset->getLength(),
                   /* SetValue */ Memset->getValue(),
                   /* DstAlign */ Memset->getDestAlign().valueOrOne(),
                   Memset->isVolatile());
}

void llvm::expandMemSetPatternAsLoop(MemSetPatternInst *Memset) {
  createMemSetLoop(/* InsertBefore=*/Memset,
                   /* DstAddr=*/Memset->getRawDest(),
                   /* CopyLen=*/Memset->getLength(),
                   /* SetValue=*/Memset->getValue(),
                   /* DstAlign=*/Memset->getDestAlign().valueOrOne(),
                   Memset->isVolatile());
}

void llvm::expandAtomicMemCpyAsLoop(AtomicMemCpyInst *AtomicMemcpy,
                                    const TargetTransformInfo &TTI,
                                    ScalarEvolution *SE) {
  if (ConstantInt *CI = dyn_cast<ConstantInt>(AtomicMemcpy->getLength())) {
    createMemCpyLoopKnownSize(
        /* InsertBefore */ AtomicMemcpy,
        /* SrcAddr */ AtomicMemcpy->getRawSource(),
        /* DstAddr */ AtomicMemcpy->getRawDest(),
        /* CopyLen */ CI,
        /* SrcAlign */ AtomicMemcpy->getSourceAlign().valueOrOne(),
        /* DstAlign */ AtomicMemcpy->getDestAlign().valueOrOne(),
        /* SrcIsVolatile */ AtomicMemcpy->isVolatile(),
        /* DstIsVolatile */ AtomicMemcpy->isVolatile(),
        /* CanOverlap */ false, // SrcAddr & DstAddr may not overlap by spec.
        /* TargetTransformInfo */ TTI,
        /* AtomicElementSize */ AtomicMemcpy->getElementSizeInBytes());
  } else {
    createMemCpyLoopUnknownSize(
        /* InsertBefore */ AtomicMemcpy,
        /* SrcAddr */ AtomicMemcpy->getRawSource(),
        /* DstAddr */ AtomicMemcpy->getRawDest(),
        /* CopyLen */ AtomicMemcpy->getLength(),
        /* SrcAlign */ AtomicMemcpy->getSourceAlign().valueOrOne(),
        /* DstAlign */ AtomicMemcpy->getDestAlign().valueOrOne(),
        /* SrcIsVolatile */ AtomicMemcpy->isVolatile(),
        /* DstIsVolatile */ AtomicMemcpy->isVolatile(),
        /* CanOverlap */ false, // SrcAddr & DstAddr may not overlap by spec.
        /* TargetTransformInfo */ TTI,
        /* AtomicElementSize */ AtomicMemcpy->getElementSizeInBytes());
  }
}