; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; Check that the default expansion threshold works, using both the legacy
; (-pre-isel-intrinsic-lowering) and the -passes= spelling of the pass.
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -pre-isel-intrinsic-lowering %s | FileCheck -check-prefixes=OPT,MAX1024 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=pre-isel-intrinsic-lowering %s | FileCheck -check-prefixes=OPT,MAX1024 %s

; Check that setting the threshold explicitly to 1024 gives the same result as
; the default, and that a threshold of 0 additionally expands the static-size
; calls that the default configuration leaves alone.
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=1024 %s | FileCheck -check-prefixes=OPT,MAX1024 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=0 %s | FileCheck -check-prefixes=OPT,ALL %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=1024 %s | FileCheck -check-prefixes=OPT,MAX1024 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=0 %s | FileCheck -check-prefixes=OPT,ALL %s

declare void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) nocapture, ptr addrspace(1) nocapture readonly, i64, i1) #1
declare void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) nocapture, ptr addrspace(3) nocapture readonly, i32, i1) #1
declare void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) nocapture, ptr addrspace(1) nocapture readonly, i32, i1) #1
declare void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) nocapture, ptr addrspace(5) nocapture readonly, i32, i1) #1
declare void @llvm.memcpy.p3.p3.i32(ptr addrspace(3) nocapture, ptr addrspace(3) nocapture readonly, i32, i1) #1

declare void @llvm.memmove.p1.p1.i64(ptr addrspace(1) nocapture, ptr addrspace(1) nocapture readonly, i64, i1) #1
declare void @llvm.memmove.p1.p3.i32(ptr addrspace(1) nocapture, ptr addrspace(3) nocapture readonly, i32, i1) #1
declare void @llvm.memmove.p0.p3.i32(ptr nocapture writeonly, ptr addrspace(3) nocapture readonly, i32, i1 immarg) #1
declare void @llvm.memmove.p3.p0.i32(ptr addrspace(3) nocapture writeonly, ptr nocapture readonly, i32, i1 immarg) #1
declare void @llvm.memmove.p3.p3.i32(ptr addrspace(3) nocapture writeonly, ptr addrspace(3) nocapture readonly, i32, i1 immarg) #1
declare void @llvm.memmove.p5.p5.i32(ptr addrspace(5) nocapture, ptr addrspace(5) nocapture readonly, i32, i1) #1
declare void @llvm.memmove.p3.p5.i32(ptr addrspace(3) nocapture, ptr addrspace(5) nocapture readonly, i32, i1) #1
declare void @llvm.memmove.p5.p3.i32(ptr addrspace(5) nocapture, ptr addrspace(3) nocapture readonly, i32, i1) #1
declare void @llvm.memmove.p0.p1.i64(ptr nocapture writeonly, ptr addrspace(1) nocapture readonly, i64, i1 immarg) #1
declare void @llvm.memmove.p1.p0.i64(ptr addrspace(1) nocapture writeonly, ptr nocapture readonly, i64, i1 immarg) #1
declare void @llvm.memmove.p5.p1.i64(ptr addrspace(5) nocapture writeonly, ptr addrspace(1) nocapture readonly, i64, i1 immarg) #1
declare void @llvm.memmove.p1.p5.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg) #1
declare void @llvm.memmove.p0.p5.i64(ptr nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg) #1
declare void @llvm.memmove.p5.p0.i64(ptr addrspace(5) nocapture writeonly, ptr nocapture readonly, i64, i1 immarg) #1
declare void @llvm.memmove.p1.p999.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(999) nocapture readonly, i64, i1 immarg) #1
declare void @llvm.memmove.p999.p1.i64(ptr addrspace(999) nocapture writeonly, ptr addrspace(1) nocapture readonly, i64, i1 immarg) #1
declare void @llvm.memmove.p999.p998.i64(ptr addrspace(999) nocapture writeonly, ptr addrspace(998) nocapture readonly, i64, i1 immarg) #1

declare void @llvm.memset.p1.i64(ptr addrspace(1) nocapture, i8, i64, i1) #1

; Largest static size (1024 bytes) that is still left as a call with the
; default threshold; with a threshold of 0 it is expanded into a 256-byte loop.
define amdgpu_kernel void @max_size_small_static_memcpy_caller0(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 {
; MAX1024-LABEL: @max_size_small_static_memcpy_caller0(
; MAX1024-NEXT:    call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]], i64 1024, i1 false)
; MAX1024-NEXT:    ret void
;
; ALL-LABEL: @max_size_small_static_memcpy_caller0(
; ALL-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; ALL:       load-store-loop:
; ALL-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; ALL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
; ALL-NEXT:    [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 1
; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
; ALL-NEXT:    store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 1
; ALL-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 256
; ALL-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
; ALL-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; ALL:       memcpy-split:
; ALL-NEXT:    ret void
;
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 1024, i1 false)
  ret void
}

; Smallest static size (1025 bytes) which is expanded in every configuration:
; a 256-byte loop followed by a single i8 copy for the remaining byte.
define amdgpu_kernel void @min_size_large_static_memcpy_caller0(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 {
; OPT-LABEL: @min_size_large_static_memcpy_caller0(
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 1
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 1
; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 256
; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
; OPT-NEXT:    [[TMP7:%.*]] = load i8, ptr addrspace(1) [[TMP6]], align 1
; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
; OPT-NEXT:    store i8 [[TMP7]], ptr addrspace(1) [[TMP8]], align 1
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 1025, i1 false)
  ret void
}

; 1024-byte memmove: left as a call with the default threshold; with a
; threshold of 0 it becomes a src/dst comparison selecting a backward or a
; forward 256-byte copy loop.
define amdgpu_kernel void @max_size_small_static_memmove_caller0(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 {
; MAX1024-LABEL: @max_size_small_static_memmove_caller0(
; MAX1024-NEXT:    call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]], i64 1024, i1 false)
; MAX1024-NEXT:    ret void
;
; ALL-LABEL: @max_size_small_static_memmove_caller0(
; ALL-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[DST:%.*]]
; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
; ALL:       memmove_bwd_loop:
; ALL-NEXT:    [[TMP1:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 1024, [[TMP0:%.*]] ]
; ALL-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP1]], 256
; ALL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
; ALL-NEXT:    [[ELEMENT:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP2]], align 1
; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
; ALL-NEXT:    store <64 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1
; ALL-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
; ALL-NEXT:    br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
; ALL:       memmove_fwd_loop:
; ALL-NEXT:    [[FWD_INDEX:%.*]] = phi i64 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
; ALL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
; ALL-NEXT:    [[ELEMENT1:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP5]], align 1
; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
; ALL-NEXT:    store <64 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP6]], align 1
; ALL-NEXT:    [[TMP7]] = add i64 [[FWD_INDEX]], 256
; ALL-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 1024
; ALL-NEXT:    br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
; ALL:       memmove_done:
; ALL-NEXT:    ret void
;
  call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 1024, i1 false)
  ret void
}

; 1025-byte memmove: expanded in every configuration. The extra byte is copied
; by a residual block that runs before the backward loop and after the forward
; loop.
define amdgpu_kernel void @min_size_large_static_memmove_caller0(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 {
; OPT-LABEL: @min_size_large_static_memmove_caller0(
; OPT-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[DST:%.*]]
; OPT-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_RESIDUAL:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
; OPT:       memmove_bwd_residual:
; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
; OPT-NEXT:    [[TMP2:%.*]] = load i8, ptr addrspace(1) [[TMP1]], align 1
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
; OPT-NEXT:    store i8 [[TMP2]], ptr addrspace(1) [[TMP3]], align 1
; OPT-NEXT:    br label [[MEMMOVE_BWD_LOOP:%.*]]
; OPT:       memmove_bwd_loop:
; OPT-NEXT:    [[TMP4:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 1024, [[MEMMOVE_BWD_RESIDUAL]] ]
; OPT-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP4]], 256
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
; OPT-NEXT:    [[ELEMENT:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP5]], align 1
; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
; OPT-NEXT:    store <64 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP6]], align 1
; OPT-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
; OPT-NEXT:    br i1 [[TMP7]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
; OPT:       memmove_fwd_loop:
; OPT-NEXT:    [[FWD_INDEX:%.*]] = phi i64 [ [[TMP10:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0:%.*]] ]
; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
; OPT-NEXT:    [[ELEMENT1:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP8]], align 1
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
; OPT-NEXT:    store <64 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP9]], align 1
; OPT-NEXT:    [[TMP10]] = add i64 [[FWD_INDEX]], 256
; OPT-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[TMP10]], 1024
; OPT-NEXT:    br i1 [[TMP11]], label [[MEMMOVE_FWD_RESIDUAL:%.*]], label [[MEMMOVE_FWD_LOOP]]
; OPT:       memmove_fwd_residual:
; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
; OPT-NEXT:    [[TMP13:%.*]] = load i8, ptr addrspace(1) [[TMP12]], align 1
; OPT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
; OPT-NEXT:    store i8 [[TMP13]], ptr addrspace(1) [[TMP14]], align 1
; OPT-NEXT:    br label [[MEMMOVE_DONE]]
; OPT:       memmove_done:
; OPT-NEXT:    ret void
;
  call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 1025, i1 false)
  ret void
}

; 1024-byte memset: left as a call with the default threshold; with a
; threshold of 0 it is expanded into a byte-at-a-time store loop.
define amdgpu_kernel void @max_size_small_static_memset_caller0(ptr addrspace(1) %dst, i8 %val) #0 {
; MAX1024-LABEL: @max_size_small_static_memset_caller0(
; MAX1024-NEXT:    call void @llvm.memset.p1.i64(ptr addrspace(1) [[DST:%.*]], i8 [[VAL:%.*]], i64 1024, i1 false)
; MAX1024-NEXT:    ret void
;
; ALL-LABEL: @max_size_small_static_memset_caller0(
; ALL-NEXT:    br i1 false, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]]
; ALL:       loadstoreloop:
; ALL-NEXT:    [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
; ALL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[TMP1]]
; ALL-NEXT:    store i8 [[VAL:%.*]], ptr addrspace(1) [[TMP2]], align 1
; ALL-NEXT:    [[TMP3]] = add i64 [[TMP1]], 1
; ALL-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 1024
; ALL-NEXT:    br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
; ALL:       split:
; ALL-NEXT:    ret void
;
  call void @llvm.memset.p1.i64(ptr addrspace(1) %dst, i8 %val, i64 1024, i1 false)
  ret void
}

; 1025-byte memset: expanded into a byte-at-a-time store loop in every
; configuration.
define amdgpu_kernel void @min_size_large_static_memset_caller0(ptr addrspace(1) %dst, i8 %val) #0 {
; OPT-LABEL: @min_size_large_static_memset_caller0(
; OPT-NEXT:    br i1 false, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]]
; OPT:       loadstoreloop:
; OPT-NEXT:    [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
; OPT-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[TMP1]]
; OPT-NEXT:    store i8 [[VAL:%.*]], ptr addrspace(1) [[TMP2]], align 1
; OPT-NEXT:    [[TMP3]] = add i64 [[TMP1]], 1
; OPT-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 1025
; OPT-NEXT:    br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
; OPT:       split:
; OPT-NEXT:    ret void
;
  call void @llvm.memset.p1.i64(ptr addrspace(1) %dst, i8 %val, i64 1025, i1 false)
  ret void
}

; A copy with a non-constant size is expanded in every configuration: a main
; loop moving 16 bytes (<4 x i32>) per iteration, then a byte-wise residual
; loop for the low four bits of the size.
define amdgpu_kernel void @variable_memcpy_caller0(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n) #0 {
; OPT-LABEL: @variable_memcpy_caller0(
; OPT-NEXT:    [[TMP2:%.*]] = and i64 [[N:%.*]], 15
; OPT-NEXT:    [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0
; OPT-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT:       loop-memcpy-expansion:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1
; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <4 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 1
; OPT-NEXT:    [[TMP8]] = add i64 [[LOOP_INDEX]], 16
; OPT-NEXT:    [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]]
; OPT-NEXT:    br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT:       loop-memcpy-residual:
; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT:    [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP10]]
; OPT-NEXT:    [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1
; OPT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[TMP10]]
; OPT-NEXT:    store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1
; OPT-NEXT:    [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]]
; OPT-NEXT:    br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT:       post-loop-memcpy-expansion:
; OPT-NEXT:    ret void
; OPT:       loop-memcpy-residual-header:
; OPT-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0
; OPT-NEXT:    br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n, i1 false)
  ret void
}

; Same call as in variable_memcpy_caller0, placed in a second function; the
; expected expansion is identical.
define amdgpu_kernel void @variable_memcpy_caller1(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n) #0 {
; OPT-LABEL: @variable_memcpy_caller1(
; OPT-NEXT:    [[TMP2:%.*]] = and i64 [[N:%.*]], 15
; OPT-NEXT:    [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0
; OPT-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT:       loop-memcpy-expansion:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1
; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <4 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 1
; OPT-NEXT:    [[TMP8]] = add i64 [[LOOP_INDEX]], 16
; OPT-NEXT:    [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]]
; OPT-NEXT:    br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT:       loop-memcpy-residual:
; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT:    [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP10]]
; OPT-NEXT:    [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1
; OPT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[TMP10]]
; OPT-NEXT:    store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1
; OPT-NEXT:    [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]]
; OPT-NEXT:    br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT:       post-loop-memcpy-expansion:
; OPT-NEXT:    ret void
; OPT:       loop-memcpy-residual-header:
; OPT-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0
; OPT-NEXT:    br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n, i1 false)
  ret void
}

; Two variable-size copies from the same source in one function; both are
; expanded, the second expansion starting in the exit block of the first.
define amdgpu_kernel void @memcpy_multi_use_one_function(ptr addrspace(1) %dst0, ptr addrspace(1) %dst1, ptr addrspace(1) %src, i64 %n, i64 %m) #0 {
; OPT-LABEL: @memcpy_multi_use_one_function(
; OPT-NEXT:    [[TMP2:%.*]] = and i64 [[N:%.*]], 15
; OPT-NEXT:    [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0
; OPT-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION2:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER5:%.*]]
; OPT:       loop-memcpy-expansion2:
; OPT-NEXT:    [[LOOP_INDEX3:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION2]] ]
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX3]]
; OPT-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1
; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST0:%.*]], i64 [[LOOP_INDEX3]]
; OPT-NEXT:    store <4 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 1
; OPT-NEXT:    [[TMP8]] = add i64 [[LOOP_INDEX3]], 16
; OPT-NEXT:    [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]]
; OPT-NEXT:    br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION2]], label [[LOOP_MEMCPY_RESIDUAL_HEADER5]]
; OPT:       loop-memcpy-residual4:
; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX6:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER5]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL4:%.*]] ]
; OPT-NEXT:    [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX6]]
; OPT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP10]]
; OPT-NEXT:    [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1
; OPT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST0]], i64 [[TMP10]]
; OPT-NEXT:    store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1
; OPT-NEXT:    [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX6]], 1
; OPT-NEXT:    [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]]
; OPT-NEXT:    br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL4]], label [[POST_LOOP_MEMCPY_EXPANSION1:%.*]]
; OPT:       post-loop-memcpy-expansion1:
; OPT-NEXT:    [[TMP17:%.*]] = and i64 [[M:%.*]], 15
; OPT-NEXT:    [[TMP18:%.*]] = sub i64 [[M]], [[TMP17]]
; OPT-NEXT:    [[TMP19:%.*]] = icmp ne i64 [[TMP18]], 0
; OPT-NEXT:    br i1 [[TMP19]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT:       loop-memcpy-expansion:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[POST_LOOP_MEMCPY_EXPANSION1]] ], [ [[TMP23:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP21:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP20]], align 1
; OPT-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <4 x i32> [[TMP21]], ptr addrspace(1) [[TMP22]], align 1
; OPT-NEXT:    [[TMP23]] = add i64 [[LOOP_INDEX]], 16
; OPT-NEXT:    [[TMP24:%.*]] = icmp ult i64 [[TMP23]], [[TMP18]]
; OPT-NEXT:    br i1 [[TMP24]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT:       loop-memcpy-residual:
; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP29:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT:    [[TMP25:%.*]] = add i64 [[TMP18]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP25]]
; OPT-NEXT:    [[TMP27:%.*]] = load i8, ptr addrspace(1) [[TMP26]], align 1
; OPT-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1]], i64 [[TMP25]]
; OPT-NEXT:    store i8 [[TMP27]], ptr addrspace(1) [[TMP28]], align 1
; OPT-NEXT:    [[TMP29]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP30:%.*]] = icmp ult i64 [[TMP29]], [[TMP17]]
; OPT-NEXT:    br i1 [[TMP30]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT:       post-loop-memcpy-expansion:
; OPT-NEXT:    ret void
; OPT:       loop-memcpy-residual-header:
; OPT-NEXT:    [[TMP31:%.*]] = icmp ne i64 [[TMP17]], 0
; OPT-NEXT:    br i1 [[TMP31]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
; OPT:       loop-memcpy-residual-header5:
; OPT-NEXT:    [[TMP32:%.*]] = icmp ne i64 [[TMP2]], 0
; OPT-NEXT:    br i1 [[TMP32]], label [[LOOP_MEMCPY_RESIDUAL4]], label [[POST_LOOP_MEMCPY_EXPANSION1]]
;
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst0, ptr addrspace(1) %src, i64 %n, i1 false)
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst1, ptr addrspace(1) %src, i64 %m, i1 false)
  ret void
}

; Variable-size copy from addrspace(3) to addrspace(1) with an i32 length:
; the main loop moves 8 bytes (<2 x i32>) per iteration and the index
; arithmetic is done in i32.
define amdgpu_kernel void @memcpy_alt_type(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_alt_type(
; OPT-NEXT:    [[TMP2:%.*]] = and i32 [[N:%.*]], 7
; OPT-NEXT:    [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT:       loop-memcpy-expansion:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 1
; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i32 [[LOOP_INDEX]]
; OPT-NEXT:    store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 1
; OPT-NEXT:    [[TMP8]] = add i32 [[LOOP_INDEX]], 8
; OPT-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
; OPT-NEXT:    br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT:       loop-memcpy-residual:
; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT:    [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]]
; OPT-NEXT:    [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1
; OPT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i32 [[TMP10]]
; OPT-NEXT:    store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1
; OPT-NEXT:    [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]]
; OPT-NEXT:    br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT:       post-loop-memcpy-expansion:
; OPT-NEXT:    ret void
; OPT:       loop-memcpy-residual-header:
; OPT-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0
; OPT-NEXT:    br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %n, i1 false)
  ret void
}

; One of the uses in the function should be expanded, the other left alone.
; The variable-size copy to %dst0 is expanded in every configuration. The
; 102-byte copy to %dst1 stays a call with the default threshold; with a
; threshold of 0 it is unrolled into six <4 x i32> copies plus an i32 and an
; i16 copy.
define amdgpu_kernel void @memcpy_multi_use_one_function_keep_small(ptr addrspace(1) %dst0, ptr addrspace(1) %dst1, ptr addrspace(1) %src, i64 %n) #0 {
; MAX1024-LABEL: @memcpy_multi_use_one_function_keep_small(
; MAX1024-NEXT:    [[TMP2:%.*]] = and i64 [[N:%.*]], 15
; MAX1024-NEXT:    [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
; MAX1024-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0
; MAX1024-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; MAX1024:       loop-memcpy-expansion:
; MAX1024-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; MAX1024-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
; MAX1024-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1
; MAX1024-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST0:%.*]], i64 [[LOOP_INDEX]]
; MAX1024-NEXT:    store <4 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 1
; MAX1024-NEXT:    [[TMP8]] = add i64 [[LOOP_INDEX]], 16
; MAX1024-NEXT:    [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]]
; MAX1024-NEXT:    br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; MAX1024:       loop-memcpy-residual:
; MAX1024-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; MAX1024-NEXT:    [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
; MAX1024-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP10]]
; MAX1024-NEXT:    [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1
; MAX1024-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST0]], i64 [[TMP10]]
; MAX1024-NEXT:    store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1
; MAX1024-NEXT:    [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; MAX1024-NEXT:    [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]]
; MAX1024-NEXT:    br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; MAX1024:       post-loop-memcpy-expansion:
; MAX1024-NEXT:    call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST1:%.*]], ptr addrspace(1) [[SRC]], i64 102, i1 false)
; MAX1024-NEXT:    ret void
; MAX1024:       loop-memcpy-residual-header:
; MAX1024-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0
; MAX1024-NEXT:    br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
; ALL-LABEL: @memcpy_multi_use_one_function_keep_small(
; ALL-NEXT:    [[TMP2:%.*]] = and i64 [[N:%.*]], 15
; ALL-NEXT:    [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
; ALL-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0
; ALL-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; ALL:       loop-memcpy-expansion:
; ALL-NEXT:    [[LOOP_INDEX1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; ALL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX1]]
; ALL-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1
; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST0:%.*]], i64 [[LOOP_INDEX1]]
; ALL-NEXT:    store <4 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 1
; ALL-NEXT:    [[TMP8]] = add i64 [[LOOP_INDEX1]], 16
; ALL-NEXT:    [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]]
; ALL-NEXT:    br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; ALL:       loop-memcpy-residual:
; ALL-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; ALL-NEXT:    [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
; ALL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP10]]
; ALL-NEXT:    [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1
; ALL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST0]], i64 [[TMP10]]
; ALL-NEXT:    store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1
; ALL-NEXT:    [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; ALL-NEXT:    [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]]
; ALL-NEXT:    br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; ALL:       post-loop-memcpy-expansion:
; ALL-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 0
; ALL-NEXT:    [[TMP17:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP16]], align 1
; ALL-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1:%.*]], i64 0
; ALL-NEXT:    store <4 x i32> [[TMP17]], ptr addrspace(1) [[TMP18]], align 1
; ALL-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 16
; ALL-NEXT:    [[TMP19:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP33]], align 1
; ALL-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1]], i64 16
; ALL-NEXT:    store <4 x i32> [[TMP19]], ptr addrspace(1) [[TMP20]], align 1
; ALL-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 32
; ALL-NEXT:    [[TMP35:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP34]], align 1
; ALL-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1]], i64 32
; ALL-NEXT:    store <4 x i32> [[TMP35]], ptr addrspace(1) [[TMP36]], align 1
; ALL-NEXT:    [[TMP37:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 48
; ALL-NEXT:    [[TMP38:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP37]], align 1
; ALL-NEXT:    [[TMP39:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1]], i64 48
; ALL-NEXT:    store <4 x i32> [[TMP38]], ptr addrspace(1) [[TMP39]], align 1
; ALL-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 64
; ALL-NEXT:    [[TMP28:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP40]], align 1
; ALL-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1]], i64 64
; ALL-NEXT:    store <4 x i32> [[TMP28]], ptr addrspace(1) [[TMP29]], align 1
; ALL-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 80
; ALL-NEXT:    [[TMP31:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP30]], align 1
; ALL-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1]], i64 80
; ALL-NEXT:    store <4 x i32> [[TMP31]], ptr addrspace(1) [[TMP32]], align 1
; ALL-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 96
; ALL-NEXT:    [[TMP22:%.*]] = load i32, ptr addrspace(1) [[TMP21]], align 1
; ALL-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1]], i64 96
; ALL-NEXT:    store i32 [[TMP22]], ptr addrspace(1) [[TMP23]], align 1
; ALL-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 100
; ALL-NEXT:    [[TMP25:%.*]] = load i16, ptr addrspace(1) [[TMP24]], align 1
; ALL-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1]], i64 100
; ALL-NEXT:    store i16 [[TMP25]], ptr addrspace(1) [[TMP26]], align 1
; ALL-NEXT:    ret void
; ALL:       loop-memcpy-residual-header:
; ALL-NEXT:    [[TMP27:%.*]] = icmp ne i64 [[TMP2]], 0
; ALL-NEXT:    br i1 [[TMP27]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst0, ptr addrspace(1) %src, i64 %n, i1 false)
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst1, ptr addrspace(1) %src, i64 102, i1 false)
  ret void
}

; 1028-byte copy with call-site align 4 on both pointers: a 256-byte loop up
; to offset 1024, then one i32 copy for the remaining 4 bytes. The loads and
; stores carry align 4.
define amdgpu_kernel void @memcpy_global_align4_global_align4_1028(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_1028(
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 256
; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
; OPT-NEXT:    [[TMP7:%.*]] = load i32, ptr addrspace(1) [[TMP6]], align 4
; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
; OPT-NEXT:    store i32 [[TMP7]], ptr addrspace(1) [[TMP8]], align 4
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 %dst, ptr addrspace(1) align 4 %src, i64 1028, i1 false)
  ret void
}

; Same as the 1028-byte case but with 1025 bytes: the remaining byte is
; copied with a single i8 load/store.
define amdgpu_kernel void @memcpy_global_align4_global_align4_1025(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_1025(
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 256
; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
; OPT-NEXT:    [[TMP7:%.*]] = load i8, ptr addrspace(1) [[TMP6]], align 4
; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
; OPT-NEXT:    store i8 [[TMP7]], ptr addrspace(1) [[TMP8]], align 4
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 %dst, ptr addrspace(1) align 4 %src, i64 1025, i1 false)
  ret void
}

; Same as the 1028-byte case but with 1026 bytes: the remaining 2 bytes are
; copied with a single i16 load/store.
define amdgpu_kernel void @memcpy_global_align4_global_align4_1026(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_1026(
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 256
; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
; OPT-NEXT:    [[TMP7:%.*]] = load i16, ptr addrspace(1) [[TMP6]], align 4
; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
; OPT-NEXT:    store i16 [[TMP7]], ptr addrspace(1) [[TMP8]], align 4
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 %dst, ptr addrspace(1) align 4 %src, i64 1026, i1 false)
ret void 533} 534 535define amdgpu_kernel void @memcpy_global_align4_global_align4_1032(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { 536; OPT-LABEL: @memcpy_global_align4_global_align4_1032( 537; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] 538; OPT: load-store-loop: 539; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] 540; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] 541; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4 542; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]] 543; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 544; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 545; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 546; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] 547; OPT: memcpy-split: 548; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024 549; OPT-NEXT: [[TMP7:%.*]] = load i64, ptr addrspace(1) [[TMP6]], align 4 550; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024 551; OPT-NEXT: store i64 [[TMP7]], ptr addrspace(1) [[TMP8]], align 4 552; OPT-NEXT: ret void 553; 554 call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 %dst, ptr addrspace(1) align 4 %src, i64 1032, i1 false) 555 ret void 556} 557 558define amdgpu_kernel void @memcpy_global_align4_global_align4_1034(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { 559; OPT-LABEL: @memcpy_global_align4_global_align4_1034( 560; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] 561; OPT: load-store-loop: 562; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] 563; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] 564; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) 
[[TMP1]], align 4 565; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]] 566; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 567; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 568; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 569; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] 570; OPT: memcpy-split: 571; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024 572; OPT-NEXT: [[TMP7:%.*]] = load i64, ptr addrspace(1) [[TMP6]], align 4 573; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024 574; OPT-NEXT: store i64 [[TMP7]], ptr addrspace(1) [[TMP8]], align 4 575; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1032 576; OPT-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(1) [[TMP9]], align 4 577; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1032 578; OPT-NEXT: store i16 [[TMP10]], ptr addrspace(1) [[TMP11]], align 4 579; OPT-NEXT: ret void 580; 581 call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 %dst, ptr addrspace(1) align 4 %src, i64 1034, i1 false) 582 ret void 583} 584 585define amdgpu_kernel void @memcpy_global_align4_global_align4_1035(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { 586; OPT-LABEL: @memcpy_global_align4_global_align4_1035( 587; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] 588; OPT: load-store-loop: 589; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] 590; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] 591; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4 592; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]] 593; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 594; OPT-NEXT: 
[[TMP4]] = add i64 [[LOOP_INDEX]], 256 595; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 596; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] 597; OPT: memcpy-split: 598; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024 599; OPT-NEXT: [[TMP7:%.*]] = load i64, ptr addrspace(1) [[TMP6]], align 4 600; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024 601; OPT-NEXT: store i64 [[TMP7]], ptr addrspace(1) [[TMP8]], align 4 602; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1032 603; OPT-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(1) [[TMP9]], align 4 604; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1032 605; OPT-NEXT: store i16 [[TMP10]], ptr addrspace(1) [[TMP11]], align 4 606; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1034 607; OPT-NEXT: [[TMP13:%.*]] = load i8, ptr addrspace(1) [[TMP12]], align 2 608; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1034 609; OPT-NEXT: store i8 [[TMP13]], ptr addrspace(1) [[TMP14]], align 2 610; OPT-NEXT: ret void 611; 612 call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 %dst, ptr addrspace(1) align 4 %src, i64 1035, i1 false) 613 ret void 614} 615 616define amdgpu_kernel void @memcpy_global_align4_global_align4_1036(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { 617; OPT-LABEL: @memcpy_global_align4_global_align4_1036( 618; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] 619; OPT: load-store-loop: 620; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] 621; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] 622; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4 623; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) 
[[DST:%.*]], i64 [[LOOP_INDEX]] 624; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 625; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 626; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 627; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] 628; OPT: memcpy-split: 629; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024 630; OPT-NEXT: [[TMP7:%.*]] = load i64, ptr addrspace(1) [[TMP6]], align 4 631; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024 632; OPT-NEXT: store i64 [[TMP7]], ptr addrspace(1) [[TMP8]], align 4 633; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1032 634; OPT-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) [[TMP9]], align 4 635; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1032 636; OPT-NEXT: store i32 [[TMP10]], ptr addrspace(1) [[TMP11]], align 4 637; OPT-NEXT: ret void 638; 639 call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 %dst, ptr addrspace(1) align 4 %src, i64 1036, i1 false) 640 ret void 641} 642 643define amdgpu_kernel void @memcpy_global_align4_global_align4_1039(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { 644; OPT-LABEL: @memcpy_global_align4_global_align4_1039( 645; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] 646; OPT: load-store-loop: 647; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] 648; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] 649; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4 650; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]] 651; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 652; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 653; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 
1024 654; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] 655; OPT: memcpy-split: 656; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024 657; OPT-NEXT: [[TMP7:%.*]] = load i64, ptr addrspace(1) [[TMP6]], align 4 658; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024 659; OPT-NEXT: store i64 [[TMP7]], ptr addrspace(1) [[TMP8]], align 4 660; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1032 661; OPT-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) [[TMP9]], align 4 662; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1032 663; OPT-NEXT: store i32 [[TMP10]], ptr addrspace(1) [[TMP11]], align 4 664; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1036 665; OPT-NEXT: [[TMP13:%.*]] = load i16, ptr addrspace(1) [[TMP12]], align 4 666; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1036 667; OPT-NEXT: store i16 [[TMP13]], ptr addrspace(1) [[TMP14]], align 4 668; OPT-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1038 669; OPT-NEXT: [[TMP16:%.*]] = load i8, ptr addrspace(1) [[TMP15]], align 2 670; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1038 671; OPT-NEXT: store i8 [[TMP16]], ptr addrspace(1) [[TMP17]], align 2 672; OPT-NEXT: ret void 673; 674 call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 %dst, ptr addrspace(1) align 4 %src, i64 1039, i1 false) 675 ret void 676} 677 678define amdgpu_kernel void @memcpy_global_align2_global_align2_1039(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { 679; OPT-LABEL: @memcpy_global_align2_global_align2_1039( 680; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] 681; OPT: load-store-loop: 682; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] 683; OPT-NEXT: [[TMP1:%.*]] 
= getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] 684; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(1) [[TMP1]], align 2 685; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]] 686; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(1) [[TMP3]], align 2 687; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 2 688; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1038 689; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] 690; OPT: memcpy-split: 691; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1038 692; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(1) [[TMP6]], align 2 693; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1038 694; OPT-NEXT: store i8 [[TMP7]], ptr addrspace(1) [[TMP8]], align 2 695; OPT-NEXT: ret void 696; 697 call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 2 %dst, ptr addrspace(1) align 2 %src, i64 1039, i1 false) 698 ret void 699} 700 701define amdgpu_kernel void @memcpy_global_align4_global_align4_1027(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { 702; OPT-LABEL: @memcpy_global_align4_global_align4_1027( 703; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] 704; OPT: load-store-loop: 705; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] 706; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] 707; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4 708; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]] 709; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 710; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 711; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 712; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] 713; OPT: memcpy-split: 714; 
OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024 715; OPT-NEXT: [[TMP7:%.*]] = load i16, ptr addrspace(1) [[TMP6]], align 4 716; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024 717; OPT-NEXT: store i16 [[TMP7]], ptr addrspace(1) [[TMP8]], align 4 718; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1026 719; OPT-NEXT: [[TMP10:%.*]] = load i8, ptr addrspace(1) [[TMP9]], align 2 720; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1026 721; OPT-NEXT: store i8 [[TMP10]], ptr addrspace(1) [[TMP11]], align 2 722; OPT-NEXT: ret void 723; 724 call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 %dst, ptr addrspace(1) align 4 %src, i64 1027, i1 false) 725 ret void 726} 727 728define amdgpu_kernel void @memcpy_global_align2_global_align4_1027(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { 729; OPT-LABEL: @memcpy_global_align2_global_align4_1027( 730; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] 731; OPT: load-store-loop: 732; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] 733; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] 734; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(1) [[TMP1]], align 2 735; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]] 736; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(1) [[TMP3]], align 2 737; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 2 738; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1026 739; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] 740; OPT: memcpy-split: 741; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1026 742; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(1) [[TMP6]], align 2 743; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) 
[[DST]], i64 1026 744; OPT-NEXT: store i8 [[TMP7]], ptr addrspace(1) [[TMP8]], align 2 745; OPT-NEXT: ret void 746; 747 call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 2 %dst, ptr addrspace(1) align 4 %src, i64 1027, i1 false) 748 ret void 749} 750 751define amdgpu_kernel void @memcpy_global_align4_global_align2_1027(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { 752; OPT-LABEL: @memcpy_global_align4_global_align2_1027( 753; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] 754; OPT: load-store-loop: 755; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] 756; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] 757; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(1) [[TMP1]], align 2 758; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]] 759; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(1) [[TMP3]], align 2 760; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 2 761; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1026 762; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] 763; OPT: memcpy-split: 764; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1026 765; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(1) [[TMP6]], align 2 766; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1026 767; OPT-NEXT: store i8 [[TMP7]], ptr addrspace(1) [[TMP8]], align 2 768; OPT-NEXT: ret void 769; 770 call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 %dst, ptr addrspace(1) align 2 %src, i64 1027, i1 false) 771 ret void 772} 773 774define amdgpu_kernel void @memcpy_private_align4_private_align4_1027(ptr addrspace(5) %dst, ptr addrspace(5) %src) #0 { 775; OPT-LABEL: @memcpy_private_align4_private_align4_1027( 776; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] 777; OPT: load-store-loop: 778; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, 
[[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] 779; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]] 780; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 4 781; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]] 782; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 4 783; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 784; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024 785; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] 786; OPT: memcpy-split: 787; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1024 788; OPT-NEXT: [[TMP7:%.*]] = load i16, ptr addrspace(5) [[TMP6]], align 4 789; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1024 790; OPT-NEXT: store i16 [[TMP7]], ptr addrspace(5) [[TMP8]], align 4 791; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1026 792; OPT-NEXT: [[TMP10:%.*]] = load i8, ptr addrspace(5) [[TMP9]], align 2 793; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1026 794; OPT-NEXT: store i8 [[TMP10]], ptr addrspace(5) [[TMP11]], align 2 795; OPT-NEXT: ret void 796; 797 call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) align 4 %dst, ptr addrspace(5) align 4 %src, i32 1027, i1 false) 798 ret void 799} 800 801define amdgpu_kernel void @memcpy_private_align2_private_align4_1027(ptr addrspace(5) %dst, ptr addrspace(5) %src) #0 { 802; OPT-LABEL: @memcpy_private_align2_private_align4_1027( 803; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] 804; OPT: load-store-loop: 805; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] 806; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]] 807; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(5) 
[[TMP1]], align 2 808; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]] 809; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(5) [[TMP3]], align 2 810; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 2 811; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1026 812; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] 813; OPT: memcpy-split: 814; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1026 815; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(5) [[TMP6]], align 2 816; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1026 817; OPT-NEXT: store i8 [[TMP7]], ptr addrspace(5) [[TMP8]], align 2 818; OPT-NEXT: ret void 819; 820 call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) align 2 %dst, ptr addrspace(5) align 4 %src, i32 1027, i1 false) 821 ret void 822} 823 824define amdgpu_kernel void @memcpy_private_align1_private_align4_1027(ptr addrspace(5) %dst, ptr addrspace(5) %src) #0 { 825; OPT-LABEL: @memcpy_private_align1_private_align4_1027( 826; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] 827; OPT: load-store-loop: 828; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] 829; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]] 830; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 4 831; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]] 832; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 1 833; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 834; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024 835; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] 836; OPT: memcpy-split: 837; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1024 838; OPT-NEXT: [[TMP7:%.*]] = load i16, ptr 
addrspace(5) [[TMP6]], align 4 839; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1024 840; OPT-NEXT: store i16 [[TMP7]], ptr addrspace(5) [[TMP8]], align 1 841; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1026 842; OPT-NEXT: [[TMP10:%.*]] = load i8, ptr addrspace(5) [[TMP9]], align 2 843; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1026 844; OPT-NEXT: store i8 [[TMP10]], ptr addrspace(5) [[TMP11]], align 1 845; OPT-NEXT: ret void 846; 847 call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align 4 %src, i32 1027, i1 false) 848 ret void 849} 850 851define amdgpu_kernel void @memcpy_private_align4_private_align2_1027(ptr addrspace(5) %dst, ptr addrspace(5) %src) #0 { 852; OPT-LABEL: @memcpy_private_align4_private_align2_1027( 853; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] 854; OPT: load-store-loop: 855; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] 856; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]] 857; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(5) [[TMP1]], align 2 858; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]] 859; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(5) [[TMP3]], align 2 860; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 2 861; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1026 862; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] 863; OPT: memcpy-split: 864; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1026 865; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(5) [[TMP6]], align 2 866; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1026 867; OPT-NEXT: store i8 [[TMP7]], ptr addrspace(5) [[TMP8]], align 2 868; OPT-NEXT: ret void 869; 870 call 
void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) align 4 %dst, ptr addrspace(5) align 2 %src, i32 1027, i1 false) 871 ret void 872} 873 874define amdgpu_kernel void @memcpy_private_align4_private_align1_1027(ptr addrspace(5) %dst, ptr addrspace(5) %src) #0 { 875; OPT-LABEL: @memcpy_private_align4_private_align1_1027( 876; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] 877; OPT: load-store-loop: 878; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] 879; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]] 880; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 1 881; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]] 882; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 4 883; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 884; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024 885; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] 886; OPT: memcpy-split: 887; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1024 888; OPT-NEXT: [[TMP7:%.*]] = load i16, ptr addrspace(5) [[TMP6]], align 1 889; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1024 890; OPT-NEXT: store i16 [[TMP7]], ptr addrspace(5) [[TMP8]], align 4 891; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1026 892; OPT-NEXT: [[TMP10:%.*]] = load i8, ptr addrspace(5) [[TMP9]], align 1 893; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1026 894; OPT-NEXT: store i8 [[TMP10]], ptr addrspace(5) [[TMP11]], align 2 895; OPT-NEXT: ret void 896; 897 call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) align 4 %dst, ptr addrspace(5) align 1 %src, i32 1027, i1 false) 898 ret void 899} 900 901define amdgpu_kernel void @memcpy_private_align2_private_align2_1027(ptr 
addrspace(5) %dst, ptr addrspace(5) %src) #0 { 902; OPT-LABEL: @memcpy_private_align2_private_align2_1027( 903; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] 904; OPT: load-store-loop: 905; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] 906; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]] 907; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(5) [[TMP1]], align 2 908; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]] 909; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(5) [[TMP3]], align 2 910; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 2 911; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1026 912; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] 913; OPT: memcpy-split: 914; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1026 915; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(5) [[TMP6]], align 2 916; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1026 917; OPT-NEXT: store i8 [[TMP7]], ptr addrspace(5) [[TMP8]], align 2 918; OPT-NEXT: ret void 919; 920 call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) align 2 %dst, ptr addrspace(5) align 2 %src, i32 1027, i1 false) 921 ret void 922} 923 924define amdgpu_kernel void @memcpy_global_align4_global_align4_variable(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n) #0 { 925; OPT-LABEL: @memcpy_global_align4_global_align4_variable( 926; OPT-NEXT: [[TMP2:%.*]] = and i64 [[N:%.*]], 15 927; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]] 928; OPT-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0 929; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] 930; OPT: loop-memcpy-expansion: 931; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] 932; OPT-NEXT: [[TMP5:%.*]] = 
getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] 933; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 4 934; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]] 935; OPT-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4 936; OPT-NEXT: [[TMP8]] = add i64 [[LOOP_INDEX]], 16 937; OPT-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]] 938; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] 939; OPT: loop-memcpy-residual: 940; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] 941; OPT-NEXT: [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] 942; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP10]] 943; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1 944; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[TMP10]] 945; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1 946; OPT-NEXT: [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1 947; OPT-NEXT: [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]] 948; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] 949; OPT: post-loop-memcpy-expansion: 950; OPT-NEXT: ret void 951; OPT: loop-memcpy-residual-header: 952; OPT-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0 953; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] 954; 955 call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 %dst, ptr addrspace(1) align 4 %src, i64 %n, i1 false) 956 ret void 957} 958 959define amdgpu_kernel void @memcpy_global_align2_global_align2_variable(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n) #0 { 960; OPT-LABEL: @memcpy_global_align2_global_align2_variable( 961; OPT-NEXT: [[TMP2:%.*]] 
= and i64 [[N:%.*]], 1 962; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]] 963; OPT-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0 964; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] 965; OPT: loop-memcpy-expansion: 966; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] 967; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] 968; OPT-NEXT: [[TMP6:%.*]] = load i16, ptr addrspace(1) [[TMP5]], align 2 969; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]] 970; OPT-NEXT: store i16 [[TMP6]], ptr addrspace(1) [[TMP7]], align 2 971; OPT-NEXT: [[TMP8]] = add i64 [[LOOP_INDEX]], 2 972; OPT-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]] 973; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] 974; OPT: loop-memcpy-residual: 975; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] 976; OPT-NEXT: [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] 977; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP10]] 978; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1 979; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[TMP10]] 980; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1 981; OPT-NEXT: [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1 982; OPT-NEXT: [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]] 983; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] 984; OPT: post-loop-memcpy-expansion: 985; OPT-NEXT: ret void 986; OPT: loop-memcpy-residual-header: 987; OPT-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0 988; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], 
label [[POST_LOOP_MEMCPY_EXPANSION]] 989; 990 call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 2 %dst, ptr addrspace(1) align 2 %src, i64 %n, i1 false) 991 ret void 992} 993 994define amdgpu_kernel void @memcpy_global_align1_global_align1_variable(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n) #0 { 995; OPT-LABEL: @memcpy_global_align1_global_align1_variable( 996; OPT-NEXT: [[TMP2:%.*]] = and i64 [[N:%.*]], 15 997; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]] 998; OPT-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0 999; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] 1000; OPT: loop-memcpy-expansion: 1001; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] 1002; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] 1003; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1 1004; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]] 1005; OPT-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 1 1006; OPT-NEXT: [[TMP8]] = add i64 [[LOOP_INDEX]], 16 1007; OPT-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]] 1008; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] 1009; OPT: loop-memcpy-residual: 1010; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] 1011; OPT-NEXT: [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] 1012; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP10]] 1013; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1 1014; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[TMP10]] 1015; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1 1016; OPT-NEXT: 
[[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1 1017; OPT-NEXT: [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]] 1018; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] 1019; OPT: post-loop-memcpy-expansion: 1020; OPT-NEXT: ret void 1021; OPT: loop-memcpy-residual-header: 1022; OPT-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0 1023; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] 1024; 1025 call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align 1 %src, i64 %n, i1 false) 1026 ret void 1027} 1028 1029define amdgpu_kernel void @memcpy_local_align4_local_align4_variable(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %n) #0 { 1030; OPT-LABEL: @memcpy_local_align4_local_align4_variable( 1031; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 7 1032; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]] 1033; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 1034; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] 1035; OPT: loop-memcpy-expansion: 1036; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] 1037; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]] 1038; OPT-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 4 1039; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]] 1040; OPT-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 4 1041; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8 1042; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]] 1043; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] 1044; OPT: loop-memcpy-residual: 1045; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], 
[[LOOP_MEMCPY_RESIDUAL:%.*]] ] 1046; OPT-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] 1047; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]] 1048; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1 1049; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[TMP10]] 1050; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1 1051; OPT-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1 1052; OPT-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]] 1053; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] 1054; OPT: post-loop-memcpy-expansion: 1055; OPT-NEXT: ret void 1056; OPT: loop-memcpy-residual-header: 1057; OPT-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 1058; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] 1059; 1060 call void @llvm.memcpy.p3.p3.i32(ptr addrspace(3) align 4 %dst, ptr addrspace(3) align 4 %src, i32 %n, i1 false) 1061 ret void 1062} 1063 1064define amdgpu_kernel void @memcpy_local_align2_local_align2_variable(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %n) #0 { 1065; OPT-LABEL: @memcpy_local_align2_local_align2_variable( 1066; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 1 1067; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]] 1068; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 1069; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] 1070; OPT: loop-memcpy-expansion: 1071; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] 1072; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]] 1073; OPT-NEXT: [[TMP6:%.*]] = load i16, ptr addrspace(3) [[TMP5]], align 2 1074; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]] 
1075; OPT-NEXT: store i16 [[TMP6]], ptr addrspace(3) [[TMP7]], align 2 1076; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 2 1077; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]] 1078; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] 1079; OPT: loop-memcpy-residual: 1080; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] 1081; OPT-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] 1082; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]] 1083; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1 1084; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[TMP10]] 1085; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1 1086; OPT-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1 1087; OPT-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]] 1088; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] 1089; OPT: post-loop-memcpy-expansion: 1090; OPT-NEXT: ret void 1091; OPT: loop-memcpy-residual-header: 1092; OPT-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 1093; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] 1094; 1095 call void @llvm.memcpy.p3.p3.i32(ptr addrspace(3) align 2 %dst, ptr addrspace(3) align 2 %src, i32 %n, i1 false) 1096 ret void 1097} 1098 1099define amdgpu_kernel void @memcpy_local_align1_local_align1_variable(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %n) #0 { 1100; OPT-LABEL: @memcpy_local_align1_local_align1_variable( 1101; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 7 1102; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]] 1103; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 1104; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] 
1105; OPT: loop-memcpy-expansion: 1106; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] 1107; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]] 1108; OPT-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 1 1109; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]] 1110; OPT-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 1 1111; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8 1112; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]] 1113; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] 1114; OPT: loop-memcpy-residual: 1115; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] 1116; OPT-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] 1117; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]] 1118; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1 1119; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[TMP10]] 1120; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1 1121; OPT-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1 1122; OPT-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]] 1123; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] 1124; OPT: post-loop-memcpy-expansion: 1125; OPT-NEXT: ret void 1126; OPT: loop-memcpy-residual-header: 1127; OPT-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 1128; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] 1129; 1130 call void @llvm.memcpy.p3.p3.i32(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align 1 %src, i32 %n, i1 false) 1131 ret void 1132} 1133 1134define 
amdgpu_kernel void @memcpy_local_align4_global_align4_variable(ptr addrspace(3) %dst, ptr addrspace(1) %src, i32 %n) #0 { 1135; OPT-LABEL: @memcpy_local_align4_global_align4_variable( 1136; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 7 1137; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]] 1138; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 1139; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] 1140; OPT: loop-memcpy-expansion: 1141; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] 1142; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i32 [[LOOP_INDEX]] 1143; OPT-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(1) [[TMP5]], align 4 1144; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]] 1145; OPT-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 4 1146; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8 1147; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]] 1148; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] 1149; OPT: loop-memcpy-residual: 1150; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] 1151; OPT-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] 1152; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i32 [[TMP10]] 1153; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1 1154; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[TMP10]] 1155; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1 1156; OPT-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1 1157; OPT-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]] 1158; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label 
[[POST_LOOP_MEMCPY_EXPANSION:%.*]] 1159; OPT: post-loop-memcpy-expansion: 1160; OPT-NEXT: ret void 1161; OPT: loop-memcpy-residual-header: 1162; OPT-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 1163; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] 1164; 1165 call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 4 %dst, ptr addrspace(1) align 4 %src, i32 %n, i1 false) 1166 ret void 1167} 1168 1169define amdgpu_kernel void @memcpy_global_align4_local_align4_variable(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %n) #0 { 1170; OPT-LABEL: @memcpy_global_align4_local_align4_variable( 1171; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 7 1172; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]] 1173; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 1174; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] 1175; OPT: loop-memcpy-expansion: 1176; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] 1177; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]] 1178; OPT-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 4 1179; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i32 [[LOOP_INDEX]] 1180; OPT-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4 1181; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8 1182; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]] 1183; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] 1184; OPT: loop-memcpy-residual: 1185; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] 1186; OPT-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] 1187; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]] 
1188; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1 1189; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i32 [[TMP10]] 1190; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1 1191; OPT-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1 1192; OPT-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]] 1193; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] 1194; OPT: post-loop-memcpy-expansion: 1195; OPT-NEXT: ret void 1196; OPT: loop-memcpy-residual-header: 1197; OPT-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 1198; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] 1199; 1200 call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 4 %dst, ptr addrspace(3) align 4 %src, i32 %n, i1 false) 1201 ret void 1202} 1203 1204define amdgpu_kernel void @memcpy_global_align4_global_align4_16(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { 1205; MAX1024-LABEL: @memcpy_global_align4_global_align4_16( 1206; MAX1024-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 [[DST:%.*]], ptr addrspace(1) align 4 [[SRC:%.*]], i64 16, i1 false) 1207; MAX1024-NEXT: ret void 1208; 1209; ALL-LABEL: @memcpy_global_align4_global_align4_16( 1210; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 0 1211; ALL-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4 1212; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 0 1213; ALL-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 1214; ALL-NEXT: ret void 1215; 1216 call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 %dst, ptr addrspace(1) align 4 %src, i64 16, i1 false) 1217 ret void 1218} 1219 1220define amdgpu_kernel void @memcpy_global_align4_global_align4_12(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { 1221; MAX1024-LABEL: 
@memcpy_global_align4_global_align4_12( 1222; MAX1024-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 [[DST:%.*]], ptr addrspace(1) align 4 [[SRC:%.*]], i64 12, i1 false) 1223; MAX1024-NEXT: ret void 1224; 1225; ALL-LABEL: @memcpy_global_align4_global_align4_12( 1226; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 0 1227; ALL-NEXT: [[TMP2:%.*]] = load i64, ptr addrspace(1) [[TMP1]], align 4 1228; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 0 1229; ALL-NEXT: store i64 [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 1230; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 8 1231; ALL-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(1) [[TMP4]], align 4 1232; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 8 1233; ALL-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4 1234; ALL-NEXT: ret void 1235; 1236 call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 %dst, ptr addrspace(1) align 4 %src, i64 12, i1 false) 1237 ret void 1238} 1239 1240define amdgpu_kernel void @memcpy_global_align4_global_align4_8(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { 1241; MAX1024-LABEL: @memcpy_global_align4_global_align4_8( 1242; MAX1024-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 [[DST:%.*]], ptr addrspace(1) align 4 [[SRC:%.*]], i64 8, i1 false) 1243; MAX1024-NEXT: ret void 1244; 1245; ALL-LABEL: @memcpy_global_align4_global_align4_8( 1246; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 0 1247; ALL-NEXT: [[TMP2:%.*]] = load i64, ptr addrspace(1) [[TMP1]], align 4 1248; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 0 1249; ALL-NEXT: store i64 [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 1250; ALL-NEXT: ret void 1251; 1252 call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 %dst, ptr addrspace(1) align 4 %src, 
i64 8, i1 false) 1253 ret void 1254} 1255 1256define amdgpu_kernel void @memcpy_global_align4_global_align4_10(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { 1257; MAX1024-LABEL: @memcpy_global_align4_global_align4_10( 1258; MAX1024-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 [[DST:%.*]], ptr addrspace(1) align 4 [[SRC:%.*]], i64 10, i1 false) 1259; MAX1024-NEXT: ret void 1260; 1261; ALL-LABEL: @memcpy_global_align4_global_align4_10( 1262; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 0 1263; ALL-NEXT: [[TMP2:%.*]] = load i64, ptr addrspace(1) [[TMP1]], align 4 1264; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 0 1265; ALL-NEXT: store i64 [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 1266; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 8 1267; ALL-NEXT: [[TMP5:%.*]] = load i16, ptr addrspace(1) [[TMP4]], align 4 1268; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 8 1269; ALL-NEXT: store i16 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4 1270; ALL-NEXT: ret void 1271; 1272 call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 %dst, ptr addrspace(1) align 4 %src, i64 10, i1 false) 1273 ret void 1274} 1275 1276define amdgpu_kernel void @memcpy_global_align4_global_align4_4(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { 1277; MAX1024-LABEL: @memcpy_global_align4_global_align4_4( 1278; MAX1024-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 [[DST:%.*]], ptr addrspace(1) align 4 [[SRC:%.*]], i64 4, i1 false) 1279; MAX1024-NEXT: ret void 1280; 1281; ALL-LABEL: @memcpy_global_align4_global_align4_4( 1282; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 0 1283; ALL-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) [[TMP1]], align 4 1284; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 0 1285; ALL-NEXT: store 
i32 [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 1286; ALL-NEXT: ret void 1287; 1288 call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 %dst, ptr addrspace(1) align 4 %src, i64 4, i1 false) 1289 ret void 1290} 1291 1292define amdgpu_kernel void @memcpy_global_align4_global_align4_2(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { 1293; MAX1024-LABEL: @memcpy_global_align4_global_align4_2( 1294; MAX1024-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 [[DST:%.*]], ptr addrspace(1) align 4 [[SRC:%.*]], i64 2, i1 false) 1295; MAX1024-NEXT: ret void 1296; 1297; ALL-LABEL: @memcpy_global_align4_global_align4_2( 1298; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 0 1299; ALL-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(1) [[TMP1]], align 4 1300; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 0 1301; ALL-NEXT: store i16 [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 1302; ALL-NEXT: ret void 1303; 1304 call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 %dst, ptr addrspace(1) align 4 %src, i64 2, i1 false) 1305 ret void 1306} 1307 1308define amdgpu_kernel void @memcpy_global_align4_global_align4_1(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { 1309; MAX1024-LABEL: @memcpy_global_align4_global_align4_1( 1310; MAX1024-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 [[DST:%.*]], ptr addrspace(1) align 4 [[SRC:%.*]], i64 1, i1 false) 1311; MAX1024-NEXT: ret void 1312; 1313; ALL-LABEL: @memcpy_global_align4_global_align4_1( 1314; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 0 1315; ALL-NEXT: [[TMP2:%.*]] = load i8, ptr addrspace(1) [[TMP1]], align 4 1316; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 0 1317; ALL-NEXT: store i8 [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 1318; ALL-NEXT: ret void 1319; 1320 call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 %dst, ptr 
addrspace(1) align 4 %src, i64 1, i1 false) 1321 ret void 1322} 1323 1324define amdgpu_kernel void @memmove_flat_align1_global_align1(ptr %dst, ptr addrspace(1) %src) { 1325; MAX1024-LABEL: @memmove_flat_align1_global_align1( 1326; MAX1024-NEXT: call void @llvm.memmove.p0.p1.i64(ptr [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]], i64 256, i1 false) 1327; MAX1024-NEXT: ret void 1328; 1329; ALL-LABEL: @memmove_flat_align1_global_align1( 1330; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DST:%.*]] to ptr addrspace(1) 1331; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[TMP1]] 1332; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]] 1333; ALL: memmove_bwd_loop: 1334; ALL-NEXT: [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ] 1335; ALL-NEXT: [[BWD_INDEX]] = sub i64 [[TMP2]], 256 1336; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]] 1337; ALL-NEXT: [[ELEMENT:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP3]], align 1 1338; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[BWD_INDEX]] 1339; ALL-NEXT: store <64 x i32> [[ELEMENT]], ptr [[TMP4]], align 1 1340; ALL-NEXT: [[TMP5:%.*]] = icmp eq i64 [[BWD_INDEX]], 0 1341; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]] 1342; ALL: memmove_fwd_loop: 1343; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i64 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ] 1344; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]] 1345; ALL-NEXT: [[ELEMENT1:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP6]], align 1 1346; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[FWD_INDEX]] 1347; ALL-NEXT: store <64 x i32> [[ELEMENT1]], ptr [[TMP7]], align 1 1348; ALL-NEXT: [[TMP8]] = add i64 [[FWD_INDEX]], 256 1349; ALL-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 256 1350; ALL-NEXT: br 
i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]] 1351; ALL: memmove_done: 1352; ALL-NEXT: ret void 1353; 1354 call void @llvm.memmove.p0.p1.i64(ptr %dst, ptr addrspace(1) %src, i64 256, i1 false) 1355 ret void 1356} 1357 1358define amdgpu_kernel void @memmove_global_align1_flat_align1(ptr addrspace(1) %dst, ptr %src) { 1359; MAX1024-LABEL: @memmove_global_align1_flat_align1( 1360; MAX1024-NEXT: call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) [[DST:%.*]], ptr [[SRC:%.*]], i64 256, i1 false) 1361; MAX1024-NEXT: ret void 1362; 1363; ALL-LABEL: @memmove_global_align1_flat_align1( 1364; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[DST:%.*]] to ptr 1365; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[SRC:%.*]], [[TMP1]] 1366; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]] 1367; ALL: memmove_bwd_loop: 1368; ALL-NEXT: [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ] 1369; ALL-NEXT: [[BWD_INDEX]] = sub i64 [[TMP2]], 256 1370; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[BWD_INDEX]] 1371; ALL-NEXT: [[ELEMENT:%.*]] = load <64 x i32>, ptr [[TMP3]], align 1 1372; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]] 1373; ALL-NEXT: store <64 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP4]], align 1 1374; ALL-NEXT: [[TMP5:%.*]] = icmp eq i64 [[BWD_INDEX]], 0 1375; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]] 1376; ALL: memmove_fwd_loop: 1377; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i64 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ] 1378; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[FWD_INDEX]] 1379; ALL-NEXT: [[ELEMENT1:%.*]] = load <64 x i32>, ptr [[TMP6]], align 1 1380; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]] 1381; ALL-NEXT: store <64 x i32> [[ELEMENT1]], ptr 
addrspace(1) [[TMP7]], align 1 1382; ALL-NEXT: [[TMP8]] = add i64 [[FWD_INDEX]], 256 1383; ALL-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 256 1384; ALL-NEXT: br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]] 1385; ALL: memmove_done: 1386; ALL-NEXT: ret void 1387; 1388 call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) %dst, ptr %src, i64 256, i1 false) 1389 ret void 1390} 1391 1392define amdgpu_kernel void @memmove_flat_align1_private_align1(ptr %dst, ptr addrspace(5) %src) { 1393; MAX1024-LABEL: @memmove_flat_align1_private_align1( 1394; MAX1024-NEXT: call void @llvm.memmove.p0.p5.i64(ptr [[DST:%.*]], ptr addrspace(5) [[SRC:%.*]], i64 256, i1 false) 1395; MAX1024-NEXT: ret void 1396; 1397; ALL-LABEL: @memmove_flat_align1_private_align1( 1398; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DST:%.*]] to ptr addrspace(5) 1399; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(5) [[SRC:%.*]], [[TMP1]] 1400; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]] 1401; ALL: memmove_bwd_loop: 1402; ALL-NEXT: [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ] 1403; ALL-NEXT: [[BWD_INDEX]] = sub i64 [[TMP2]], 256 1404; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i64 [[BWD_INDEX]] 1405; ALL-NEXT: [[ELEMENT:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP3]], align 1 1406; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[BWD_INDEX]] 1407; ALL-NEXT: store <64 x i32> [[ELEMENT]], ptr [[TMP4]], align 1 1408; ALL-NEXT: [[TMP5:%.*]] = icmp eq i64 [[BWD_INDEX]], 0 1409; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]] 1410; ALL: memmove_fwd_loop: 1411; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i64 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ] 1412; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i64 [[FWD_INDEX]] 1413; ALL-NEXT: [[ELEMENT1:%.*]] = load <64 x 
i32>, ptr addrspace(5) [[TMP6]], align 1 1414; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[FWD_INDEX]] 1415; ALL-NEXT: store <64 x i32> [[ELEMENT1]], ptr [[TMP7]], align 1 1416; ALL-NEXT: [[TMP8]] = add i64 [[FWD_INDEX]], 256 1417; ALL-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 256 1418; ALL-NEXT: br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]] 1419; ALL: memmove_done: 1420; ALL-NEXT: ret void 1421; 1422 call void @llvm.memmove.p0.p5.i64(ptr %dst, ptr addrspace(5) %src, i64 256, i1 false) 1423 ret void 1424} 1425 1426define amdgpu_kernel void @memmove_private_align1_flat_align1(ptr addrspace(5) %dst, ptr %src) { 1427; MAX1024-LABEL: @memmove_private_align1_flat_align1( 1428; MAX1024-NEXT: call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) [[DST:%.*]], ptr [[SRC:%.*]], i64 256, i1 false) 1429; MAX1024-NEXT: ret void 1430; 1431; ALL-LABEL: @memmove_private_align1_flat_align1( 1432; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[DST:%.*]] to ptr 1433; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[SRC:%.*]], [[TMP1]] 1434; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]] 1435; ALL: memmove_bwd_loop: 1436; ALL-NEXT: [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ] 1437; ALL-NEXT: [[BWD_INDEX]] = sub i64 [[TMP2]], 256 1438; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[BWD_INDEX]] 1439; ALL-NEXT: [[ELEMENT:%.*]] = load <64 x i32>, ptr [[TMP3]], align 1 1440; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i64 [[BWD_INDEX]] 1441; ALL-NEXT: store <64 x i32> [[ELEMENT]], ptr addrspace(5) [[TMP4]], align 1 1442; ALL-NEXT: [[TMP5:%.*]] = icmp eq i64 [[BWD_INDEX]], 0 1443; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]] 1444; ALL: memmove_fwd_loop: 1445; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i64 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, 
[[TMP0]] ] 1446; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[FWD_INDEX]] 1447; ALL-NEXT: [[ELEMENT1:%.*]] = load <64 x i32>, ptr [[TMP6]], align 1 1448; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i64 [[FWD_INDEX]] 1449; ALL-NEXT: store <64 x i32> [[ELEMENT1]], ptr addrspace(5) [[TMP7]], align 1 1450; ALL-NEXT: [[TMP8]] = add i64 [[FWD_INDEX]], 256 1451; ALL-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 256 1452; ALL-NEXT: br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]] 1453; ALL: memmove_done: 1454; ALL-NEXT: ret void 1455; 1456 call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) %dst, ptr %src, i64 256, i1 false) 1457 ret void 1458} 1459 1460define amdgpu_kernel void @memmove_private_align1_global_align1(ptr addrspace(5) %dst, ptr addrspace(1) %src) { 1461; MAX1024-LABEL: @memmove_private_align1_global_align1( 1462; MAX1024-NEXT: call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]], i64 256, i1 false) 1463; MAX1024-NEXT: ret void 1464; 1465; ALL-LABEL: @memmove_private_align1_global_align1( 1466; ALL-NEXT: br label [[LOAD_STORE_LOOP:%.*]] 1467; ALL: load-store-loop: 1468; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] 1469; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] 1470; ALL-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 1, !alias.scope [[META0:![0-9]+]] 1471; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i64 [[LOOP_INDEX]] 1472; ALL-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 1, !noalias [[META0]] 1473; ALL-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 1474; ALL-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 256 1475; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] 1476; ALL: memcpy-split: 1477; ALL-NEXT: ret void 1478; 1479 call 
void @llvm.memmove.p5.p1.i64(ptr addrspace(5) %dst, ptr addrspace(1) %src, i64 256, i1 false) 1480 ret void 1481} 1482 1483define amdgpu_kernel void @memmove_global_align1_private_align1(ptr addrspace(1) %dst, ptr addrspace(5) %src) { 1484; MAX1024-LABEL: @memmove_global_align1_private_align1( 1485; MAX1024-NEXT: call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) [[DST:%.*]], ptr addrspace(5) [[SRC:%.*]], i64 256, i1 false) 1486; MAX1024-NEXT: ret void 1487; 1488; ALL-LABEL: @memmove_global_align1_private_align1( 1489; ALL-NEXT: br label [[LOAD_STORE_LOOP:%.*]] 1490; ALL: load-store-loop: 1491; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] 1492; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i64 [[LOOP_INDEX]] 1493; ALL-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 1, !alias.scope [[META3:![0-9]+]] 1494; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]] 1495; ALL-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 1, !noalias [[META3]] 1496; ALL-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 1497; ALL-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 256 1498; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] 1499; ALL: memcpy-split: 1500; ALL-NEXT: ret void 1501; 1502 call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) %dst, ptr addrspace(5) %src, i64 256, i1 false) 1503 ret void 1504} 1505 1506define amdgpu_kernel void @memmove_global_align1_p999_align1(ptr addrspace(1) %dst, ptr addrspace(999) %src, i64 %size) { 1507; OPT-LABEL: @memmove_global_align1_p999_align1( 1508; OPT-NEXT: [[TMP2:%.*]] = and i64 [[SIZE:%.*]], 15 1509; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[SIZE]], [[TMP2]] 1510; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i64 [[TMP2]], 0 1511; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i64 [[TMP3]], 0 1512; OPT-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) 
[[DST:%.*]] to ptr addrspace(999) 1513; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(999) [[SRC:%.*]], [[TMP4]] 1514; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]] 1515; OPT: memmove_copy_backwards: 1516; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_BWD_MIDDLE:%.*]], label [[MEMMOVE_BWD_RESIDUAL_LOOP:%.*]] 1517; OPT: memmove_bwd_residual_loop: 1518; OPT-NEXT: [[TMP5:%.*]] = phi i64 [ [[BWD_RESIDUAL_INDEX:%.*]], [[MEMMOVE_BWD_RESIDUAL_LOOP]] ], [ [[SIZE]], [[MEMMOVE_COPY_BACKWARDS]] ] 1519; OPT-NEXT: [[BWD_RESIDUAL_INDEX]] = sub i64 [[TMP5]], 1 1520; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[SRC]], i64 [[BWD_RESIDUAL_INDEX]] 1521; OPT-NEXT: [[ELEMENT:%.*]] = load i8, ptr addrspace(999) [[TMP6]], align 1 1522; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_RESIDUAL_INDEX]] 1523; OPT-NEXT: store i8 [[ELEMENT]], ptr addrspace(1) [[TMP7]], align 1 1524; OPT-NEXT: [[TMP8:%.*]] = icmp eq i64 [[BWD_RESIDUAL_INDEX]], [[TMP3]] 1525; OPT-NEXT: br i1 [[TMP8]], label [[MEMMOVE_BWD_MIDDLE]], label [[MEMMOVE_BWD_RESIDUAL_LOOP]] 1526; OPT: memmove_bwd_middle: 1527; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]] 1528; OPT: memmove_bwd_main_loop: 1529; OPT-NEXT: [[TMP9:%.*]] = phi i64 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP3]], [[MEMMOVE_BWD_MIDDLE]] ] 1530; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i64 [[TMP9]], 16 1531; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[SRC]], i64 [[BWD_MAIN_INDEX]] 1532; OPT-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(999) [[TMP10]], align 1 1533; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_MAIN_INDEX]] 1534; OPT-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP11]], align 1 1535; OPT-NEXT: [[TMP12:%.*]] = icmp eq i64 [[BWD_MAIN_INDEX]], 0 
1536; OPT-NEXT: br i1 [[TMP12]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]] 1537; OPT: memmove_copy_forward: 1538; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]] 1539; OPT: memmove_fwd_main_loop: 1540; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i64 [ [[TMP15:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ] 1541; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[SRC]], i64 [[FWD_MAIN_INDEX]] 1542; OPT-NEXT: [[ELEMENT2:%.*]] = load <4 x i32>, ptr addrspace(999) [[TMP13]], align 1 1543; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_MAIN_INDEX]] 1544; OPT-NEXT: store <4 x i32> [[ELEMENT2]], ptr addrspace(1) [[TMP14]], align 1 1545; OPT-NEXT: [[TMP15]] = add i64 [[FWD_MAIN_INDEX]], 16 1546; OPT-NEXT: [[TMP16:%.*]] = icmp eq i64 [[TMP15]], [[TMP3]] 1547; OPT-NEXT: br i1 [[TMP16]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]] 1548; OPT: memmove_fwd_middle: 1549; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]] 1550; OPT: memmove_fwd_residual_loop: 1551; OPT-NEXT: [[FWD_RESIDUAL_INDEX:%.*]] = phi i64 [ [[TMP19:%.*]], [[MEMMOVE_FWD_RESIDUAL_LOOP]] ], [ [[TMP3]], [[MEMMOVE_FWD_MIDDLE]] ] 1552; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[SRC]], i64 [[FWD_RESIDUAL_INDEX]] 1553; OPT-NEXT: [[ELEMENT3:%.*]] = load i8, ptr addrspace(999) [[TMP17]], align 1 1554; OPT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_RESIDUAL_INDEX]] 1555; OPT-NEXT: store i8 [[ELEMENT3]], ptr addrspace(1) [[TMP18]], align 1 1556; OPT-NEXT: [[TMP19]] = add i64 [[FWD_RESIDUAL_INDEX]], 1 1557; OPT-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], [[SIZE]] 1558; OPT-NEXT: br i1 [[TMP20]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP]] 1559; OPT: memmove_done: 1560; OPT-NEXT: ret void 1561; 1562 call void 
@llvm.memmove.p1.p999.i64(ptr addrspace(1) %dst, ptr addrspace(999) %src, i64 %size, i1 false) 1563 ret void 1564} 1565 1566define amdgpu_kernel void @memmove_p999_align1_p1_align1(ptr addrspace(999) %dst, ptr addrspace(1) %src, i64 %size) { 1567; OPT-LABEL: @memmove_p999_align1_p1_align1( 1568; OPT-NEXT: [[TMP2:%.*]] = and i64 [[SIZE:%.*]], 15 1569; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[SIZE]], [[TMP2]] 1570; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i64 [[TMP2]], 0 1571; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i64 [[TMP3]], 0 1572; OPT-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(999) [[DST:%.*]] to ptr addrspace(1) 1573; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[TMP4]] 1574; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]] 1575; OPT: memmove_copy_backwards: 1576; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_BWD_MIDDLE:%.*]], label [[MEMMOVE_BWD_RESIDUAL_LOOP:%.*]] 1577; OPT: memmove_bwd_residual_loop: 1578; OPT-NEXT: [[TMP5:%.*]] = phi i64 [ [[BWD_RESIDUAL_INDEX:%.*]], [[MEMMOVE_BWD_RESIDUAL_LOOP]] ], [ [[SIZE]], [[MEMMOVE_COPY_BACKWARDS]] ] 1579; OPT-NEXT: [[BWD_RESIDUAL_INDEX]] = sub i64 [[TMP5]], 1 1580; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_RESIDUAL_INDEX]] 1581; OPT-NEXT: [[ELEMENT:%.*]] = load i8, ptr addrspace(1) [[TMP6]], align 1 1582; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[DST]], i64 [[BWD_RESIDUAL_INDEX]] 1583; OPT-NEXT: store i8 [[ELEMENT]], ptr addrspace(999) [[TMP7]], align 1 1584; OPT-NEXT: [[TMP8:%.*]] = icmp eq i64 [[BWD_RESIDUAL_INDEX]], [[TMP3]] 1585; OPT-NEXT: br i1 [[TMP8]], label [[MEMMOVE_BWD_MIDDLE]], label [[MEMMOVE_BWD_RESIDUAL_LOOP]] 1586; OPT: memmove_bwd_middle: 1587; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]] 1588; OPT: memmove_bwd_main_loop: 1589; OPT-NEXT: [[TMP9:%.*]] = phi i64 [ 
[[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP3]], [[MEMMOVE_BWD_MIDDLE]] ] 1590; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i64 [[TMP9]], 16 1591; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_MAIN_INDEX]] 1592; OPT-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP10]], align 1 1593; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[DST]], i64 [[BWD_MAIN_INDEX]] 1594; OPT-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(999) [[TMP11]], align 1 1595; OPT-NEXT: [[TMP12:%.*]] = icmp eq i64 [[BWD_MAIN_INDEX]], 0 1596; OPT-NEXT: br i1 [[TMP12]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]] 1597; OPT: memmove_copy_forward: 1598; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]] 1599; OPT: memmove_fwd_main_loop: 1600; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i64 [ [[TMP15:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ] 1601; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_MAIN_INDEX]] 1602; OPT-NEXT: [[ELEMENT2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP13]], align 1 1603; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[DST]], i64 [[FWD_MAIN_INDEX]] 1604; OPT-NEXT: store <4 x i32> [[ELEMENT2]], ptr addrspace(999) [[TMP14]], align 1 1605; OPT-NEXT: [[TMP15]] = add i64 [[FWD_MAIN_INDEX]], 16 1606; OPT-NEXT: [[TMP16:%.*]] = icmp eq i64 [[TMP15]], [[TMP3]] 1607; OPT-NEXT: br i1 [[TMP16]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]] 1608; OPT: memmove_fwd_middle: 1609; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]] 1610; OPT: memmove_fwd_residual_loop: 1611; OPT-NEXT: [[FWD_RESIDUAL_INDEX:%.*]] = phi i64 [ [[TMP19:%.*]], [[MEMMOVE_FWD_RESIDUAL_LOOP]] ], [ [[TMP3]], [[MEMMOVE_FWD_MIDDLE]] ] 1612; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], 
i64 [[FWD_RESIDUAL_INDEX]] 1613; OPT-NEXT: [[ELEMENT3:%.*]] = load i8, ptr addrspace(1) [[TMP17]], align 1 1614; OPT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[DST]], i64 [[FWD_RESIDUAL_INDEX]] 1615; OPT-NEXT: store i8 [[ELEMENT3]], ptr addrspace(999) [[TMP18]], align 1 1616; OPT-NEXT: [[TMP19]] = add i64 [[FWD_RESIDUAL_INDEX]], 1 1617; OPT-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], [[SIZE]] 1618; OPT-NEXT: br i1 [[TMP20]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP]] 1619; OPT: memmove_done: 1620; OPT-NEXT: ret void 1621; 1622 call void @llvm.memmove.p999.p1.i64(ptr addrspace(999) %dst, ptr addrspace(1) %src, i64 %size, i1 false) 1623 ret void 1624} 1625 1626define amdgpu_kernel void @memmove_p999_align1_p998_align1(ptr addrspace(999) %dst, ptr addrspace(998) %src, i64 %size) { 1627; OPT-LABEL: @memmove_p999_align1_p998_align1( 1628; OPT-NEXT: [[TMP2:%.*]] = and i64 [[SIZE:%.*]], 15 1629; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[SIZE]], [[TMP2]] 1630; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i64 [[TMP2]], 0 1631; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i64 [[TMP3]], 0 1632; OPT-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(999) [[DST:%.*]] to ptr addrspace(998) 1633; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(998) [[SRC:%.*]], [[TMP4]] 1634; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]] 1635; OPT: memmove_copy_backwards: 1636; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_BWD_MIDDLE:%.*]], label [[MEMMOVE_BWD_RESIDUAL_LOOP:%.*]] 1637; OPT: memmove_bwd_residual_loop: 1638; OPT-NEXT: [[TMP5:%.*]] = phi i64 [ [[BWD_RESIDUAL_INDEX:%.*]], [[MEMMOVE_BWD_RESIDUAL_LOOP]] ], [ [[SIZE]], [[MEMMOVE_COPY_BACKWARDS]] ] 1639; OPT-NEXT: [[BWD_RESIDUAL_INDEX]] = sub i64 [[TMP5]], 1 1640; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(998) [[SRC]], i64 [[BWD_RESIDUAL_INDEX]] 1641; OPT-NEXT: [[ELEMENT:%.*]] = load i8, ptr addrspace(998) 
[[TMP6]], align 1 1642; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[DST]], i64 [[BWD_RESIDUAL_INDEX]] 1643; OPT-NEXT: store i8 [[ELEMENT]], ptr addrspace(999) [[TMP7]], align 1 1644; OPT-NEXT: [[TMP8:%.*]] = icmp eq i64 [[BWD_RESIDUAL_INDEX]], [[TMP3]] 1645; OPT-NEXT: br i1 [[TMP8]], label [[MEMMOVE_BWD_MIDDLE]], label [[MEMMOVE_BWD_RESIDUAL_LOOP]] 1646; OPT: memmove_bwd_middle: 1647; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]] 1648; OPT: memmove_bwd_main_loop: 1649; OPT-NEXT: [[TMP9:%.*]] = phi i64 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP3]], [[MEMMOVE_BWD_MIDDLE]] ] 1650; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i64 [[TMP9]], 16 1651; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(998) [[SRC]], i64 [[BWD_MAIN_INDEX]] 1652; OPT-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(998) [[TMP10]], align 1 1653; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[DST]], i64 [[BWD_MAIN_INDEX]] 1654; OPT-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(999) [[TMP11]], align 1 1655; OPT-NEXT: [[TMP12:%.*]] = icmp eq i64 [[BWD_MAIN_INDEX]], 0 1656; OPT-NEXT: br i1 [[TMP12]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]] 1657; OPT: memmove_copy_forward: 1658; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]] 1659; OPT: memmove_fwd_main_loop: 1660; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i64 [ [[TMP15:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ] 1661; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(998) [[SRC]], i64 [[FWD_MAIN_INDEX]] 1662; OPT-NEXT: [[ELEMENT2:%.*]] = load <4 x i32>, ptr addrspace(998) [[TMP13]], align 1 1663; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[DST]], i64 [[FWD_MAIN_INDEX]] 1664; OPT-NEXT: store <4 x i32> [[ELEMENT2]], ptr addrspace(999) [[TMP14]], align 1 1665; OPT-NEXT: 
[[TMP15]] = add i64 [[FWD_MAIN_INDEX]], 16 1666; OPT-NEXT: [[TMP16:%.*]] = icmp eq i64 [[TMP15]], [[TMP3]] 1667; OPT-NEXT: br i1 [[TMP16]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]] 1668; OPT: memmove_fwd_middle: 1669; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]] 1670; OPT: memmove_fwd_residual_loop: 1671; OPT-NEXT: [[FWD_RESIDUAL_INDEX:%.*]] = phi i64 [ [[TMP19:%.*]], [[MEMMOVE_FWD_RESIDUAL_LOOP]] ], [ [[TMP3]], [[MEMMOVE_FWD_MIDDLE]] ] 1672; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(998) [[SRC]], i64 [[FWD_RESIDUAL_INDEX]] 1673; OPT-NEXT: [[ELEMENT3:%.*]] = load i8, ptr addrspace(998) [[TMP17]], align 1 1674; OPT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[DST]], i64 [[FWD_RESIDUAL_INDEX]] 1675; OPT-NEXT: store i8 [[ELEMENT3]], ptr addrspace(999) [[TMP18]], align 1 1676; OPT-NEXT: [[TMP19]] = add i64 [[FWD_RESIDUAL_INDEX]], 1 1677; OPT-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], [[SIZE]] 1678; OPT-NEXT: br i1 [[TMP20]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP]] 1679; OPT: memmove_done: 1680; OPT-NEXT: ret void 1681; 1682 call void @llvm.memmove.p999.p998.i64(ptr addrspace(999) %dst, ptr addrspace(998) %src, i64 %size, i1 false) 1683 ret void 1684} 1685 1686define amdgpu_kernel void @memmove_local_align1_private_align1(ptr addrspace(3) %dst, ptr addrspace(5) %src) { 1687; MAX1024-LABEL: @memmove_local_align1_private_align1( 1688; MAX1024-NEXT: call void @llvm.memmove.p3.p5.i32(ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[SRC:%.*]], i32 256, i1 false) 1689; MAX1024-NEXT: ret void 1690; 1691; ALL-LABEL: @memmove_local_align1_private_align1( 1692; ALL-NEXT: br label [[LOAD_STORE_LOOP:%.*]] 1693; ALL: load-store-loop: 1694; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] 1695; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 
[[LOOP_INDEX]] 1696; ALL-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP1]], align 1, !alias.scope [[META6:![0-9]+]] 1697; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]] 1698; ALL-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(3) [[TMP3]], align 1, !noalias [[META6]] 1699; ALL-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 8 1700; ALL-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 256 1701; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] 1702; ALL: memcpy-split: 1703; ALL-NEXT: ret void 1704; 1705 call void @llvm.memmove.p3.p5.i32(ptr addrspace(3) %dst, ptr addrspace(5) %src, i32 256, i1 false) 1706 ret void 1707} 1708 1709define amdgpu_kernel void @memmove_local_align1_private_align1_unknown_size(ptr addrspace(3) %dst, ptr addrspace(5) %src, i32 %size) { 1710; MAX1024-LABEL: @memmove_local_align1_private_align1_unknown_size( 1711; MAX1024-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7 1712; MAX1024-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]] 1713; MAX1024-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 1714; MAX1024-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] 1715; MAX1024: loop-memcpy-expansion: 1716; MAX1024-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] 1717; MAX1024-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]] 1718; MAX1024-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP7]], align 1, !alias.scope [[META0:![0-9]+]] 1719; MAX1024-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]] 1720; MAX1024-NEXT: store <2 x i32> [[TMP5]], ptr addrspace(3) [[TMP6]], align 1, !noalias [[META0]] 1721; MAX1024-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8 1722; MAX1024-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]] 1723; MAX1024-NEXT: br i1 [[TMP9]], label 
[[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] 1724; MAX1024: loop-memcpy-residual: 1725; MAX1024-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] 1726; MAX1024-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] 1727; MAX1024-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[TMP10]] 1728; MAX1024-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(5) [[TMP11]], align 1, !alias.scope [[META0]] 1729; MAX1024-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[TMP10]] 1730; MAX1024-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1, !noalias [[META0]] 1731; MAX1024-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1 1732; MAX1024-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]] 1733; MAX1024-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] 1734; MAX1024: post-loop-memcpy-expansion: 1735; MAX1024-NEXT: ret void 1736; MAX1024: loop-memcpy-residual-header: 1737; MAX1024-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 1738; MAX1024-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] 1739; 1740; ALL-LABEL: @memmove_local_align1_private_align1_unknown_size( 1741; ALL-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7 1742; ALL-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]] 1743; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 1744; ALL-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] 1745; ALL: loop-memcpy-expansion: 1746; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] 1747; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]] 1748; ALL-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP7]], align 1, !alias.scope [[META9:![0-9]+]] 1749; ALL-NEXT: 
[[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]] 1750; ALL-NEXT: store <2 x i32> [[TMP5]], ptr addrspace(3) [[TMP6]], align 1, !noalias [[META9]] 1751; ALL-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8 1752; ALL-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]] 1753; ALL-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] 1754; ALL: loop-memcpy-residual: 1755; ALL-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] 1756; ALL-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] 1757; ALL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[TMP10]] 1758; ALL-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(5) [[TMP11]], align 1, !alias.scope [[META9]] 1759; ALL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[TMP10]] 1760; ALL-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1, !noalias [[META9]] 1761; ALL-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1 1762; ALL-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]] 1763; ALL-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] 1764; ALL: post-loop-memcpy-expansion: 1765; ALL-NEXT: ret void 1766; ALL: loop-memcpy-residual-header: 1767; ALL-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 1768; ALL-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] 1769; 1770 call void @llvm.memmove.p3.p5.i32(ptr addrspace(3) %dst, ptr addrspace(5) %src, i32 %size, i1 false) 1771 ret void 1772} 1773 1774define amdgpu_kernel void @memmove_private_align1_local_align1(ptr addrspace(5) %dst, ptr addrspace(3) %src) { 1775; MAX1024-LABEL: @memmove_private_align1_local_align1( 1776; MAX1024-NEXT: call void @llvm.memmove.p5.p3.i32(ptr addrspace(5) [[DST:%.*]], ptr addrspace(3) [[SRC:%.*]], i32 256, i1 false) 1777; 
MAX1024-NEXT: ret void 1778; 1779; ALL-LABEL: @memmove_private_align1_local_align1( 1780; ALL-NEXT: br label [[LOAD_STORE_LOOP:%.*]] 1781; ALL: load-store-loop: 1782; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] 1783; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]] 1784; ALL-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP1]], align 1, !alias.scope [[META12:![0-9]+]] 1785; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]] 1786; ALL-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 1, !noalias [[META12]] 1787; ALL-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 8 1788; ALL-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 256 1789; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] 1790; ALL: memcpy-split: 1791; ALL-NEXT: ret void 1792; 1793 call void @llvm.memmove.p5.p3.i32(ptr addrspace(5) %dst, ptr addrspace(3) %src, i32 256, i1 false) 1794 ret void 1795} 1796 1797define amdgpu_kernel void @memmove_private_align1_local_align1_unknown_size(ptr addrspace(5) %dst, ptr addrspace(3) %src, i32 %size) { 1798; MAX1024-LABEL: @memmove_private_align1_local_align1_unknown_size( 1799; MAX1024-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7 1800; MAX1024-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]] 1801; MAX1024-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 1802; MAX1024-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] 1803; MAX1024: loop-memcpy-expansion: 1804; MAX1024-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] 1805; MAX1024-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]] 1806; MAX1024-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP7]], align 1, !alias.scope [[META3:![0-9]+]] 1807; MAX1024-NEXT: [[TMP6:%.*]] = 
getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]] 1808; MAX1024-NEXT: store <2 x i32> [[TMP5]], ptr addrspace(5) [[TMP6]], align 1, !noalias [[META3]] 1809; MAX1024-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8 1810; MAX1024-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]] 1811; MAX1024-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] 1812; MAX1024: loop-memcpy-residual: 1813; MAX1024-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] 1814; MAX1024-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] 1815; MAX1024-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]] 1816; MAX1024-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1, !alias.scope [[META3]] 1817; MAX1024-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 [[TMP10]] 1818; MAX1024-NEXT: store i8 [[TMP12]], ptr addrspace(5) [[TMP13]], align 1, !noalias [[META3]] 1819; MAX1024-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1 1820; MAX1024-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]] 1821; MAX1024-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] 1822; MAX1024: post-loop-memcpy-expansion: 1823; MAX1024-NEXT: ret void 1824; MAX1024: loop-memcpy-residual-header: 1825; MAX1024-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 1826; MAX1024-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] 1827; 1828; ALL-LABEL: @memmove_private_align1_local_align1_unknown_size( 1829; ALL-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7 1830; ALL-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]] 1831; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 1832; ALL-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] 1833; ALL: loop-memcpy-expansion: 1834; 
ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] 1835; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]] 1836; ALL-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP7]], align 1, !alias.scope [[META15:![0-9]+]] 1837; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]] 1838; ALL-NEXT: store <2 x i32> [[TMP5]], ptr addrspace(5) [[TMP6]], align 1, !noalias [[META15]] 1839; ALL-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8 1840; ALL-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]] 1841; ALL-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] 1842; ALL: loop-memcpy-residual: 1843; ALL-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] 1844; ALL-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] 1845; ALL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]] 1846; ALL-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1, !alias.scope [[META15]] 1847; ALL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 [[TMP10]] 1848; ALL-NEXT: store i8 [[TMP12]], ptr addrspace(5) [[TMP13]], align 1, !noalias [[META15]] 1849; ALL-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1 1850; ALL-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]] 1851; ALL-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] 1852; ALL: post-loop-memcpy-expansion: 1853; ALL-NEXT: ret void 1854; ALL: loop-memcpy-residual-header: 1855; ALL-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 1856; ALL-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] 1857; 1858 call void @llvm.memmove.p5.p3.i32(ptr addrspace(5) %dst, ptr addrspace(3) %src, i32 %size, i1 false) 
1859 ret void 1860} 1861 1862 1863define amdgpu_kernel void @memmove_flat_align1_local_align1(ptr addrspace(0) %dst, ptr addrspace(3) %src) { 1864; MAX1024-LABEL: @memmove_flat_align1_local_align1( 1865; MAX1024-NEXT: call void @llvm.memmove.p0.p3.i32(ptr [[DST:%.*]], ptr addrspace(3) [[SRC:%.*]], i32 256, i1 false) 1866; MAX1024-NEXT: ret void 1867; 1868; ALL-LABEL: @memmove_flat_align1_local_align1( 1869; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DST:%.*]] to ptr addrspace(3) 1870; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(3) [[SRC:%.*]], [[TMP1]] 1871; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]] 1872; ALL: memmove_bwd_loop: 1873; ALL-NEXT: [[TMP2:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ] 1874; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP2]], 8 1875; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_INDEX]] 1876; ALL-NEXT: [[ELEMENT:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP3]], align 1 1877; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[BWD_INDEX]] 1878; ALL-NEXT: store <2 x i32> [[ELEMENT]], ptr [[TMP4]], align 1 1879; ALL-NEXT: [[TMP5:%.*]] = icmp eq i32 [[BWD_INDEX]], 0 1880; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]] 1881; ALL: memmove_fwd_loop: 1882; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i32 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ] 1883; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[FWD_INDEX]] 1884; ALL-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP6]], align 1 1885; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[FWD_INDEX]] 1886; ALL-NEXT: store <2 x i32> [[ELEMENT1]], ptr [[TMP7]], align 1 1887; ALL-NEXT: [[TMP8]] = add i32 [[FWD_INDEX]], 8 1888; ALL-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 256 1889; ALL-NEXT: br i1 [[TMP9]], label [[MEMMOVE_DONE]], 
label [[MEMMOVE_FWD_LOOP]] 1890; ALL: memmove_done: 1891; ALL-NEXT: ret void 1892; 1893 call void @llvm.memmove.p0.p3.i32(ptr addrspace(0) %dst, ptr addrspace(3) %src, i32 256, i1 false) 1894 ret void 1895} 1896 1897define amdgpu_kernel void @memmove_flat_align1_local_align1_unknown_size(ptr addrspace(0) %dst, ptr addrspace(3) %src, i32 %size) { 1898; OPT-LABEL: @memmove_flat_align1_local_align1_unknown_size( 1899; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7 1900; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]] 1901; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0 1902; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP3]], 0 1903; OPT-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DST:%.*]] to ptr addrspace(3) 1904; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(3) [[SRC:%.*]], [[TMP4]] 1905; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]] 1906; OPT: memmove_copy_backwards: 1907; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_BWD_MIDDLE:%.*]], label [[MEMMOVE_BWD_RESIDUAL_LOOP:%.*]] 1908; OPT: memmove_bwd_residual_loop: 1909; OPT-NEXT: [[TMP5:%.*]] = phi i32 [ [[BWD_RESIDUAL_INDEX:%.*]], [[MEMMOVE_BWD_RESIDUAL_LOOP]] ], [ [[SIZE]], [[MEMMOVE_COPY_BACKWARDS]] ] 1910; OPT-NEXT: [[BWD_RESIDUAL_INDEX]] = sub i32 [[TMP5]], 1 1911; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_RESIDUAL_INDEX]] 1912; OPT-NEXT: [[ELEMENT:%.*]] = load i8, ptr addrspace(3) [[TMP6]], align 1 1913; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[BWD_RESIDUAL_INDEX]] 1914; OPT-NEXT: store i8 [[ELEMENT]], ptr [[TMP7]], align 1 1915; OPT-NEXT: [[TMP8:%.*]] = icmp eq i32 [[BWD_RESIDUAL_INDEX]], [[TMP3]] 1916; OPT-NEXT: br i1 [[TMP8]], label [[MEMMOVE_BWD_MIDDLE]], label [[MEMMOVE_BWD_RESIDUAL_LOOP]] 1917; OPT: memmove_bwd_middle: 1918; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]] 1919; OPT: 
memmove_bwd_main_loop: 1920; OPT-NEXT: [[TMP9:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP3]], [[MEMMOVE_BWD_MIDDLE]] ] 1921; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP9]], 8 1922; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_MAIN_INDEX]] 1923; OPT-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP10]], align 1 1924; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[BWD_MAIN_INDEX]] 1925; OPT-NEXT: store <2 x i32> [[ELEMENT1]], ptr [[TMP11]], align 1 1926; OPT-NEXT: [[TMP12:%.*]] = icmp eq i32 [[BWD_MAIN_INDEX]], 0 1927; OPT-NEXT: br i1 [[TMP12]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]] 1928; OPT: memmove_copy_forward: 1929; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]] 1930; OPT: memmove_fwd_main_loop: 1931; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i32 [ [[TMP15:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ] 1932; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[FWD_MAIN_INDEX]] 1933; OPT-NEXT: [[ELEMENT2:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP13]], align 1 1934; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[FWD_MAIN_INDEX]] 1935; OPT-NEXT: store <2 x i32> [[ELEMENT2]], ptr [[TMP14]], align 1 1936; OPT-NEXT: [[TMP15]] = add i32 [[FWD_MAIN_INDEX]], 8 1937; OPT-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], [[TMP3]] 1938; OPT-NEXT: br i1 [[TMP16]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]] 1939; OPT: memmove_fwd_middle: 1940; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]] 1941; OPT: memmove_fwd_residual_loop: 1942; OPT-NEXT: [[FWD_RESIDUAL_INDEX:%.*]] = phi i32 [ [[TMP19:%.*]], [[MEMMOVE_FWD_RESIDUAL_LOOP]] ], [ [[TMP3]], [[MEMMOVE_FWD_MIDDLE]] ] 1943; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(3) 
[[SRC]], i32 [[FWD_RESIDUAL_INDEX]] 1944; OPT-NEXT: [[ELEMENT3:%.*]] = load i8, ptr addrspace(3) [[TMP17]], align 1 1945; OPT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[FWD_RESIDUAL_INDEX]] 1946; OPT-NEXT: store i8 [[ELEMENT3]], ptr [[TMP18]], align 1 1947; OPT-NEXT: [[TMP19]] = add i32 [[FWD_RESIDUAL_INDEX]], 1 1948; OPT-NEXT: [[TMP20:%.*]] = icmp eq i32 [[TMP19]], [[SIZE]] 1949; OPT-NEXT: br i1 [[TMP20]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP]] 1950; OPT: memmove_done: 1951; OPT-NEXT: ret void 1952; 1953 call void @llvm.memmove.p0.p3.i32(ptr addrspace(0) %dst, ptr addrspace(3) %src, i32 %size, i1 false) 1954 ret void 1955} 1956 1957define amdgpu_kernel void @memmove_local_align1_flat_align1(ptr addrspace(3) %dst, ptr addrspace(0) %src) { 1958; MAX1024-LABEL: @memmove_local_align1_flat_align1( 1959; MAX1024-NEXT: call void @llvm.memmove.p3.p0.i32(ptr addrspace(3) [[DST:%.*]], ptr [[SRC:%.*]], i32 256, i1 false) 1960; MAX1024-NEXT: ret void 1961; 1962; ALL-LABEL: @memmove_local_align1_flat_align1( 1963; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(3) [[DST:%.*]] to ptr 1964; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[SRC:%.*]], [[TMP1]] 1965; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]] 1966; ALL: memmove_bwd_loop: 1967; ALL-NEXT: [[TMP2:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ] 1968; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP2]], 8 1969; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[BWD_INDEX]] 1970; ALL-NEXT: [[ELEMENT:%.*]] = load <2 x i32>, ptr [[TMP3]], align 1 1971; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_INDEX]] 1972; ALL-NEXT: store <2 x i32> [[ELEMENT]], ptr addrspace(3) [[TMP4]], align 1 1973; ALL-NEXT: [[TMP5:%.*]] = icmp eq i32 [[BWD_INDEX]], 0 1974; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]] 
1975; ALL: memmove_fwd_loop: 1976; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i32 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ] 1977; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[FWD_INDEX]] 1978; ALL-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr [[TMP6]], align 1 1979; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_INDEX]] 1980; ALL-NEXT: store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP7]], align 1 1981; ALL-NEXT: [[TMP8]] = add i32 [[FWD_INDEX]], 8 1982; ALL-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 256 1983; ALL-NEXT: br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]] 1984; ALL: memmove_done: 1985; ALL-NEXT: ret void 1986; 1987 call void @llvm.memmove.p3.p0.i32(ptr addrspace(3) %dst, ptr addrspace(0) %src, i32 256, i1 false) 1988 ret void 1989} 1990 1991define amdgpu_kernel void @memmove_local_align1_flat_align1_unknown_size(ptr addrspace(3) %dst, ptr addrspace(0) %src, i32 %size) { 1992; OPT-LABEL: @memmove_local_align1_flat_align1_unknown_size( 1993; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7 1994; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]] 1995; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0 1996; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP3]], 0 1997; OPT-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(3) [[DST:%.*]] to ptr 1998; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[SRC:%.*]], [[TMP4]] 1999; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]] 2000; OPT: memmove_copy_backwards: 2001; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_BWD_MIDDLE:%.*]], label [[MEMMOVE_BWD_RESIDUAL_LOOP:%.*]] 2002; OPT: memmove_bwd_residual_loop: 2003; OPT-NEXT: [[TMP5:%.*]] = phi i32 [ [[BWD_RESIDUAL_INDEX:%.*]], [[MEMMOVE_BWD_RESIDUAL_LOOP]] ], [ [[SIZE]], [[MEMMOVE_COPY_BACKWARDS]] ] 2004; OPT-NEXT: [[BWD_RESIDUAL_INDEX]] = sub i32 [[TMP5]], 1 2005; OPT-NEXT: [[TMP6:%.*]] = getelementptr 
inbounds i8, ptr [[SRC]], i32 [[BWD_RESIDUAL_INDEX]] 2006; OPT-NEXT: [[ELEMENT:%.*]] = load i8, ptr [[TMP6]], align 1 2007; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_RESIDUAL_INDEX]] 2008; OPT-NEXT: store i8 [[ELEMENT]], ptr addrspace(3) [[TMP7]], align 1 2009; OPT-NEXT: [[TMP8:%.*]] = icmp eq i32 [[BWD_RESIDUAL_INDEX]], [[TMP3]] 2010; OPT-NEXT: br i1 [[TMP8]], label [[MEMMOVE_BWD_MIDDLE]], label [[MEMMOVE_BWD_RESIDUAL_LOOP]] 2011; OPT: memmove_bwd_middle: 2012; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]] 2013; OPT: memmove_bwd_main_loop: 2014; OPT-NEXT: [[TMP9:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP3]], [[MEMMOVE_BWD_MIDDLE]] ] 2015; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP9]], 8 2016; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[BWD_MAIN_INDEX]] 2017; OPT-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr [[TMP10]], align 1 2018; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_MAIN_INDEX]] 2019; OPT-NEXT: store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP11]], align 1 2020; OPT-NEXT: [[TMP12:%.*]] = icmp eq i32 [[BWD_MAIN_INDEX]], 0 2021; OPT-NEXT: br i1 [[TMP12]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]] 2022; OPT: memmove_copy_forward: 2023; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]] 2024; OPT: memmove_fwd_main_loop: 2025; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i32 [ [[TMP15:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ] 2026; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[FWD_MAIN_INDEX]] 2027; OPT-NEXT: [[ELEMENT2:%.*]] = load <2 x i32>, ptr [[TMP13]], align 1 2028; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_MAIN_INDEX]] 2029; OPT-NEXT: store <2 x i32> [[ELEMENT2]], ptr addrspace(3) [[TMP14]], 
align 1 2030; OPT-NEXT: [[TMP15]] = add i32 [[FWD_MAIN_INDEX]], 8 2031; OPT-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], [[TMP3]] 2032; OPT-NEXT: br i1 [[TMP16]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]] 2033; OPT: memmove_fwd_middle: 2034; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]] 2035; OPT: memmove_fwd_residual_loop: 2036; OPT-NEXT: [[FWD_RESIDUAL_INDEX:%.*]] = phi i32 [ [[TMP19:%.*]], [[MEMMOVE_FWD_RESIDUAL_LOOP]] ], [ [[TMP3]], [[MEMMOVE_FWD_MIDDLE]] ] 2037; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[FWD_RESIDUAL_INDEX]] 2038; OPT-NEXT: [[ELEMENT3:%.*]] = load i8, ptr [[TMP17]], align 1 2039; OPT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_RESIDUAL_INDEX]] 2040; OPT-NEXT: store i8 [[ELEMENT3]], ptr addrspace(3) [[TMP18]], align 1 2041; OPT-NEXT: [[TMP19]] = add i32 [[FWD_RESIDUAL_INDEX]], 1 2042; OPT-NEXT: [[TMP20:%.*]] = icmp eq i32 [[TMP19]], [[SIZE]] 2043; OPT-NEXT: br i1 [[TMP20]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP]] 2044; OPT: memmove_done: 2045; OPT-NEXT: ret void 2046; 2047 call void @llvm.memmove.p3.p0.i32(ptr addrspace(3) %dst, ptr addrspace(0) %src, i32 %size, i1 false) 2048 ret void 2049} 2050 2051define amdgpu_kernel void @memmove_local_align1_local_align1(ptr addrspace(3) %dst, ptr addrspace(3) %src) { 2052; MAX1024-LABEL: @memmove_local_align1_local_align1( 2053; MAX1024-NEXT: call void @llvm.memmove.p3.p3.i32(ptr addrspace(3) [[DST:%.*]], ptr addrspace(3) [[SRC:%.*]], i32 256, i1 false) 2054; MAX1024-NEXT: ret void 2055; 2056; ALL-LABEL: @memmove_local_align1_local_align1( 2057; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(3) [[SRC:%.*]], [[DST:%.*]] 2058; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]] 2059; ALL: memmove_bwd_loop: 2060; ALL-NEXT: [[TMP1:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] 
], [ 256, [[TMP0:%.*]] ] 2061; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP1]], 8 2062; ALL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_INDEX]] 2063; ALL-NEXT: [[ELEMENT:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP2]], align 1 2064; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_INDEX]] 2065; ALL-NEXT: store <2 x i32> [[ELEMENT]], ptr addrspace(3) [[TMP3]], align 1 2066; ALL-NEXT: [[TMP4:%.*]] = icmp eq i32 [[BWD_INDEX]], 0 2067; ALL-NEXT: br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]] 2068; ALL: memmove_fwd_loop: 2069; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i32 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ] 2070; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[FWD_INDEX]] 2071; ALL-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 1 2072; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_INDEX]] 2073; ALL-NEXT: store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP6]], align 1 2074; ALL-NEXT: [[TMP7]] = add i32 [[FWD_INDEX]], 8 2075; ALL-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 256 2076; ALL-NEXT: br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]] 2077; ALL: memmove_done: 2078; ALL-NEXT: ret void 2079; 2080 call void @llvm.memmove.p3.p3.i32(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 256, i1 false) 2081 ret void 2082} 2083 2084define amdgpu_kernel void @memmove_local_align1_local_align1_unknown_size(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %size) { 2085; OPT-LABEL: @memmove_local_align1_local_align1_unknown_size( 2086; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7 2087; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]] 2088; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0 2089; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP3]], 0 2090; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(3) [[SRC:%.*]], 
[[DST:%.*]] 2091; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]] 2092; OPT: memmove_copy_backwards: 2093; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_BWD_MIDDLE:%.*]], label [[MEMMOVE_BWD_RESIDUAL_LOOP:%.*]] 2094; OPT: memmove_bwd_residual_loop: 2095; OPT-NEXT: [[TMP4:%.*]] = phi i32 [ [[BWD_RESIDUAL_INDEX:%.*]], [[MEMMOVE_BWD_RESIDUAL_LOOP]] ], [ [[SIZE]], [[MEMMOVE_COPY_BACKWARDS]] ] 2096; OPT-NEXT: [[BWD_RESIDUAL_INDEX]] = sub i32 [[TMP4]], 1 2097; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_RESIDUAL_INDEX]] 2098; OPT-NEXT: [[ELEMENT:%.*]] = load i8, ptr addrspace(3) [[TMP5]], align 1 2099; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_RESIDUAL_INDEX]] 2100; OPT-NEXT: store i8 [[ELEMENT]], ptr addrspace(3) [[TMP6]], align 1 2101; OPT-NEXT: [[TMP7:%.*]] = icmp eq i32 [[BWD_RESIDUAL_INDEX]], [[TMP3]] 2102; OPT-NEXT: br i1 [[TMP7]], label [[MEMMOVE_BWD_MIDDLE]], label [[MEMMOVE_BWD_RESIDUAL_LOOP]] 2103; OPT: memmove_bwd_middle: 2104; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]] 2105; OPT: memmove_bwd_main_loop: 2106; OPT-NEXT: [[TMP8:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP3]], [[MEMMOVE_BWD_MIDDLE]] ] 2107; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP8]], 8 2108; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_MAIN_INDEX]] 2109; OPT-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP9]], align 1 2110; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_MAIN_INDEX]] 2111; OPT-NEXT: store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP10]], align 1 2112; OPT-NEXT: [[TMP11:%.*]] = icmp eq i32 [[BWD_MAIN_INDEX]], 0 2113; OPT-NEXT: br i1 [[TMP11]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]] 2114; OPT: memmove_copy_forward: 2115; 
OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]] 2116; OPT: memmove_fwd_main_loop: 2117; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i32 [ [[TMP14:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ] 2118; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[FWD_MAIN_INDEX]] 2119; OPT-NEXT: [[ELEMENT2:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP12]], align 1 2120; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_MAIN_INDEX]] 2121; OPT-NEXT: store <2 x i32> [[ELEMENT2]], ptr addrspace(3) [[TMP13]], align 1 2122; OPT-NEXT: [[TMP14]] = add i32 [[FWD_MAIN_INDEX]], 8 2123; OPT-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP14]], [[TMP3]] 2124; OPT-NEXT: br i1 [[TMP15]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]] 2125; OPT: memmove_fwd_middle: 2126; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]] 2127; OPT: memmove_fwd_residual_loop: 2128; OPT-NEXT: [[FWD_RESIDUAL_INDEX:%.*]] = phi i32 [ [[TMP18:%.*]], [[MEMMOVE_FWD_RESIDUAL_LOOP]] ], [ [[TMP3]], [[MEMMOVE_FWD_MIDDLE]] ] 2129; OPT-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[FWD_RESIDUAL_INDEX]] 2130; OPT-NEXT: [[ELEMENT3:%.*]] = load i8, ptr addrspace(3) [[TMP16]], align 1 2131; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_RESIDUAL_INDEX]] 2132; OPT-NEXT: store i8 [[ELEMENT3]], ptr addrspace(3) [[TMP17]], align 1 2133; OPT-NEXT: [[TMP18]] = add i32 [[FWD_RESIDUAL_INDEX]], 1 2134; OPT-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP18]], [[SIZE]] 2135; OPT-NEXT: br i1 [[TMP19]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP]] 2136; OPT: memmove_done: 2137; OPT-NEXT: ret void 2138; 2139 call void @llvm.memmove.p3.p3.i32(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %size, i1 false) 2140 ret void 2141} 2142 2143define amdgpu_kernel void 
@memmove_private_align1_private_align1(ptr addrspace(5) %dst, ptr addrspace(5) %src) { 2144; MAX1024-LABEL: @memmove_private_align1_private_align1( 2145; MAX1024-NEXT: call void @llvm.memmove.p5.p5.i32(ptr addrspace(5) [[DST:%.*]], ptr addrspace(5) [[SRC:%.*]], i32 256, i1 false) 2146; MAX1024-NEXT: ret void 2147; 2148; ALL-LABEL: @memmove_private_align1_private_align1( 2149; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(5) [[SRC:%.*]], [[DST:%.*]] 2150; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]] 2151; ALL: memmove_bwd_loop: 2152; ALL-NEXT: [[TMP1:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ] 2153; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP1]], 256 2154; ALL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[BWD_INDEX]] 2155; ALL-NEXT: [[ELEMENT:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP2]], align 1 2156; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 [[BWD_INDEX]] 2157; ALL-NEXT: store <64 x i32> [[ELEMENT]], ptr addrspace(5) [[TMP3]], align 1 2158; ALL-NEXT: [[TMP4:%.*]] = icmp eq i32 [[BWD_INDEX]], 0 2159; ALL-NEXT: br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]] 2160; ALL: memmove_fwd_loop: 2161; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i32 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ] 2162; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[FWD_INDEX]] 2163; ALL-NEXT: [[ELEMENT1:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP5]], align 1 2164; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 [[FWD_INDEX]] 2165; ALL-NEXT: store <64 x i32> [[ELEMENT1]], ptr addrspace(5) [[TMP6]], align 1 2166; ALL-NEXT: [[TMP7]] = add i32 [[FWD_INDEX]], 256 2167; ALL-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 256 2168; ALL-NEXT: br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]] 2169; ALL: memmove_done: 
2170; ALL-NEXT: ret void 2171; 2172 call void @llvm.memmove.p5.p5.i32(ptr addrspace(5) %dst, ptr addrspace(5) %src, i32 256, i1 false) 2173 ret void 2174} 2175 2176define amdgpu_kernel void @memmove_private_align1_private_align1_unknown_size(ptr addrspace(5) %dst, ptr addrspace(5) %src, i32 %size) { 2177; OPT-LABEL: @memmove_private_align1_private_align1_unknown_size( 2178; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 15 2179; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]] 2180; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0 2181; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP3]], 0 2182; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(5) [[SRC:%.*]], [[DST:%.*]] 2183; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]] 2184; OPT: memmove_copy_backwards: 2185; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_BWD_MIDDLE:%.*]], label [[MEMMOVE_BWD_RESIDUAL_LOOP:%.*]] 2186; OPT: memmove_bwd_residual_loop: 2187; OPT-NEXT: [[TMP4:%.*]] = phi i32 [ [[BWD_RESIDUAL_INDEX:%.*]], [[MEMMOVE_BWD_RESIDUAL_LOOP]] ], [ [[SIZE]], [[MEMMOVE_COPY_BACKWARDS]] ] 2188; OPT-NEXT: [[BWD_RESIDUAL_INDEX]] = sub i32 [[TMP4]], 1 2189; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[BWD_RESIDUAL_INDEX]] 2190; OPT-NEXT: [[ELEMENT:%.*]] = load i8, ptr addrspace(5) [[TMP5]], align 1 2191; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 [[BWD_RESIDUAL_INDEX]] 2192; OPT-NEXT: store i8 [[ELEMENT]], ptr addrspace(5) [[TMP6]], align 1 2193; OPT-NEXT: [[TMP7:%.*]] = icmp eq i32 [[BWD_RESIDUAL_INDEX]], [[TMP3]] 2194; OPT-NEXT: br i1 [[TMP7]], label [[MEMMOVE_BWD_MIDDLE]], label [[MEMMOVE_BWD_RESIDUAL_LOOP]] 2195; OPT: memmove_bwd_middle: 2196; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]] 2197; OPT: memmove_bwd_main_loop: 2198; OPT-NEXT: [[TMP8:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], 
[[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP3]], [[MEMMOVE_BWD_MIDDLE]] ] 2199; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP8]], 16 2200; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[BWD_MAIN_INDEX]] 2201; OPT-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP9]], align 1 2202; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 [[BWD_MAIN_INDEX]] 2203; OPT-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(5) [[TMP10]], align 1 2204; OPT-NEXT: [[TMP11:%.*]] = icmp eq i32 [[BWD_MAIN_INDEX]], 0 2205; OPT-NEXT: br i1 [[TMP11]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]] 2206; OPT: memmove_copy_forward: 2207; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]] 2208; OPT: memmove_fwd_main_loop: 2209; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i32 [ [[TMP14:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ] 2210; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[FWD_MAIN_INDEX]] 2211; OPT-NEXT: [[ELEMENT2:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP12]], align 1 2212; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 [[FWD_MAIN_INDEX]] 2213; OPT-NEXT: store <4 x i32> [[ELEMENT2]], ptr addrspace(5) [[TMP13]], align 1 2214; OPT-NEXT: [[TMP14]] = add i32 [[FWD_MAIN_INDEX]], 16 2215; OPT-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP14]], [[TMP3]] 2216; OPT-NEXT: br i1 [[TMP15]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]] 2217; OPT: memmove_fwd_middle: 2218; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]] 2219; OPT: memmove_fwd_residual_loop: 2220; OPT-NEXT: [[FWD_RESIDUAL_INDEX:%.*]] = phi i32 [ [[TMP18:%.*]], [[MEMMOVE_FWD_RESIDUAL_LOOP]] ], [ [[TMP3]], [[MEMMOVE_FWD_MIDDLE]] ] 2221; OPT-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[FWD_RESIDUAL_INDEX]] 2222; 
OPT-NEXT: [[ELEMENT3:%.*]] = load i8, ptr addrspace(5) [[TMP16]], align 1 2223; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 [[FWD_RESIDUAL_INDEX]] 2224; OPT-NEXT: store i8 [[ELEMENT3]], ptr addrspace(5) [[TMP17]], align 1 2225; OPT-NEXT: [[TMP18]] = add i32 [[FWD_RESIDUAL_INDEX]], 1 2226; OPT-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP18]], [[SIZE]] 2227; OPT-NEXT: br i1 [[TMP19]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP]] 2228; OPT: memmove_done: 2229; OPT-NEXT: ret void 2230; 2231 call void @llvm.memmove.p5.p5.i32(ptr addrspace(5) %dst, ptr addrspace(5) %src, i32 %size, i1 false) 2232 ret void 2233} 2234 2235define amdgpu_kernel void @memmove_global_align4_static_residual_empty(ptr addrspace(1) %dst, ptr addrspace(1) %src) { 2236; OPT-LABEL: @memmove_global_align4_static_residual_empty( 2237; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[DST:%.*]] 2238; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]] 2239; OPT: memmove_bwd_loop: 2240; OPT-NEXT: [[TMP11:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 1280, [[TMP0:%.*]] ] 2241; OPT-NEXT: [[BWD_INDEX]] = sub i64 [[TMP11]], 256 2242; OPT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]] 2243; OPT-NEXT: [[ELEMENT:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP2]], align 1 2244; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]] 2245; OPT-NEXT: store <64 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1 2246; OPT-NEXT: [[TMP4:%.*]] = icmp eq i64 [[BWD_INDEX]], 0 2247; OPT-NEXT: br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]] 2248; OPT: memmove_fwd_loop: 2249; OPT-NEXT: [[FWD_INDEX:%.*]] = phi i64 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ] 2250; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]] 2251; 
OPT-NEXT: [[ELEMENT1:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP5]], align 1 2252; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]] 2253; OPT-NEXT: store <64 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP6]], align 1 2254; OPT-NEXT: [[TMP7]] = add i64 [[FWD_INDEX]], 256 2255; OPT-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 1280 2256; OPT-NEXT: br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]] 2257; OPT: memmove_done: 2258; OPT-NEXT: ret void 2259; 2260 call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 1280, i1 false) 2261 ret void 2262} 2263 2264define amdgpu_kernel void @memmove_global_align4_static_residual_full(ptr addrspace(1) %dst, ptr addrspace(1) %src) { 2265; OPT-LABEL: @memmove_global_align4_static_residual_full( 2266; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[DST:%.*]] 2267; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_RESIDUAL:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]] 2268; OPT: memmove_bwd_residual: 2269; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1038 2270; OPT-NEXT: [[TMP2:%.*]] = load i8, ptr addrspace(1) [[TMP1]], align 1 2271; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1038 2272; OPT-NEXT: store i8 [[TMP2]], ptr addrspace(1) [[TMP3]], align 1 2273; OPT-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1036 2274; OPT-NEXT: [[TMP5:%.*]] = load i16, ptr addrspace(1) [[TMP4]], align 1 2275; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1036 2276; OPT-NEXT: store i16 [[TMP5]], ptr addrspace(1) [[TMP6]], align 1 2277; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1032 2278; OPT-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 1 2279; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1032 2280; 
OPT-NEXT: store i32 [[TMP8]], ptr addrspace(1) [[TMP9]], align 1 2281; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024 2282; OPT-NEXT: [[TMP11:%.*]] = load i64, ptr addrspace(1) [[TMP10]], align 1 2283; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024 2284; OPT-NEXT: store i64 [[TMP11]], ptr addrspace(1) [[TMP12]], align 1 2285; OPT-NEXT: br label [[MEMMOVE_BWD_LOOP:%.*]] 2286; OPT: memmove_bwd_loop: 2287; OPT-NEXT: [[TMP13:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 1024, [[MEMMOVE_BWD_RESIDUAL]] ] 2288; OPT-NEXT: [[BWD_INDEX]] = sub i64 [[TMP13]], 256 2289; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]] 2290; OPT-NEXT: [[ELEMENT:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP14]], align 1 2291; OPT-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]] 2292; OPT-NEXT: store <64 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP15]], align 1 2293; OPT-NEXT: [[TMP16:%.*]] = icmp eq i64 [[BWD_INDEX]], 0 2294; OPT-NEXT: br i1 [[TMP16]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]] 2295; OPT: memmove_fwd_loop: 2296; OPT-NEXT: [[FWD_INDEX:%.*]] = phi i64 [ [[TMP19:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0:%.*]] ] 2297; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]] 2298; OPT-NEXT: [[ELEMENT1:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP17]], align 1 2299; OPT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]] 2300; OPT-NEXT: store <64 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP18]], align 1 2301; OPT-NEXT: [[TMP19]] = add i64 [[FWD_INDEX]], 256 2302; OPT-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], 1024 2303; OPT-NEXT: br i1 [[TMP20]], label [[MEMMOVE_FWD_RESIDUAL:%.*]], label [[MEMMOVE_FWD_LOOP]] 2304; OPT: memmove_fwd_residual: 2305; OPT-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr 
addrspace(1) [[SRC]], i64 1024 2306; OPT-NEXT: [[TMP22:%.*]] = load i64, ptr addrspace(1) [[TMP21]], align 1 2307; OPT-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024 2308; OPT-NEXT: store i64 [[TMP22]], ptr addrspace(1) [[TMP23]], align 1 2309; OPT-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1032 2310; OPT-NEXT: [[TMP25:%.*]] = load i32, ptr addrspace(1) [[TMP24]], align 1 2311; OPT-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1032 2312; OPT-NEXT: store i32 [[TMP25]], ptr addrspace(1) [[TMP26]], align 1 2313; OPT-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1036 2314; OPT-NEXT: [[TMP28:%.*]] = load i16, ptr addrspace(1) [[TMP27]], align 1 2315; OPT-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1036 2316; OPT-NEXT: store i16 [[TMP28]], ptr addrspace(1) [[TMP29]], align 1 2317; OPT-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1038 2318; OPT-NEXT: [[TMP31:%.*]] = load i8, ptr addrspace(1) [[TMP30]], align 1 2319; OPT-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1038 2320; OPT-NEXT: store i8 [[TMP31]], ptr addrspace(1) [[TMP32]], align 1 2321; OPT-NEXT: br label [[MEMMOVE_DONE]] 2322; OPT: memmove_done: 2323; OPT-NEXT: ret void 2324; 2325 call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 1039, i1 false) 2326 ret void 2327} 2328 2329define void @test_umin(i64 %0, i64 %idxprom, ptr %x, ptr %y) { 2330; OPT-LABEL: @test_umin( 2331; OPT-NEXT: entry: 2332; OPT-NEXT: [[ARRAYIDX:%.*]] = getelementptr [32 x [8 x i64]], ptr [[Y:%.*]], i64 0, i64 [[IDXPROM:%.*]] 2333; OPT-NEXT: [[SPEC_SELECT:%.*]] = tail call i64 @llvm.umin.i64(i64 sub (i64 ptrtoint (ptr addrspacecast (ptr addrspace(4) inttoptr (i64 32 to ptr addrspace(4)) to ptr) to i64), i64 ptrtoint (ptr addrspacecast (ptr addrspace(4) null to ptr) to 
i64)), i64 56) 2334; OPT-NEXT: [[TMP2:%.*]] = and i64 [[SPEC_SELECT]], 15 2335; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[SPEC_SELECT]], [[TMP2]] 2336; OPT-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0 2337; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] 2338; OPT: loop-memcpy-expansion: 2339; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] 2340; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i64 [[LOOP_INDEX]] 2341; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr [[TMP5]], align 1 2342; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[ARRAYIDX]], i64 [[LOOP_INDEX]] 2343; OPT-NEXT: store <4 x i32> [[TMP6]], ptr [[TMP7]], align 1 2344; OPT-NEXT: [[TMP8]] = add i64 [[LOOP_INDEX]], 16 2345; OPT-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]] 2346; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] 2347; OPT: loop-memcpy-residual: 2348; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] 2349; OPT-NEXT: [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] 2350; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 [[TMP10]] 2351; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr [[TMP11]], align 1 2352; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[ARRAYIDX]], i64 [[TMP10]] 2353; OPT-NEXT: store i8 [[TMP12]], ptr [[TMP13]], align 1 2354; OPT-NEXT: [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1 2355; OPT-NEXT: [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]] 2356; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] 2357; OPT: post-loop-memcpy-expansion: 2358; OPT-NEXT: ret void 2359; OPT: loop-memcpy-residual-header: 2360; OPT-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0 2361; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], 
label [[POST_LOOP_MEMCPY_EXPANSION]] 2362; 2363entry: 2364 %arrayidx = getelementptr [32 x [8 x i64]], ptr %y, i64 0, i64 %idxprom 2365 %spec.select = tail call i64 @llvm.umin.i64(i64 sub (i64 ptrtoint (ptr addrspacecast (ptr addrspace(4) inttoptr (i64 32 to ptr addrspace(4)) to ptr) to i64), i64 ptrtoint (ptr addrspacecast (ptr addrspace(4) null to ptr) to i64)), i64 56) 2366 tail call void @llvm.memcpy.p0.p0.i64(ptr %arrayidx, ptr %x, i64 %spec.select, i1 false) 2367 ret void 2368} 2369 2370define amdgpu_kernel void @memmove_volatile(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { 2371; MAX1024-LABEL: @memmove_volatile( 2372; MAX1024-NEXT: call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]], i64 512, i1 true) 2373; MAX1024-NEXT: ret void 2374; 2375; ALL-LABEL: @memmove_volatile( 2376; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[DST:%.*]] 2377; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]] 2378; ALL: memmove_bwd_loop: 2379; ALL-NEXT: [[TMP1:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 512, [[TMP0:%.*]] ] 2380; ALL-NEXT: [[BWD_INDEX]] = sub i64 [[TMP1]], 256 2381; ALL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]] 2382; ALL-NEXT: [[ELEMENT:%.*]] = load volatile <64 x i32>, ptr addrspace(1) [[TMP2]], align 1 2383; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]] 2384; ALL-NEXT: store volatile <64 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1 2385; ALL-NEXT: [[TMP4:%.*]] = icmp eq i64 [[BWD_INDEX]], 0 2386; ALL-NEXT: br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]] 2387; ALL: memmove_fwd_loop: 2388; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i64 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ] 2389; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]] 2390; 
ALL-NEXT: [[ELEMENT1:%.*]] = load volatile <64 x i32>, ptr addrspace(1) [[TMP5]], align 1 2391; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]] 2392; ALL-NEXT: store volatile <64 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP6]], align 1 2393; ALL-NEXT: [[TMP7]] = add i64 [[FWD_INDEX]], 256 2394; ALL-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 512 2395; ALL-NEXT: br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]] 2396; ALL: memmove_done: 2397; ALL-NEXT: ret void 2398; 2399 call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 512, i1 true) 2400 ret void 2401} 2402 2403define amdgpu_kernel void @memcpy_volatile(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { 2404; MAX1024-LABEL: @memcpy_volatile( 2405; MAX1024-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]], i64 512, i1 true) 2406; MAX1024-NEXT: ret void 2407; 2408; ALL-LABEL: @memcpy_volatile( 2409; ALL-NEXT: br label [[LOAD_STORE_LOOP:%.*]] 2410; ALL: load-store-loop: 2411; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] 2412; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] 2413; ALL-NEXT: [[TMP2:%.*]] = load volatile <64 x i32>, ptr addrspace(1) [[TMP1]], align 1 2414; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]] 2415; ALL-NEXT: store volatile <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 1 2416; ALL-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 2417; ALL-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 512 2418; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] 2419; ALL: memcpy-split: 2420; ALL-NEXT: ret void 2421; 2422 call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 512, i1 true) 2423 ret void 2424} 2425 2426declare i64 @llvm.umin.i64(i64, i64) 

; Attribute groups referenced by the functions in this test.
; Group 0 is attached to kernel definitions such as @memmove_volatile and
; @memcpy_volatile.
attributes #0 = { nounwind }
; Group 1 is attached to the llvm.memcpy / llvm.memmove / llvm.memset
; declarations at the top of the file. It is spelled with the memory(...)
; attribute, which is the canonical form of the legacy 'argmemonly' keyword
; (the IR parser auto-upgrades 'argmemonly' to exactly this).
attributes #1 = { nounwind memory(argmem: readwrite) }