; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -amdgpu-aa-wrapper -amdgpu-aa -instcombine -o - %s | FileCheck %s

; Make sure the optimization from memcpy-from-global.ll happens even when
; the constant source is not a global variable.

target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"

; Simple memcpy to alloca from constant address space argument.
define i8 @memcpy_constant_arg_ptr_to_alloca(ptr addrspace(4) noalias readonly align 4 dereferenceable(32) %arg, i32 %idx) {
; CHECK-LABEL: @memcpy_constant_arg_ptr_to_alloca(
; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[IDX:%.*]] to i64
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i8], ptr addrspace(4) [[ARG:%.*]], i64 0, i64 [[TMP1]]
; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(4) [[GEP]], align 1
; CHECK-NEXT:    ret i8 [[LOAD]]
;
  %alloca = alloca [32 x i8], align 4, addrspace(5)
  call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %alloca, ptr addrspace(4) %arg, i64 32, i1 false)
  %gep = getelementptr inbounds [32 x i8], ptr addrspace(5) %alloca, i32 0, i32 %idx
  %load = load i8, ptr addrspace(5) %gep
  ret i8 %load
}

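; Same fold as above, but the !noalias metadata on the load must be preserved
; on the folded load from the constant source.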
define i8 @memcpy_constant_arg_ptr_to_alloca_load_metadata(ptr addrspace(4) noalias readonly align 4 dereferenceable(32) %arg, i32 %idx) {
; CHECK-LABEL: @memcpy_constant_arg_ptr_to_alloca_load_metadata(
; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[IDX:%.*]] to i64
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i8], ptr addrspace(4) [[ARG:%.*]], i64 0, i64 [[TMP1]]
; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(4) [[GEP]], align 1, !noalias [[META0:![0-9]+]]
; CHECK-NEXT:    ret i8 [[LOAD]]
;
  %alloca = alloca [32 x i8], align 4, addrspace(5)
  call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %alloca, ptr addrspace(4) %arg, i64 32, i1 false)
  %gep = getelementptr inbounds [32 x i8], ptr addrspace(5) %alloca, i32 0, i32 %idx
  %load = load i8, ptr addrspace(5) %gep, !noalias !0
  ret i8 %load
}

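; Same fold; the explicit align 16 on the load must carry over to the load
; from the constant source.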
define i64 @memcpy_constant_arg_ptr_to_alloca_load_alignment(ptr addrspace(4) noalias readonly align 4 dereferenceable(256) %arg, i32 %idx) {
; CHECK-LABEL: @memcpy_constant_arg_ptr_to_alloca_load_alignment(
; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[IDX:%.*]] to i64
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i64], ptr addrspace(4) [[ARG:%.*]], i64 0, i64 [[TMP1]]
; CHECK-NEXT:    [[LOAD:%.*]] = load i64, ptr addrspace(4) [[GEP]], align 16
; CHECK-NEXT:    ret i64 [[LOAD]]
;
  %alloca = alloca [32 x i64], align 4, addrspace(5)
  call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %alloca, ptr addrspace(4) %arg, i64 256, i1 false)
  %gep = getelementptr inbounds [32 x i64], ptr addrspace(5) %alloca, i32 0, i32 %idx
  %load = load i64, ptr addrspace(5) %gep, align 16
  ret i64 %load
}

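; Negative test: the atomic load is not forwarded to the constant source, so
; the alloca and the memcpy must stay.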
define i64 @memcpy_constant_arg_ptr_to_alloca_load_atomic(ptr addrspace(4) noalias readonly align 8 dereferenceable(256) %arg, i32 %idx) {
; CHECK-LABEL: @memcpy_constant_arg_ptr_to_alloca_load_atomic(
; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [32 x i64], align 8, addrspace(5)
; CHECK-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef align 8 dereferenceable(256) [[ALLOCA]], ptr addrspace(4) noundef align 8 dereferenceable(256) [[ARG:%.*]], i64 256, i1 false)
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i64], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[IDX:%.*]]
; CHECK-NEXT:    [[LOAD:%.*]] = load atomic i64, ptr addrspace(5) [[GEP]] syncscope("somescope") acquire, align 8
; CHECK-NEXT:    ret i64 [[LOAD]]
;
  %alloca = alloca [32 x i64], align 8, addrspace(5)
  call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %alloca, ptr addrspace(4) %arg, i64 256, i1 false)
  %gep = getelementptr inbounds [32 x i64], ptr addrspace(5) %alloca, i32 0, i32 %idx
  %load = load atomic i64, ptr addrspace(5) %gep syncscope("somescope") acquire, align 8
  ret i64 %load
}

; Simple memmove to alloca from constant address space argument.
define i8 @memmove_constant_arg_ptr_to_alloca(ptr addrspace(4) noalias readonly align 4 dereferenceable(32) %arg, i32 %idx) {
; CHECK-LABEL: @memmove_constant_arg_ptr_to_alloca(
; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[IDX:%.*]] to i64
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i8], ptr addrspace(4) [[ARG:%.*]], i64 0, i64 [[TMP1]]
; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(4) [[GEP]], align 1
; CHECK-NEXT:    ret i8 [[LOAD]]
;
  %alloca = alloca [32 x i8], align 4, addrspace(5)
  call void @llvm.memmove.p5.p4.i32(ptr addrspace(5) %alloca, ptr addrspace(4) %arg, i32 32, i1 false)
  %gep = getelementptr inbounds [32 x i8], ptr addrspace(5) %alloca, i32 0, i32 %idx
  %load = load i8, ptr addrspace(5) %gep
  ret i8 %load
}

; Simple memcpy to alloca from byref constant address space argument.
define amdgpu_kernel void @memcpy_constant_byref_arg_ptr_to_alloca(ptr addrspace(4) noalias readonly align 4 byref([32 x i8]) %arg, ptr addrspace(1) %out, i32 %idx) {
; CHECK-LABEL: @memcpy_constant_byref_arg_ptr_to_alloca(
; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[IDX:%.*]] to i64
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i8], ptr addrspace(4) [[ARG:%.*]], i64 0, i64 [[TMP1]]
; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(4) [[GEP]], align 1
; CHECK-NEXT:    store i8 [[LOAD]], ptr addrspace(1) [[OUT:%.*]], align 1
; CHECK-NEXT:    ret void
;
  %alloca = alloca [32 x i8], align 4, addrspace(5)
  call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %alloca, ptr addrspace(4) %arg, i64 32, i1 false)
  %gep = getelementptr inbounds [32 x i8], ptr addrspace(5) %alloca, i32 0, i32 %idx
  %load = load i8, ptr addrspace(5) %gep
  store i8 %load, ptr addrspace(1) %out
  ret void
}

; Simple memcpy to alloca from byref constant address space argument, but not
; enough bytes of the source are dereferenceable to cover the whole alloca, so
; the copy is not folded.
define amdgpu_kernel void @memcpy_constant_byref_arg_ptr_to_alloca_too_many_bytes(ptr addrspace(4) noalias readonly align 4 byref([31 x i8]) %arg, ptr addrspace(1) %out, i32 %idx) {
; CHECK-LABEL: @memcpy_constant_byref_arg_ptr_to_alloca_too_many_bytes(
; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [32 x i8], align 4, addrspace(5)
; CHECK-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef align 4 dereferenceable(31) [[ALLOCA]], ptr addrspace(4) noundef align 4 dereferenceable(31) [[ARG:%.*]], i64 31, i1 false)
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i8], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[IDX:%.*]]
; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(5) [[GEP]], align 1
; CHECK-NEXT:    store i8 [[LOAD]], ptr addrspace(1) [[OUT:%.*]], align 1
; CHECK-NEXT:    ret void
;
  %alloca = alloca [32 x i8], align 4, addrspace(5)
  call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %alloca, ptr addrspace(4) %arg, i64 31, i1 false)
  %gep = getelementptr inbounds [32 x i8], ptr addrspace(5) %alloca, i32 0, i32 %idx
  %load = load i8, ptr addrspace(5) %gep
  store i8 %load, ptr addrspace(1) %out
  ret void
}

; Simple memcpy to alloca from constant address space intrinsic call.
define amdgpu_kernel void @memcpy_constant_intrinsic_ptr_to_alloca(ptr addrspace(1) %out, i32 %idx) {
; CHECK-LABEL: @memcpy_constant_intrinsic_ptr_to_alloca(
; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [32 x i8], align 4, addrspace(5)
; CHECK-NEXT:    [[KERNARG_SEGMENT_PTR:%.*]] = call align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; CHECK-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef align 4 dereferenceable(32) [[ALLOCA]], ptr addrspace(4) noundef align 16 dereferenceable(32) [[KERNARG_SEGMENT_PTR]], i64 32, i1 false)
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i8], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[IDX:%.*]]
; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(5) [[GEP]], align 1
; CHECK-NEXT:    store i8 [[LOAD]], ptr addrspace(1) [[OUT:%.*]], align 1
; CHECK-NEXT:    ret void
;
  %alloca = alloca [32 x i8], align 4, addrspace(5)
  %kernarg.segment.ptr = call dereferenceable(32) align 16 ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
  call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %alloca, ptr addrspace(4) %kernarg.segment.ptr, i64 32, i1 false)
  %gep = getelementptr inbounds [32 x i8], ptr addrspace(5) %alloca, i32 0, i32 %idx
  %load = load i8, ptr addrspace(5) %gep
  store i8 %load, ptr addrspace(1) %out
  ret void
}

; Alloca is written through a flat pointer.
define i8 @memcpy_constant_arg_ptr_to_alloca_addrspacecast_to_flat(ptr addrspace(4) noalias readonly align 4 dereferenceable(32) %arg, i32 %idx) {
; CHECK-LABEL: @memcpy_constant_arg_ptr_to_alloca_addrspacecast_to_flat(
; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[IDX:%.*]] to i64
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i8], ptr addrspace(4) [[ARG:%.*]], i64 0, i64 [[TMP1]]
; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(4) [[GEP]], align 1
; CHECK-NEXT:    ret i8 [[LOAD]]
;
  %alloca = alloca [32 x i8], align 4, addrspace(5)
  %alloca.cast.asc = addrspacecast ptr addrspace(5) %alloca to ptr
  call void @llvm.memcpy.p0.p4.i64(ptr %alloca.cast.asc, ptr addrspace(4) %arg, i64 31, i1 false)
  %gep = getelementptr inbounds [32 x i8], ptr addrspace(5) %alloca, i32 0, i32 %idx
  %load = load i8, ptr addrspace(5) %gep
  ret i8 %load
}

; Alloca is only addressed through flat pointer.
define i8 @memcpy_constant_arg_ptr_to_alloca_addrspacecast_to_flat2(ptr addrspace(4) noalias readonly align 4 dereferenceable(32) %arg, i32 %idx) {
; CHECK-LABEL: @memcpy_constant_arg_ptr_to_alloca_addrspacecast_to_flat2(
; CHECK-NEXT:    [[ALLOCA_CAST_ASC:%.*]] = addrspacecast ptr addrspace(4) [[ARG:%.*]] to ptr
; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[IDX:%.*]] to i64
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i8], ptr [[ALLOCA_CAST_ASC]], i64 0, i64 [[TMP1]]
; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr [[GEP]], align 1
; CHECK-NEXT:    ret i8 [[LOAD]]
;
  %alloca = alloca [32 x i8], align 4, addrspace(5)
  %alloca.cast.asc = addrspacecast ptr addrspace(5) %alloca to ptr
  call void @llvm.memcpy.p0.p4.i64(ptr %alloca.cast.asc, ptr addrspace(4) %arg, i64 32, i1 false)
  %gep = getelementptr inbounds [32 x i8], ptr %alloca.cast.asc, i32 0, i32 %idx
  %load = load i8, ptr %gep
  ret i8 %load
}

%struct.ty = type { [4 x i32] }

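; As the names suggest, the byref_infloop tests below guard against InstCombine
; looping forever on its own output: the two copies through the alloca must
; fold into a single direct copy from the byref argument.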
define amdgpu_kernel void @byref_infloop(ptr %scratch, ptr addrspace(4) byref(%struct.ty) align 4 %arg) local_unnamed_addr #1 {
; CHECK-LABEL: @byref_infloop(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    call void @llvm.memcpy.p0.p4.i32(ptr noundef nonnull align 4 dereferenceable(16) [[SCRATCH:%.*]], ptr addrspace(4) noundef align 4 dereferenceable(16) [[ARG:%.*]], i32 16, i1 false)
; CHECK-NEXT:    ret void
;
bb:
  %alloca = alloca [4 x i32], align 4, addrspace(5)
  call void @llvm.memcpy.p5.p4.i32(ptr addrspace(5) align 4 %alloca, ptr addrspace(4) align 4 %arg, i32 16, i1 false)
  call void @llvm.memcpy.p0.p5.i32(ptr align 4 %scratch, ptr addrspace(5) align 4 %alloca, i32 16, i1 false)
  ret void
}

define amdgpu_kernel void @byref_infloop_metadata(ptr %scratch, ptr addrspace(4) byref(%struct.ty) align 4 %arg) local_unnamed_addr #1 {
; CHECK-LABEL: @byref_infloop_metadata(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    call void @llvm.memcpy.p0.p4.i32(ptr noundef nonnull align 4 dereferenceable(16) [[SCRATCH:%.*]], ptr addrspace(4) noundef align 4 dereferenceable(16) [[ARG:%.*]], i32 16, i1 false), !noalias [[META0]]
; CHECK-NEXT:    ret void
;
bb:
  %alloca = alloca [4 x i32], align 4, addrspace(5)
  call void @llvm.memcpy.p5.p4.i32(ptr addrspace(5) align 4 %alloca, ptr addrspace(4) align 4 %arg, i32 16, i1 false), !noalias !0
  call void @llvm.memcpy.p0.p5.i32(ptr align 4 %scratch, ptr addrspace(5) align 4 %alloca, i32 16, i1 false), !noalias !0
  ret void
}

define amdgpu_kernel void @byref_infloop_addrspacecast(ptr %scratch, ptr addrspace(4) byref(%struct.ty) align 4 %arg) local_unnamed_addr #1 {
; CHECK-LABEL: @byref_infloop_addrspacecast(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[ADDRSPACECAST_ALLOCA:%.*]] = addrspacecast ptr addrspace(4) [[ARG:%.*]] to ptr
; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(16) [[SCRATCH:%.*]], ptr noundef nonnull align 4 dereferenceable(16) [[ADDRSPACECAST_ALLOCA]], i64 16, i1 false)
; CHECK-NEXT:    ret void
;
bb:
  %alloca = alloca [4 x i32], align 4, addrspace(5)
  %addrspacecast.alloca = addrspacecast ptr addrspace(5) %alloca to ptr
  call void @llvm.memcpy.p0.p4.i64(ptr nonnull align 4 dereferenceable(16) %addrspacecast.alloca, ptr addrspace(4) align 4 dereferenceable(16) %arg, i64 16, i1 false)
  call void @llvm.memcpy.p0.p0.i64(ptr nonnull align 4 dereferenceable(16) %scratch, ptr nonnull align 4 dereferenceable(16) %addrspacecast.alloca, i64 16, i1 false)
  ret void
}

define amdgpu_kernel void @byref_infloop_memmove(ptr %scratch, ptr addrspace(4) byref(%struct.ty) align 4 %arg) local_unnamed_addr #1 {
; CHECK-LABEL: @byref_infloop_memmove(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    call void @llvm.memmove.p0.p4.i32(ptr noundef nonnull align 4 dereferenceable(16) [[SCRATCH:%.*]], ptr addrspace(4) noundef align 4 dereferenceable(16) [[ARG:%.*]], i32 16, i1 false)
; CHECK-NEXT:    ret void
;
bb:
  %alloca = alloca [4 x i32], align 4, addrspace(5)
  call void @llvm.memmove.p5.p4.i32(ptr addrspace(5) align 4 %alloca, ptr addrspace(4) align 4 %arg, i32 16, i1 false)
  call void @llvm.memmove.p0.p5.i32(ptr align 4 %scratch, ptr addrspace(5) align 4 %alloca, i32 16, i1 false)
  ret void
}

declare void @llvm.memcpy.p0.p5.i32(ptr noalias nocapture writeonly, ptr addrspace(5) noalias nocapture readonly, i32, i1 immarg) #0
declare void @llvm.memcpy.p5.p4.i32(ptr addrspace(5) nocapture, ptr addrspace(4) nocapture, i32, i1) #0
declare void @llvm.memcpy.p0.p4.i64(ptr nocapture, ptr addrspace(4) nocapture, i64, i1) #0
declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #0
declare void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) nocapture, ptr addrspace(4) nocapture, i64, i1) #0
declare void @llvm.memmove.p5.p4.i32(ptr addrspace(5) nocapture, ptr addrspace(4) nocapture, i32, i1) #0
declare void @llvm.memmove.p0.p5.i32(ptr nocapture, ptr addrspace(5) nocapture, i32, i1) #0
declare ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() #1

attributes #0 = { argmemonly nounwind willreturn }
attributes #1 = { nounwind readnone speculatable }

!0 = !{!1}
!1 = !{!1, !2}
!2 = !{!2}
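; !2 defines an alias domain, !1 a self-referential scope within that domain,
; and !0 the scope list used by the !noalias tests above.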