; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -amdgpu-aa-wrapper -amdgpu-aa -instcombine -o - %s | FileCheck %s

; Make sure the optimization from memcpy-from-global.ll also happens when
; the constant source is not a global variable.

target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"

; Simple memcpy to alloca from constant address space argument.
define i8 @memcpy_constant_arg_ptr_to_alloca(ptr addrspace(4) noalias readonly align 4 dereferenceable(32) %arg, i32 %idx) {
; CHECK-LABEL: @memcpy_constant_arg_ptr_to_alloca(
; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[IDX:%.*]] to i64
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i8], ptr addrspace(4) [[ARG:%.*]], i64 0, i64 [[TMP1]]
; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(4) [[GEP]], align 1
; CHECK-NEXT:    ret i8 [[LOAD]]
;
  %alloca = alloca [32 x i8], align 4, addrspace(5)
  call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %alloca, ptr addrspace(4) %arg, i64 32, i1 false)
  %gep = getelementptr inbounds [32 x i8], ptr addrspace(5) %alloca, i32 0, i32 %idx
  %load = load i8, ptr addrspace(5) %gep
  ret i8 %load
}

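; Same as the basic case above, but the load carries !noalias metadata; the
; fold should still fire, and the metadata is expected to be kept on the
; folded load.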
define i8 @memcpy_constant_arg_ptr_to_alloca_load_metadata(ptr addrspace(4) noalias readonly align 4 dereferenceable(32) %arg, i32 %idx) {
; CHECK-LABEL: @memcpy_constant_arg_ptr_to_alloca_load_metadata(
; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[IDX:%.*]] to i64
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i8], ptr addrspace(4) [[ARG:%.*]], i64 0, i64 [[TMP1]]
; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(4) [[GEP]], align 1, !noalias [[META0:![0-9]+]]
; CHECK-NEXT:    ret i8 [[LOAD]]
;
  %alloca = alloca [32 x i8], align 4, addrspace(5)
  call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %alloca, ptr addrspace(4) %arg, i64 32, i1 false)
  %gep = getelementptr inbounds [32 x i8], ptr addrspace(5) %alloca, i32 0, i32 %idx
  %load = load i8, ptr addrspace(5) %gep, !noalias !0
  ret i8 %load
}

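; The explicit align 16 on the load must survive the fold to the constant
; source.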
define i64 @memcpy_constant_arg_ptr_to_alloca_load_alignment(ptr addrspace(4) noalias readonly align 4 dereferenceable(256) %arg, i32 %idx) {
; CHECK-LABEL: @memcpy_constant_arg_ptr_to_alloca_load_alignment(
; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[IDX:%.*]] to i64
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i64], ptr addrspace(4) [[ARG:%.*]], i64 0, i64 [[TMP1]]
; CHECK-NEXT:    [[LOAD:%.*]] = load i64, ptr addrspace(4) [[GEP]], align 16
; CHECK-NEXT:    ret i64 [[LOAD]]
;
  %alloca = alloca [32 x i64], align 4, addrspace(5)
  call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %alloca, ptr addrspace(4) %arg, i64 256, i1 false)
  %gep = getelementptr inbounds [32 x i64], ptr addrspace(5) %alloca, i32 0, i32 %idx
  %load = load i64, ptr addrspace(5) %gep, align 16
  ret i64 %load
}

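; An atomic load is not folded to read directly from the constant source; the
; alloca and the memcpy are expected to remain.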
define i64 @memcpy_constant_arg_ptr_to_alloca_load_atomic(ptr addrspace(4) noalias readonly align 8 dereferenceable(256) %arg, i32 %idx) {
; CHECK-LABEL: @memcpy_constant_arg_ptr_to_alloca_load_atomic(
; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [32 x i64], align 8, addrspace(5)
; CHECK-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef align 8 dereferenceable(256) [[ALLOCA]], ptr addrspace(4) noundef align 8 dereferenceable(256) [[ARG:%.*]], i64 256, i1 false)
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i64], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[IDX:%.*]]
; CHECK-NEXT:    [[LOAD:%.*]] = load atomic i64, ptr addrspace(5) [[GEP]] syncscope("somescope") acquire, align 8
; CHECK-NEXT:    ret i64 [[LOAD]]
;
  %alloca = alloca [32 x i64], align 8, addrspace(5)
  call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %alloca, ptr addrspace(4) %arg, i64 256, i1 false)
  %gep = getelementptr inbounds [32 x i64], ptr addrspace(5) %alloca, i32 0, i32 %idx
  %load = load atomic i64, ptr addrspace(5) %gep syncscope("somescope") acquire, align 8
  ret i64 %load
}

; Simple memmove to alloca from constant address space argument.
define i8 @memmove_constant_arg_ptr_to_alloca(ptr addrspace(4) noalias readonly align 4 dereferenceable(32) %arg, i32 %idx) {
; CHECK-LABEL: @memmove_constant_arg_ptr_to_alloca(
; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[IDX:%.*]] to i64
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i8], ptr addrspace(4) [[ARG:%.*]], i64 0, i64 [[TMP1]]
; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(4) [[GEP]], align 1
; CHECK-NEXT:    ret i8 [[LOAD]]
;
  %alloca = alloca [32 x i8], align 4, addrspace(5)
  call void @llvm.memmove.p5.p4.i32(ptr addrspace(5) %alloca, ptr addrspace(4) %arg, i32 32, i1 false)
  %gep = getelementptr inbounds [32 x i8], ptr addrspace(5) %alloca, i32 0, i32 %idx
  %load = load i8, ptr addrspace(5) %gep
  ret i8 %load
}

; Simple memcpy to alloca from byref constant address space argument.
define amdgpu_kernel void @memcpy_constant_byref_arg_ptr_to_alloca(ptr addrspace(4) noalias readonly align 4 byref([32 x i8]) %arg, ptr addrspace(1) %out, i32 %idx) {
; CHECK-LABEL: @memcpy_constant_byref_arg_ptr_to_alloca(
; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[IDX:%.*]] to i64
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i8], ptr addrspace(4) [[ARG:%.*]], i64 0, i64 [[TMP1]]
; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(4) [[GEP]], align 1
; CHECK-NEXT:    store i8 [[LOAD]], ptr addrspace(1) [[OUT:%.*]], align 1
; CHECK-NEXT:    ret void
;
  %alloca = alloca [32 x i8], align 4, addrspace(5)
  call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %alloca, ptr addrspace(4) %arg, i64 32, i1 false)
  %gep = getelementptr inbounds [32 x i8], ptr addrspace(5) %alloca, i32 0, i32 %idx
  %load = load i8, ptr addrspace(5) %gep
  store i8 %load, ptr addrspace(1) %out
  ret void
}

; Simple memcpy to alloca from byref constant address space argument, but the
; argument is only dereferenceable for 31 of the alloca's 32 bytes, so the
; copy cannot be folded away.
define amdgpu_kernel void @memcpy_constant_byref_arg_ptr_to_alloca_too_many_bytes(ptr addrspace(4) noalias readonly align 4 byref([31 x i8]) %arg, ptr addrspace(1) %out, i32 %idx) {
; CHECK-LABEL: @memcpy_constant_byref_arg_ptr_to_alloca_too_many_bytes(
; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [32 x i8], align 4, addrspace(5)
; CHECK-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef align 4 dereferenceable(31) [[ALLOCA]], ptr addrspace(4) noundef align 4 dereferenceable(31) [[ARG:%.*]], i64 31, i1 false)
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i8], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[IDX:%.*]]
; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(5) [[GEP]], align 1
; CHECK-NEXT:    store i8 [[LOAD]], ptr addrspace(1) [[OUT:%.*]], align 1
; CHECK-NEXT:    ret void
;
  %alloca = alloca [32 x i8], align 4, addrspace(5)
  call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %alloca, ptr addrspace(4) %arg, i64 31, i1 false)
  %gep = getelementptr inbounds [32 x i8], ptr addrspace(5) %alloca, i32 0, i32 %idx
  %load = load i8, ptr addrspace(5) %gep
  store i8 %load, ptr addrspace(1) %out
  ret void
}

; Simple memcpy to alloca from constant address space intrinsic call.
define amdgpu_kernel void @memcpy_constant_intrinsic_ptr_to_alloca(ptr addrspace(1) %out, i32 %idx) {
; CHECK-LABEL: @memcpy_constant_intrinsic_ptr_to_alloca(
; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [32 x i8], align 4, addrspace(5)
; CHECK-NEXT:    [[KERNARG_SEGMENT_PTR:%.*]] = call align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; CHECK-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef align 4 dereferenceable(32) [[ALLOCA]], ptr addrspace(4) noundef align 16 dereferenceable(32) [[KERNARG_SEGMENT_PTR]], i64 32, i1 false)
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i8], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[IDX:%.*]]
; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(5) [[GEP]], align 1
; CHECK-NEXT:    store i8 [[LOAD]], ptr addrspace(1) [[OUT:%.*]], align 1
; CHECK-NEXT:    ret void
;
  %alloca = alloca [32 x i8], align 4, addrspace(5)
  %kernarg.segment.ptr = call dereferenceable(32) align 16 ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
  call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %alloca, ptr addrspace(4) %kernarg.segment.ptr, i64 32, i1 false)
  %gep = getelementptr inbounds [32 x i8], ptr addrspace(5) %alloca, i32 0, i32 %idx
  %load = load i8, ptr addrspace(5) %gep
  store i8 %load, ptr addrspace(1) %out
  ret void
}

; Alloca is written through a flat pointer.
define i8 @memcpy_constant_arg_ptr_to_alloca_addrspacecast_to_flat(ptr addrspace(4) noalias readonly align 4 dereferenceable(32) %arg, i32 %idx) {
; CHECK-LABEL: @memcpy_constant_arg_ptr_to_alloca_addrspacecast_to_flat(
; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[IDX:%.*]] to i64
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i8], ptr addrspace(4) [[ARG:%.*]], i64 0, i64 [[TMP1]]
; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(4) [[GEP]], align 1
; CHECK-NEXT:    ret i8 [[LOAD]]
;
  %alloca = alloca [32 x i8], align 4, addrspace(5)
  %alloca.cast.asc = addrspacecast ptr addrspace(5) %alloca to ptr
  call void @llvm.memcpy.p0.p4.i64(ptr %alloca.cast.asc, ptr addrspace(4) %arg, i64 31, i1 false)
  %gep = getelementptr inbounds [32 x i8], ptr addrspace(5) %alloca, i32 0, i32 %idx
  %load = load i8, ptr addrspace(5) %gep
  ret i8 %load
}

; Alloca is only addressed through a flat pointer.
define i8 @memcpy_constant_arg_ptr_to_alloca_addrspacecast_to_flat2(ptr addrspace(4) noalias readonly align 4 dereferenceable(32) %arg, i32 %idx) {
; CHECK-LABEL: @memcpy_constant_arg_ptr_to_alloca_addrspacecast_to_flat2(
; CHECK-NEXT:    [[ALLOCA_CAST_ASC:%.*]] = addrspacecast ptr addrspace(4) [[ARG:%.*]] to ptr
; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[IDX:%.*]] to i64
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i8], ptr [[ALLOCA_CAST_ASC]], i64 0, i64 [[TMP1]]
; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr [[GEP]], align 1
; CHECK-NEXT:    ret i8 [[LOAD]]
;
  %alloca = alloca [32 x i8], align 4, addrspace(5)
  %alloca.cast.asc = addrspacecast ptr addrspace(5) %alloca to ptr
  call void @llvm.memcpy.p0.p4.i64(ptr %alloca.cast.asc, ptr addrspace(4) %arg, i64 32, i1 false)
  %gep = getelementptr inbounds [32 x i8], ptr %alloca.cast.asc, i32 0, i32 %idx
  %load = load i8, ptr %gep
  ret i8 %load
}

%struct.ty = type { [4 x i32] }

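; As the names of the following tests suggest, combining the chained memcpys
; through the alloca must not send InstCombine into an infinite loop; the
; expected result here is a single copy directly from the byref argument to
; %scratch.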
define amdgpu_kernel void @byref_infloop(ptr %scratch, ptr addrspace(4) byref(%struct.ty) align 4 %arg) local_unnamed_addr #1 {
; CHECK-LABEL: @byref_infloop(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    call void @llvm.memcpy.p0.p4.i32(ptr noundef nonnull align 4 dereferenceable(16) [[SCRATCH:%.*]], ptr addrspace(4) noundef align 4 dereferenceable(16) [[ARG:%.*]], i32 16, i1 false)
; CHECK-NEXT:    ret void
;
bb:
  %alloca = alloca [4 x i32], align 4, addrspace(5)
  call void @llvm.memcpy.p5.p4.i32(ptr addrspace(5) align 4 %alloca, ptr addrspace(4) align 4 %arg, i32 16, i1 false)
  call void @llvm.memcpy.p0.p5.i32(ptr align 4 %scratch, ptr addrspace(5) align 4 %alloca, i32 16, i1 false)
  ret void
}

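; Same as byref_infloop, with !noalias metadata on the memcpys; the merged
; copy is expected to keep the metadata.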
define amdgpu_kernel void @byref_infloop_metadata(ptr %scratch, ptr addrspace(4) byref(%struct.ty) align 4 %arg) local_unnamed_addr #1 {
; CHECK-LABEL: @byref_infloop_metadata(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    call void @llvm.memcpy.p0.p4.i32(ptr noundef nonnull align 4 dereferenceable(16) [[SCRATCH:%.*]], ptr addrspace(4) noundef align 4 dereferenceable(16) [[ARG:%.*]], i32 16, i1 false), !noalias [[META0]]
; CHECK-NEXT:    ret void
;
bb:
  %alloca = alloca [4 x i32], align 4, addrspace(5)
  call void @llvm.memcpy.p5.p4.i32(ptr addrspace(5) align 4 %alloca, ptr addrspace(4) align 4 %arg, i32 16, i1 false), !noalias !0
  call void @llvm.memcpy.p0.p5.i32(ptr align 4 %scratch, ptr addrspace(5) align 4 %alloca, i32 16, i1 false), !noalias !0
  ret void
}

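; Same pattern, but the alloca is written and read through a flat
; (addrspacecast) pointer; the forwarded copy reads the argument through an
; addrspacecast instead.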
define amdgpu_kernel void @byref_infloop_addrspacecast(ptr %scratch, ptr addrspace(4) byref(%struct.ty) align 4 %arg) local_unnamed_addr #1 {
; CHECK-LABEL: @byref_infloop_addrspacecast(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[ADDRSPACECAST_ALLOCA:%.*]] = addrspacecast ptr addrspace(4) [[ARG:%.*]] to ptr
; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(16) [[SCRATCH:%.*]], ptr noundef nonnull align 4 dereferenceable(16) [[ADDRSPACECAST_ALLOCA]], i64 16, i1 false)
; CHECK-NEXT:    ret void
;
bb:
  %alloca = alloca [4 x i32], align 4, addrspace(5)
  %addrspacecast.alloca = addrspacecast ptr addrspace(5) %alloca to ptr
  call void @llvm.memcpy.p0.p4.i64(ptr nonnull align 4 dereferenceable(16) %addrspacecast.alloca, ptr addrspace(4) align 4 dereferenceable(16) %arg, i64 16, i1 false)
  call void @llvm.memcpy.p0.p0.i64(ptr nonnull align 4 dereferenceable(16) %scratch, ptr nonnull align 4 dereferenceable(16) %addrspacecast.alloca, i64 16, i1 false)
  ret void
}

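; Same pattern using memmove; the two memmoves likewise collapse into a
; single one reading directly from the argument.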
define amdgpu_kernel void @byref_infloop_memmove(ptr %scratch, ptr addrspace(4) byref(%struct.ty) align 4 %arg) local_unnamed_addr #1 {
; CHECK-LABEL: @byref_infloop_memmove(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    call void @llvm.memmove.p0.p4.i32(ptr noundef nonnull align 4 dereferenceable(16) [[SCRATCH:%.*]], ptr addrspace(4) noundef align 4 dereferenceable(16) [[ARG:%.*]], i32 16, i1 false)
; CHECK-NEXT:    ret void
;
bb:
  %alloca = alloca [4 x i32], align 4, addrspace(5)
  call void @llvm.memmove.p5.p4.i32(ptr addrspace(5) align 4 %alloca, ptr addrspace(4) align 4 %arg, i32 16, i1 false)
  call void @llvm.memmove.p0.p5.i32(ptr align 4 %scratch, ptr addrspace(5) align 4 %alloca, i32 16, i1 false)
  ret void
}

declare void @llvm.memcpy.p0.p5.i32(ptr noalias nocapture writeonly, ptr addrspace(5) noalias nocapture readonly, i32, i1 immarg) #0
declare void @llvm.memcpy.p5.p4.i32(ptr addrspace(5) nocapture, ptr addrspace(4) nocapture, i32, i1) #0
declare void @llvm.memcpy.p0.p4.i64(ptr nocapture, ptr addrspace(4) nocapture, i64, i1) #0
declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #0
declare void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) nocapture, ptr addrspace(4) nocapture, i64, i1) #0
declare void @llvm.memmove.p5.p4.i32(ptr addrspace(5) nocapture, ptr addrspace(4) nocapture, i32, i1) #0
declare void @llvm.memmove.p0.p5.i32(ptr nocapture, ptr addrspace(5) nocapture, i32, i1) #0
declare ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() #1

attributes #0 = { argmemonly nounwind willreturn }
attributes #1 = { nounwind readnone speculatable }

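; !1 is a self-referential noalias scope in domain !2; !0 is the scope list
; referenced by the !noalias tests above.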
!0 = !{!1}
!1 = !{!1, !2}
!2 = !{!2}