xref: /llvm-project/llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll (revision 044aba4bd7575f4c6224efe81f754848eb199dd4)
1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt -S -mtriple=amdgcn-amd-amdhsa < %s -amdgpu-promote-kernel-arguments -infer-address-spaces | FileCheck %s
3; RUN: opt -S -mtriple=amdgcn-amd-amdhsa < %s -passes=amdgpu-promote-kernel-arguments,infer-address-spaces | FileCheck %s
4; RUN: opt -S -mtriple=amdgcn-amd-amdhsa < %s -amdgpu-promote-kernel-arguments -infer-address-spaces | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefix=GCN %s
5
6; GCN-LABEL: ptr_nest_3:
7; GCN-COUNT-2: global_load_dwordx2
8; GCN:         global_store_dword
9define amdgpu_kernel void @ptr_nest_3(ptr addrspace(1) nocapture readonly %Arg) {
10; CHECK-LABEL: @ptr_nest_3(
11; CHECK-NEXT:  entry:
12; CHECK-NEXT:    [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
13; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARG:%.*]], i32 [[I]]
14; CHECK-NEXT:    [[P2:%.*]] = load ptr, ptr addrspace(1) [[P1]], align 8, !amdgpu.noclobber [[META0:![0-9]+]]
15; CHECK-NEXT:    [[P2_GLOBAL:%.*]] = addrspacecast ptr [[P2]] to ptr addrspace(1)
16; CHECK-NEXT:    [[P3:%.*]] = load ptr, ptr addrspace(1) [[P2_GLOBAL]], align 8, !amdgpu.noclobber [[META0]]
17; CHECK-NEXT:    [[P3_GLOBAL:%.*]] = addrspacecast ptr [[P3]] to ptr addrspace(1)
18; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[P3_GLOBAL]], align 4
19; CHECK-NEXT:    ret void
20;
; Three-level pointer nest rooted at a global (addrspace(1)) kernel argument.
; Both flat pointers loaded along the chain (%p2, %p3) are expected to be
; promoted to addrspace(1) via addrspacecast, and each load tagged with
; !amdgpu.noclobber since no store can clobber the memory first.
21entry:
22  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
23  %p1 = getelementptr inbounds ptr, ptr addrspace(1) %Arg, i32 %i
24  %p2 = load ptr, ptr addrspace(1) %p1, align 8
25  %p3 = load ptr, ptr %p2, align 8
26  store float 0.000000e+00, ptr %p3, align 4
27  ret void
28}
29
30; GCN-LABEL: ptr_bitcast:
31; GCN: global_load_dwordx2
32; GCN: global_store_dword
33define amdgpu_kernel void @ptr_bitcast(ptr nocapture readonly %Arg) {
34; CHECK-LABEL: @ptr_bitcast(
35; CHECK-NEXT:  entry:
36; CHECK-NEXT:    [[ARG_GLOBAL:%.*]] = addrspacecast ptr [[ARG:%.*]] to ptr addrspace(1)
37; CHECK-NEXT:    [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
38; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARG_GLOBAL]], i32 [[I]]
39; CHECK-NEXT:    [[P2:%.*]] = load ptr, ptr addrspace(1) [[P1]], align 8, !amdgpu.noclobber [[META0]]
40; CHECK-NEXT:    [[P2_GLOBAL:%.*]] = addrspacecast ptr [[P2]] to ptr addrspace(1)
41; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[P2_GLOBAL]], align 4
42; CHECK-NEXT:    ret void
43;
; The kernel argument itself is a flat pointer: the pass is expected to
; promote %Arg to addrspace(1) (kernel args live in global memory), and then
; also promote the pointer loaded through it.
44entry:
45  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
46  %p1 = getelementptr inbounds ptr, ptr %Arg, i32 %i
47  %p2 = load ptr, ptr %p1, align 8
48  store i32 0, ptr %p2, align 4
49  ret void
50}
51
52%struct.S = type { ptr }
53
54; GCN-LABEL: ptr_in_struct:
55; GCN: s_load_dwordx2
56; GCN: global_store_dword
57define amdgpu_kernel void @ptr_in_struct(ptr addrspace(1) nocapture readonly %Arg) {
58; CHECK-LABEL: @ptr_in_struct(
59; CHECK-NEXT:  entry:
60; CHECK-NEXT:    [[P1:%.*]] = load ptr, ptr addrspace(1) [[ARG:%.*]], align 8, !amdgpu.noclobber [[META0]]
61; CHECK-NEXT:    [[P1_GLOBAL:%.*]] = addrspacecast ptr [[P1]] to ptr addrspace(1)
62; CHECK-NEXT:    [[ID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
63; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[P1_GLOBAL]], i32 [[ID]]
64; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[P1_GLOBAL]], align 4
65; CHECK-NEXT:    ret void
66;
; Flat pointer loaded from the first (pointer) field of a struct argument
; (%struct.S); the float store through it is expected to be rewritten to
; addrspace(1) after promotion.
67entry:
68  %p1 = load ptr, ptr addrspace(1) %Arg, align 8
69  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
70  %arrayidx = getelementptr inbounds float, ptr %p1, i32 %id
71  store float 0.000000e+00, ptr %arrayidx, align 4
72  ret void
73}
74
75@LDS = internal unnamed_addr addrspace(3) global [4 x float] undef, align 16
76
77; GCN-LABEL: flat_ptr_arg:
78; GCN-COUNT-2: global_load_dwordx2
79; GCN:         global_load_dwordx4
80; GCN:         global_store_dword
81define amdgpu_kernel void @flat_ptr_arg(ptr nocapture readonly noalias %Arg, ptr nocapture noalias %Out, i32 %X) {
82; CHECK-LABEL: @flat_ptr_arg(
83; CHECK-NEXT:  entry:
84; CHECK-NEXT:    [[OUT_GLOBAL:%.*]] = addrspacecast ptr [[OUT:%.*]] to ptr addrspace(1)
85; CHECK-NEXT:    [[ARG_GLOBAL:%.*]] = addrspacecast ptr [[ARG:%.*]] to ptr addrspace(1)
86; CHECK-NEXT:    [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
87; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I]] to i64
88; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARG_GLOBAL]], i64 [[IDXPROM]]
89; CHECK-NEXT:    [[I1:%.*]] = load ptr, ptr addrspace(1) [[ARRAYIDX10]], align 8, !amdgpu.noclobber [[META0]]
90; CHECK-NEXT:    [[I1_GLOBAL:%.*]] = addrspacecast ptr [[I1]] to ptr addrspace(1)
91; CHECK-NEXT:    [[I2:%.*]] = load float, ptr addrspace(1) [[I1_GLOBAL]], align 4, !amdgpu.noclobber [[META0]]
92; CHECK-NEXT:    [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[X:%.*]]
93; CHECK-NEXT:    store float [[I2]], ptr addrspace(3) [[ARRAYIDX512]], align 4
94; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[I1_GLOBAL]], i64 1
95; CHECK-NEXT:    [[I3:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX3_1]], align 4
96; CHECK-NEXT:    [[ADD_1:%.*]] = add nsw i32 [[X]], 1
97; CHECK-NEXT:    [[ARRAYIDX512_1:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[ADD_1]]
98; CHECK-NEXT:    store float [[I3]], ptr addrspace(3) [[ARRAYIDX512_1]], align 4
99; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[I1_GLOBAL]], i64 2
100; CHECK-NEXT:    [[I4:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX3_2]], align 4
101; CHECK-NEXT:    [[ADD_2:%.*]] = add nsw i32 [[X]], 2
102; CHECK-NEXT:    [[ARRAYIDX512_2:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[ADD_2]]
103; CHECK-NEXT:    store float [[I4]], ptr addrspace(3) [[ARRAYIDX512_2]], align 4
104; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[I1_GLOBAL]], i64 3
105; CHECK-NEXT:    [[I5:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX3_3]], align 4
106; CHECK-NEXT:    [[ADD_3:%.*]] = add nsw i32 [[X]], 3
107; CHECK-NEXT:    [[ARRAYIDX512_3:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[ADD_3]]
108; CHECK-NEXT:    store float [[I5]], ptr addrspace(3) [[ARRAYIDX512_3]], align 4
109; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 [[X]], -1
110; CHECK-NEXT:    [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[SUB]]
111; CHECK-NEXT:    [[I6:%.*]] = load float, ptr addrspace(3) [[ARRAYIDX711]], align 4
112; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[OUT_GLOBAL]], i64 [[IDXPROM]]
113; CHECK-NEXT:    [[I7:%.*]] = load ptr, ptr addrspace(1) [[ARRAYIDX11]], align 8, !amdgpu.noclobber [[META0]]
114; CHECK-NEXT:    [[I7_GLOBAL:%.*]] = addrspacecast ptr [[I7]] to ptr addrspace(1)
115; CHECK-NEXT:    [[IDXPROM8:%.*]] = sext i32 [[X]] to i64
116; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[I7_GLOBAL]], i64 [[IDXPROM8]]
117; CHECK-NEXT:    store float [[I6]], ptr addrspace(1) [[ARRAYIDX9]], align 4
118; CHECK-NEXT:    ret void
119;
; Two flat, noalias pointer kernel arguments (%Arg read-only, %Out written
; through). Both arguments and both loaded pointers (%i1, %i7) are expected
; to be promoted to addrspace(1); the LDS (addrspace(3)) traffic is left
; untouched. The stores here go only to LDS and through %i7, so the pointer
; loads from %Arg/%Out can all carry !amdgpu.noclobber.
120entry:
121  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
122  %idxprom = zext i32 %i to i64
123  %arrayidx10 = getelementptr inbounds ptr, ptr %Arg, i64 %idxprom
124  %i1 = load ptr, ptr %arrayidx10, align 8
125  %i2 = load float, ptr %i1, align 4
126  %arrayidx512 = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 %X
127  store float %i2, ptr addrspace(3) %arrayidx512, align 4
128  %arrayidx3.1 = getelementptr inbounds float, ptr %i1, i64 1
129  %i3 = load float, ptr %arrayidx3.1, align 4
130  %add.1 = add nsw i32 %X, 1
131  %arrayidx512.1 = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 %add.1
132  store float %i3, ptr addrspace(3) %arrayidx512.1, align 4
133  %arrayidx3.2 = getelementptr inbounds float, ptr %i1, i64 2
134  %i4 = load float, ptr %arrayidx3.2, align 4
135  %add.2 = add nsw i32 %X, 2
136  %arrayidx512.2 = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 %add.2
137  store float %i4, ptr addrspace(3) %arrayidx512.2, align 4
138  %arrayidx3.3 = getelementptr inbounds float, ptr %i1, i64 3
139  %i5 = load float, ptr %arrayidx3.3, align 4
140  %add.3 = add nsw i32 %X, 3
141  %arrayidx512.3 = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 %add.3
142  store float %i5, ptr addrspace(3) %arrayidx512.3, align 4
143  %sub = add nsw i32 %X, -1
144  %arrayidx711 = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 %sub
145  %i6 = load float, ptr addrspace(3) %arrayidx711, align 4
146  %arrayidx11 = getelementptr inbounds ptr, ptr %Out, i64 %idxprom
147  %i7 = load ptr, ptr %arrayidx11, align 8
148  %idxprom8 = sext i32 %X to i64
149  %arrayidx9 = getelementptr inbounds float, ptr %i7, i64 %idxprom8
150  store float %i6, ptr %arrayidx9, align 4
151  ret void
152}
153
154; GCN-LABEL: global_ptr_arg:
155; GCN: global_load_dwordx2
156; GCN: global_load_dwordx4
157; GCN: global_store_dword
158define amdgpu_kernel void @global_ptr_arg(ptr addrspace(1) nocapture readonly %Arg, i32 %X) {
159; CHECK-LABEL: @global_ptr_arg(
160; CHECK-NEXT:  entry:
161; CHECK-NEXT:    [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
162; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I]] to i64
163; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARG:%.*]], i64 [[IDXPROM]]
164; CHECK-NEXT:    [[I1:%.*]] = load ptr, ptr addrspace(1) [[ARRAYIDX10]], align 8, !amdgpu.noclobber [[META0]]
165; CHECK-NEXT:    [[I1_GLOBAL:%.*]] = addrspacecast ptr [[I1]] to ptr addrspace(1)
166; CHECK-NEXT:    [[I2:%.*]] = load float, ptr addrspace(1) [[I1_GLOBAL]], align 4, !amdgpu.noclobber [[META0]]
167; CHECK-NEXT:    [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[X:%.*]]
168; CHECK-NEXT:    store float [[I2]], ptr addrspace(3) [[ARRAYIDX512]], align 4
169; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[I1_GLOBAL]], i64 1
170; CHECK-NEXT:    [[I3:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX3_1]], align 4
171; CHECK-NEXT:    [[ADD_1:%.*]] = add nsw i32 [[X]], 1
172; CHECK-NEXT:    [[ARRAYIDX512_1:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[ADD_1]]
173; CHECK-NEXT:    store float [[I3]], ptr addrspace(3) [[ARRAYIDX512_1]], align 4
174; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[I1_GLOBAL]], i64 2
175; CHECK-NEXT:    [[I4:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX3_2]], align 4
176; CHECK-NEXT:    [[ADD_2:%.*]] = add nsw i32 [[X]], 2
177; CHECK-NEXT:    [[ARRAYIDX512_2:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[ADD_2]]
178; CHECK-NEXT:    store float [[I4]], ptr addrspace(3) [[ARRAYIDX512_2]], align 4
179; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[I1_GLOBAL]], i64 3
180; CHECK-NEXT:    [[I5:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX3_3]], align 4
181; CHECK-NEXT:    [[ADD_3:%.*]] = add nsw i32 [[X]], 3
182; CHECK-NEXT:    [[ARRAYIDX512_3:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[ADD_3]]
183; CHECK-NEXT:    store float [[I5]], ptr addrspace(3) [[ARRAYIDX512_3]], align 4
184; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 [[X]], -1
185; CHECK-NEXT:    [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[SUB]]
186; CHECK-NEXT:    [[I6:%.*]] = load float, ptr addrspace(3) [[ARRAYIDX711]], align 4
187; CHECK-NEXT:    [[IDXPROM8:%.*]] = sext i32 [[X]] to i64
188; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[I1_GLOBAL]], i64 [[IDXPROM8]]
189; CHECK-NEXT:    store float [[I6]], ptr addrspace(1) [[ARRAYIDX9]], align 4
190; CHECK-NEXT:    ret void
191;
; Same access pattern as flat_ptr_arg but with a single global (addrspace(1))
; argument: the pointer loaded from it (%i1) is expected to be promoted and
; used in addrspace(1) for all subsequent loads and the final store, with LDS
; accesses left in addrspace(3).
192entry:
193  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
194  %idxprom = zext i32 %i to i64
195  %arrayidx10 = getelementptr inbounds ptr, ptr addrspace(1) %Arg, i64 %idxprom
196  %i1 = load ptr, ptr addrspace(1) %arrayidx10, align 8
197  %i2 = load float, ptr %i1, align 4
198  %arrayidx512 = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 %X
199  store float %i2, ptr addrspace(3) %arrayidx512, align 4
200  %arrayidx3.1 = getelementptr inbounds float, ptr %i1, i64 1
201  %i3 = load float, ptr %arrayidx3.1, align 4
202  %add.1 = add nsw i32 %X, 1
203  %arrayidx512.1 = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 %add.1
204  store float %i3, ptr addrspace(3) %arrayidx512.1, align 4
205  %arrayidx3.2 = getelementptr inbounds float, ptr %i1, i64 2
206  %i4 = load float, ptr %arrayidx3.2, align 4
207  %add.2 = add nsw i32 %X, 2
208  %arrayidx512.2 = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 %add.2
209  store float %i4, ptr addrspace(3) %arrayidx512.2, align 4
210  %arrayidx3.3 = getelementptr inbounds float, ptr %i1, i64 3
211  %i5 = load float, ptr %arrayidx3.3, align 4
212  %add.3 = add nsw i32 %X, 3
213  %arrayidx512.3 = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 %add.3
214  store float %i5, ptr addrspace(3) %arrayidx512.3, align 4
215  %sub = add nsw i32 %X, -1
216  %arrayidx711 = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 %sub
217  %i6 = load float, ptr addrspace(3) %arrayidx711, align 4
218  %idxprom8 = sext i32 %X to i64
219  %arrayidx9 = getelementptr inbounds float, ptr %i1, i64 %idxprom8
220  store float %i6, ptr %arrayidx9, align 4
221  ret void
222}
223
224; GCN-LABEL: global_ptr_arg_clobbered:
225; GCN: global_store_dwordx2
226; GCN: global_load_dwordx2
227; GCN: flat_load_dword
228; GCN: flat_store_dword
229define amdgpu_kernel void @global_ptr_arg_clobbered(ptr addrspace(1) nocapture readonly %Arg, i32 %X) {
230; CHECK-LABEL: @global_ptr_arg_clobbered(
231; CHECK-NEXT:  entry:
232; CHECK-NEXT:    [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
233; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I]] to i64
234; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARG:%.*]], i64 [[IDXPROM]]
235; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARRAYIDX10]], i32 [[X:%.*]]
236; CHECK-NEXT:    store ptr null, ptr addrspace(1) [[ARRAYIDX11]], align 4
237; CHECK-NEXT:    [[I1:%.*]] = load ptr, ptr addrspace(1) [[ARRAYIDX10]], align 8
238; CHECK-NEXT:    [[I2:%.*]] = load float, ptr [[I1]], align 4
239; CHECK-NEXT:    [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[X]]
240; CHECK-NEXT:    store float [[I2]], ptr addrspace(3) [[ARRAYIDX512]], align 4
241; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 [[X]], -1
242; CHECK-NEXT:    [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[SUB]]
243; CHECK-NEXT:    [[I6:%.*]] = load float, ptr addrspace(3) [[ARRAYIDX711]], align 4
244; CHECK-NEXT:    [[IDXPROM8:%.*]] = sext i32 [[X]] to i64
245; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[I1]], i64 [[IDXPROM8]]
246; CHECK-NEXT:    store float [[I6]], ptr [[ARRAYIDX9]], align 4
247; CHECK-NEXT:    ret void
248;
; Negative test: the store of null through %arrayidx11 happens BEFORE the
; pointer load and may alias the loaded slot, so the load gets no
; !amdgpu.noclobber and %i1 must stay a flat pointer (no promotion of the
; loads/stores through it).
249entry:
250  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
251  %idxprom = zext i32 %i to i64
252  %arrayidx10 = getelementptr inbounds ptr, ptr addrspace(1) %Arg, i64 %idxprom
253  %arrayidx11 = getelementptr inbounds ptr, ptr addrspace(1) %arrayidx10, i32 %X
254  store ptr null, ptr addrspace(1) %arrayidx11, align 4
255  %i1 = load ptr, ptr addrspace(1) %arrayidx10, align 8
256  %i2 = load float, ptr %i1, align 4
257  %arrayidx512 = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 %X
258  store float %i2, ptr addrspace(3) %arrayidx512, align 4
259  %sub = add nsw i32 %X, -1
260  %arrayidx711 = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 %sub
261  %i6 = load float, ptr addrspace(3) %arrayidx711, align 4
262  %idxprom8 = sext i32 %X to i64
263  %arrayidx9 = getelementptr inbounds float, ptr %i1, i64 %idxprom8
264  store float %i6, ptr %arrayidx9, align 4
265  ret void
266}
267
268; GCN-LABEL: global_ptr_arg_clobbered_after_load:
269; GCN: global_load_dwordx2
270; GCN: global_store_dwordx2
271; GCN: global_load_dword
272; GCN: global_store_dword
273define amdgpu_kernel void @global_ptr_arg_clobbered_after_load(ptr addrspace(1) nocapture readonly %Arg, i32 %X) {
274; CHECK-LABEL: @global_ptr_arg_clobbered_after_load(
275; CHECK-NEXT:  entry:
276; CHECK-NEXT:    [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
277; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I]] to i64
278; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARG:%.*]], i64 [[IDXPROM]]
279; CHECK-NEXT:    [[I1:%.*]] = load ptr, ptr addrspace(1) [[ARRAYIDX10]], align 8, !amdgpu.noclobber [[META0]]
280; CHECK-NEXT:    [[I1_GLOBAL:%.*]] = addrspacecast ptr [[I1]] to ptr addrspace(1)
281; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARRAYIDX10]], i32 [[X:%.*]]
282; CHECK-NEXT:    store ptr null, ptr addrspace(1) [[ARRAYIDX11]], align 4
283; CHECK-NEXT:    [[I2:%.*]] = load float, ptr addrspace(1) [[I1_GLOBAL]], align 4
284; CHECK-NEXT:    [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[X]]
285; CHECK-NEXT:    store float [[I2]], ptr addrspace(3) [[ARRAYIDX512]], align 4
286; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 [[X]], -1
287; CHECK-NEXT:    [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[SUB]]
288; CHECK-NEXT:    [[I6:%.*]] = load float, ptr addrspace(3) [[ARRAYIDX711]], align 4
289; CHECK-NEXT:    [[IDXPROM8:%.*]] = sext i32 [[X]] to i64
290; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[I1_GLOBAL]], i64 [[IDXPROM8]]
291; CHECK-NEXT:    store float [[I6]], ptr addrspace(1) [[ARRAYIDX9]], align 4
292; CHECK-NEXT:    ret void
293;
; Counterpart to global_ptr_arg_clobbered: here the potentially-aliasing
; store of null comes AFTER the pointer load, so the load keeps
; !amdgpu.noclobber and %i1 is still promoted to addrspace(1).
294entry:
295  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
296  %idxprom = zext i32 %i to i64
297  %arrayidx10 = getelementptr inbounds ptr, ptr addrspace(1) %Arg, i64 %idxprom
298  %i1 = load ptr, ptr addrspace(1) %arrayidx10, align 8
299  %arrayidx11 = getelementptr inbounds ptr, ptr addrspace(1) %arrayidx10, i32 %X
300  store ptr null, ptr addrspace(1) %arrayidx11, align 4
301  %i2 = load float, ptr %i1, align 4
302  %arrayidx512 = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 %X
303  store float %i2, ptr addrspace(3) %arrayidx512, align 4
304  %sub = add nsw i32 %X, -1
305  %arrayidx711 = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 %sub
306  %i6 = load float, ptr addrspace(3) %arrayidx711, align 4
307  %idxprom8 = sext i32 %X to i64
308  %arrayidx9 = getelementptr inbounds float, ptr %i1, i64 %idxprom8
309  store float %i6, ptr %arrayidx9, align 4
310  ret void
311}
312
313; GCN-LABEL: ptr_nest_3_barrier:
314; GCN-COUNT-2: global_load_dwordx2
315; GCN:         global_store_dword
316define amdgpu_kernel void @ptr_nest_3_barrier(ptr addrspace(1) nocapture readonly %Arg) {
317; CHECK-LABEL: @ptr_nest_3_barrier(
318; CHECK-NEXT:  entry:
319; CHECK-NEXT:    [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
320; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARG:%.*]], i32 [[I]]
321; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
322; CHECK-NEXT:    [[P2:%.*]] = load ptr, ptr addrspace(1) [[P1]], align 8, !amdgpu.noclobber [[META0]]
323; CHECK-NEXT:    [[P2_GLOBAL:%.*]] = addrspacecast ptr [[P2]] to ptr addrspace(1)
324; CHECK-NEXT:    [[P3:%.*]] = load ptr, ptr addrspace(1) [[P2_GLOBAL]], align 8, !amdgpu.noclobber [[META0]]
325; CHECK-NEXT:    [[P3_GLOBAL:%.*]] = addrspacecast ptr [[P3]] to ptr addrspace(1)
326; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[P3_GLOBAL]], align 4
327; CHECK-NEXT:    ret void
328;
; Same as ptr_nest_3, but with an s.barrier call between the GEP and the
; first pointer load. The barrier does not store to memory, so it must not
; prevent promotion or the !amdgpu.noclobber tagging of the loads.
329entry:
330  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
331  %p1 = getelementptr inbounds ptr, ptr addrspace(1) %Arg, i32 %i
332  tail call void @llvm.amdgcn.s.barrier()
333  %p2 = load ptr, ptr addrspace(1) %p1, align 8
334  %p3 = load ptr, ptr %p2, align 8
335  store float 0.000000e+00, ptr %p3, align 4
336  ret void
337}
338
339; GCN-LABEL: flat_ptr_nest_2:
340; GCN: s_lshl_b64
341; GCN: s_load_dwordx2
342; GCN: global_store_dword
343define amdgpu_kernel void @flat_ptr_nest_2(ptr nocapture readonly %Arg, i32 %i) {
344; CHECK-LABEL: @flat_ptr_nest_2(
345; CHECK-NEXT:  entry:
346; CHECK-NEXT:    [[ARG_GLOBAL:%.*]] = addrspacecast ptr [[ARG:%.*]] to ptr addrspace(1)
347; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARG_GLOBAL]], i32 [[I:%.*]]
348; CHECK-NEXT:    [[P2:%.*]] = load ptr, ptr addrspace(1) [[P1]], align 8, !amdgpu.noclobber [[META0]]
349; CHECK-NEXT:    [[P2_GLOBAL:%.*]] = addrspacecast ptr [[P2]] to ptr addrspace(1)
350; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[P2_GLOBAL]], align 4
351; CHECK-NEXT:    ret void
352;
; Two-level nest starting from a flat pointer argument: both the argument
; and the pointer loaded through it are expected to be promoted to
; addrspace(1).
353entry:
354  %p1 = getelementptr inbounds ptr, ptr %Arg, i32 %i
355  %p2 = load ptr, ptr %p1, align 8
356  store float 0.000000e+00, ptr %p2, align 4
357  ret void
358}
359
360; GCN-LABEL: const_ptr_nest_3:
361; GCN: s_lshl_b64
362; GCN: s_load_dwordx2
363; GCN: s_load_dwordx2
364; GCN: global_store_dword
365define amdgpu_kernel void @const_ptr_nest_3(ptr addrspace(4) nocapture readonly %Arg, i32 %i) {
366; CHECK-LABEL: @const_ptr_nest_3(
367; CHECK-NEXT:  entry:
368; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[ARG:%.*]], i32 [[I:%.*]]
369; CHECK-NEXT:    [[P2:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[P1]], align 8, !amdgpu.noclobber [[META0]]
370; CHECK-NEXT:    [[P3:%.*]] = load ptr, ptr addrspace(4) [[P2]], align 8, !amdgpu.noclobber [[META0]]
371; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr [[P3]] to ptr addrspace(1)
372; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[TMP0]], align 4
373; CHECK-NEXT:    ret void
374;
; Pointer nest living entirely in constant memory (addrspace(4)); only the
; innermost flat pointer %p3 needs promotion, via addrspacecast to
; addrspace(1) for the final store.
375entry:
376  %p1 = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) %Arg, i32 %i
377  %p2 = load ptr addrspace(4), ptr addrspace(4) %p1, align 8
378  %p3 = load ptr, ptr addrspace(4) %p2, align 8
379  store float 0.000000e+00, ptr %p3, align 4
380  ret void
381}
382
383; GCN-LABEL: cast_from_const_const_ptr_nest_3:
384; GCN: s_lshl_b64
385; GCN: s_load_dwordx2
386; GCN: s_load_dwordx2
387; GCN: global_store_dword
388define amdgpu_kernel void @cast_from_const_const_ptr_nest_3(ptr addrspace(4) nocapture readonly %Arg, i32 %i) {
389; CHECK-LABEL: @cast_from_const_const_ptr_nest_3(
390; CHECK-NEXT:  entry:
391; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[ARG:%.*]], i32 [[I:%.*]]
392; CHECK-NEXT:    [[P2:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[P1]], align 8, !amdgpu.noclobber [[META0]]
393; CHECK-NEXT:    [[P3:%.*]] = load ptr, ptr addrspace(4) [[P2]], align 8, !amdgpu.noclobber [[META0]]
394; CHECK-NEXT:    [[P3_GLOBAL:%.*]] = addrspacecast ptr [[P3]] to ptr addrspace(1)
395; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[P3_GLOBAL]], align 4
396; CHECK-NEXT:    ret void
397;
; Like const_ptr_nest_3, but the input IR casts the constant pointers to
; flat before each load; the expected output folds those addrspacecasts
; away so the loads go straight through addrspace(4), and promotes %p3.
398entry:
399  %p1 = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) %Arg, i32 %i
400  %a1 = addrspacecast ptr addrspace(4) %p1 to ptr
401  %p2 = load ptr addrspace(4), ptr %a1, align 8
402  %a2 = addrspacecast ptr addrspace(4) %p2 to ptr
403  %p3 = load ptr, ptr %a2, align 8
404  store float 0.000000e+00, ptr %p3, align 4
405  ret void
406}
407
408; GCN-LABEL: flat_ptr_volatile_load:
409; GCN: s_lshl_b64
410; GCN: flat_load_dwordx2
411; GCN: global_store_dword
412define amdgpu_kernel void @flat_ptr_volatile_load(ptr nocapture readonly %Arg, i32 %i) {
413; CHECK-LABEL: @flat_ptr_volatile_load(
414; CHECK-NEXT:  entry:
415; CHECK-NEXT:    [[ARG_GLOBAL:%.*]] = addrspacecast ptr [[ARG:%.*]] to ptr addrspace(1)
416; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARG_GLOBAL]], i32 [[I:%.*]]
417; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[P1]] to ptr
418; CHECK-NEXT:    [[P2:%.*]] = load volatile ptr, ptr [[TMP0]], align 8
419; CHECK-NEXT:    [[P2_GLOBAL:%.*]] = addrspacecast ptr [[P2]] to ptr addrspace(1)
420; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[P2_GLOBAL]], align 4
421; CHECK-NEXT:    ret void
422;
; The volatile load itself is expected to remain on a flat pointer (note the
; cast back to flat in the CHECK lines) and to carry no !amdgpu.noclobber;
; the pointer VALUE it produces is still promoted to addrspace(1).
423entry:
424  %p1 = getelementptr inbounds ptr, ptr %Arg, i32 %i
425  %p2 = load volatile ptr, ptr %p1, align 8
426  store float 0.000000e+00, ptr %p2, align 4
427  ret void
428}
429
430; GCN-LABEL: flat_ptr_atomic_load:
431; GCN: s_lshl_b64
432; GCN: global_load_dwordx2
433; GCN: global_store_dword
434define amdgpu_kernel void @flat_ptr_atomic_load(ptr nocapture readonly %Arg, i32 %i) {
435; CHECK-LABEL: @flat_ptr_atomic_load(
436; CHECK-NEXT:  entry:
437; CHECK-NEXT:    [[ARG_GLOBAL:%.*]] = addrspacecast ptr [[ARG:%.*]] to ptr addrspace(1)
438; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARG_GLOBAL]], i32 [[I:%.*]]
439; CHECK-NEXT:    [[P2:%.*]] = load atomic ptr, ptr addrspace(1) [[P1]] monotonic, align 8
440; CHECK-NEXT:    [[P2_GLOBAL:%.*]] = addrspacecast ptr [[P2]] to ptr addrspace(1)
441; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[P2_GLOBAL]], align 4
442; CHECK-NEXT:    ret void
443;
; An atomic (monotonic) load: per the CHECK lines it is rewritten to load
; through addrspace(1) but gets no !amdgpu.noclobber metadata, and the
; loaded pointer is still promoted.
444entry:
445  %p1 = getelementptr inbounds ptr, ptr %Arg, i32 %i
446  %p2 = load atomic ptr, ptr %p1 monotonic, align 8
447  store float 0.000000e+00, ptr %p2, align 4
448  ret void
449}
450
451; GCN-LABEL: cast_changing_pointee_type:
452; GCN: s_lshl_b64
453; GCN: s_load_dwordx2
454; GCN: s_load_dwordx2
455; GCN: global_store_dword
456define amdgpu_kernel void @cast_changing_pointee_type(ptr addrspace(1) nocapture readonly %Arg, i32 %i) {
457; CHECK-LABEL: @cast_changing_pointee_type(
458; CHECK-NEXT:  entry:
459; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[ARG:%.*]], i32 [[I:%.*]]
460; CHECK-NEXT:    [[P2:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[P1]], align 8, !amdgpu.noclobber [[META0]]
461; CHECK-NEXT:    [[P3:%.*]] = load ptr, ptr addrspace(1) [[P2]], align 8, !amdgpu.noclobber [[META0]]
462; CHECK-NEXT:    [[P3_GLOBAL:%.*]] = addrspacecast ptr [[P3]] to ptr addrspace(1)
463; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[P3_GLOBAL]], align 4
464; CHECK-NEXT:    ret void
465;
; The input casts addrspace(1) pointers to flat before loading; the expected
; output folds those addrspacecasts away so both loads go directly through
; addrspace(1) with !amdgpu.noclobber, and promotes the final flat %p3.
466entry:
467  %p1 = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) %Arg, i32 %i
468  %a1 = addrspacecast ptr addrspace(1) %p1 to ptr
469  %p2 = load ptr addrspace(1), ptr %a1, align 8
470  %a2 = addrspacecast ptr addrspace(1) %p2 to ptr
471  %p3 = load ptr, ptr %a2, align 8
472  store float 0.000000e+00, ptr %p3, align 4
473  ret void
474}
475
476declare i32 @llvm.amdgcn.workitem.id.x()
477declare void @llvm.amdgcn.s.barrier()
478