; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -mtriple=amdgcn -mcpu=gfx900 -amdgpu-aa -amdgpu-aa-wrapper -amdgpu-annotate-uniform -S < %s | FileCheck %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefix=GCN %s

; Check that a barrier or fence between loads is not considered a clobber
; for the purpose of converting vector loads into scalar loads.

@LDS = linkonce_odr hidden local_unnamed_addr addrspace(3) global i32 undef

; GCN-LABEL: {{^}}simple_barrier:
; GCN: s_load_dword s
; GCN: s_waitcnt lgkmcnt(0)
; GCN: s_barrier
; GCN: s_waitcnt lgkmcnt(0)
; GCN: ; wave barrier
; GCN-NOT: global_load_dword
; GCN: s_load_dword s
; GCN-NOT: global_load_dword
; GCN: global_store_dword
define amdgpu_kernel void @simple_barrier(ptr addrspace(1) %arg) {
; CHECK-LABEL: @simple_barrier(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    fence syncscope("workgroup") release
; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT:    fence syncscope("workgroup") acquire
; CHECK-NEXT:    tail call void @llvm.amdgcn.wave.barrier()
; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform !0
; CHECK-NEXT:    [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    [[I3:%.*]] = add i32 [[I2]], [[I]]
; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2
; CHECK-NEXT:    store i32 [[I3]], ptr addrspace(1) [[I4]], align 4
; CHECK-NEXT:    ret void
;
bb:
  %i = load i32, ptr addrspace(1) %arg, align 4
  fence syncscope("workgroup") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("workgroup") acquire
  tail call void @llvm.amdgcn.wave.barrier()
  %i1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
  %i2 = load i32, ptr addrspace(1) %i1, align 4
  %i3 = add i32 %i2, %i
  %i4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 2
  store i32 %i3, ptr addrspace(1) %i4, align 4
  ret void
}

; GCN-LABEL: {{^}}memory_phi_no_clobber:
; GCN: s_load_dword s
; GCN: s_waitcnt lgkmcnt(0)
; GCN: s_waitcnt lgkmcnt(0)
; GCN: s_barrier
; GCN-NOT: global_load_dword
; GCN: s_load_dword s
; GCN-NOT: global_load_dword
; GCN: global_store_dword
define amdgpu_kernel void @memory_phi_no_clobber(ptr addrspace(1) %arg) {
; CHECK-LABEL: @memory_phi_no_clobber(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform !0
; CHECK:       if.then:
; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT:    br label [[IF_END:%.*]], !amdgpu.uniform !0
; CHECK:       if.else:
; CHECK-NEXT:    fence syncscope("workgroup") release
; CHECK-NEXT:    br label [[IF_END]], !amdgpu.uniform !0
; CHECK:       if.end:
; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform !0
; CHECK-NEXT:    [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    [[I3:%.*]] = add i32 [[I2]], [[I]]
; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2
; CHECK-NEXT:    store i32 [[I3]], ptr addrspace(1) [[I4]], align 4
; CHECK-NEXT:    ret void
;
bb:
  %i = load i32, ptr addrspace(1) %arg, align 4
  br i1 undef, label %if.then, label %if.else

if.then:
  tail call void @llvm.amdgcn.s.barrier()
  br label %if.end

if.else:
  fence syncscope("workgroup") release
  br label %if.end

if.end:
  %i1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
  %i2 = load i32, ptr addrspace(1) %i1, align 4
  %i3 = add i32 %i2, %i
  %i4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 2
  store i32 %i3, ptr addrspace(1) %i4, align 4
  ret void
}

; GCN-LABEL: {{^}}memory_phi_clobber1:
; GCN: s_load_dword s
; GCN: s_barrier
; GCN: global_store_dword
; GCN: global_load_dword
; GCN: global_store_dword
define amdgpu_kernel void @memory_phi_clobber1(ptr addrspace(1) %arg) {
; CHECK-LABEL: @memory_phi_clobber1(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform !0
; CHECK:       if.then:
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 3
; CHECK-NEXT:    store i32 1, ptr addrspace(1) [[GEP]], align 4
; CHECK-NEXT:    br label [[IF_END:%.*]], !amdgpu.uniform !0
; CHECK:       if.else:
; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT:    br label [[IF_END]], !amdgpu.uniform !0
; CHECK:       if.end:
; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform !0
; CHECK-NEXT:    [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4
; CHECK-NEXT:    [[I3:%.*]] = add i32 [[I2]], [[I]]
; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2
; CHECK-NEXT:    store i32 [[I3]], ptr addrspace(1) [[I4]], align 4
; CHECK-NEXT:    ret void
;
bb:
  %i = load i32, ptr addrspace(1) %arg, align 4
  br i1 undef, label %if.then, label %if.else

if.then:
  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 3
  store i32 1, ptr addrspace(1) %gep, align 4
  br label %if.end

if.else:
  tail call void @llvm.amdgcn.s.barrier()
  br label %if.end

if.end:
  %i1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
  %i2 = load i32, ptr addrspace(1) %i1, align 4
  %i3 = add i32 %i2, %i
  %i4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 2
  store i32 %i3, ptr addrspace(1) %i4, align 4
  ret void
}

; GCN-LABEL: {{^}}memory_phi_clobber2:
; GCN-DAG: s_load_dword s
; GCN-DAG: global_store_dword
; GCN: s_barrier
; GCN: global_load_dword
; GCN: global_store_dword
define amdgpu_kernel void @memory_phi_clobber2(ptr addrspace(1) %arg) {
; CHECK-LABEL: @memory_phi_clobber2(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform !0
; CHECK:       if.then:
; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT:    br label [[IF_END:%.*]], !amdgpu.uniform !0
; CHECK:       if.else:
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 3
; CHECK-NEXT:    store i32 1, ptr addrspace(1) [[GEP]], align 4
; CHECK-NEXT:    br label [[IF_END]], !amdgpu.uniform !0
; CHECK:       if.end:
; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform !0
; CHECK-NEXT:    [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4
; CHECK-NEXT:    [[I3:%.*]] = add i32 [[I2]], [[I]]
; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2
; CHECK-NEXT:    store i32 [[I3]], ptr addrspace(1) [[I4]], align 4
; CHECK-NEXT:    ret void
;
bb:
  %i = load i32, ptr addrspace(1) %arg, align 4
  br i1 undef, label %if.then, label %if.else

if.then:
  tail call void @llvm.amdgcn.s.barrier()
  br label %if.end

if.else:
  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 3
  store i32 1, ptr addrspace(1) %gep, align 4
  br label %if.end

if.end:
  %i1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
  %i2 = load i32, ptr addrspace(1) %i1, align 4
  %i3 = add i32 %i2, %i
  %i4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 2
  store i32 %i3, ptr addrspace(1) %i4, align 4
  ret void
}

; GCN-LABEL: {{^}}no_clobbering_loop1:
; GCN: s_load_dword s
; GCN: s_load_dword s
; GCN-NOT: global_load_dword
; GCN: global_store_dword
define amdgpu_kernel void @no_clobbering_loop1(ptr addrspace(1) %arg, i1 %cc) {
; CHECK-LABEL: @no_clobbering_loop1(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    br label [[WHILE_COND:%.*]], !amdgpu.uniform !0
; CHECK:       while.cond:
; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform !0
; CHECK-NEXT:    [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    [[I3:%.*]] = add i32 [[I2]], [[I]]
; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2
; CHECK-NEXT:    store i32 [[I3]], ptr addrspace(1) [[I4]], align 4
; CHECK-NEXT:    tail call void @llvm.amdgcn.wave.barrier()
; CHECK-NEXT:    br i1 [[CC:%.*]], label [[WHILE_COND]], label [[END:%.*]], !amdgpu.uniform !0
; CHECK:       end:
; CHECK-NEXT:    ret void
;
bb:
  %i = load i32, ptr addrspace(1) %arg, align 4
  br label %while.cond

while.cond:
  %i1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
  %i2 = load i32, ptr addrspace(1) %i1, align 4
  %i3 = add i32 %i2, %i
  %i4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 2
  store i32 %i3, ptr addrspace(1) %i4, align 4
  tail call void @llvm.amdgcn.wave.barrier()
  br i1 %cc, label %while.cond, label %end

end:
  ret void
}

; GCN-LABEL: {{^}}no_clobbering_loop2:
; GCN: s_load_dword s
; GCN: s_load_dword s
; GCN-NOT: global_load_dword
; GCN: global_store_dword
define amdgpu_kernel void @no_clobbering_loop2(ptr addrspace(1) noalias %arg, ptr addrspace(1) noalias %out, i32 %n) {
; CHECK-LABEL: @no_clobbering_loop2(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    br label [[WHILE_COND:%.*]], !amdgpu.uniform !0
; CHECK:       while.cond:
; CHECK-NEXT:    [[C:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ [[INC:%.*]], [[WHILE_COND]] ]
; CHECK-NEXT:    [[ACC:%.*]] = phi i32 [ [[I]], [[BB]] ], [ [[I3:%.*]], [[WHILE_COND]] ]
; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i32 [[C]], !amdgpu.uniform !0
; CHECK-NEXT:    [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    [[I3]] = add i32 [[I2]], [[ACC]]
; CHECK-NEXT:    tail call void @llvm.amdgcn.wave.barrier()
; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[C]], 1
; CHECK-NEXT:    [[CC:%.*]] = icmp eq i32 [[INC]], [[N:%.*]]
; CHECK-NEXT:    br i1 [[CC]], label [[WHILE_COND]], label [[END:%.*]], !amdgpu.uniform !0
; CHECK:       end:
; CHECK-NEXT:    store i32 [[I3]], ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
bb:
  %i = load i32, ptr addrspace(1) %arg, align 4
  br label %while.cond

while.cond:
  %c = phi i32 [ 0, %bb ], [ %inc, %while.cond ]
  %acc = phi i32 [ %i, %bb ], [ %i3, %while.cond ]
  %i1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %c
  %i2 = load i32, ptr addrspace(1) %i1, align 4
  %i3 = add i32 %i2, %acc
  tail call void @llvm.amdgcn.wave.barrier()
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, %n
  br i1 %cc, label %while.cond, label %end

end:
  store i32 %i3, ptr addrspace(1) %out, align 4
  ret void
}

; GCN-LABEL: {{^}}clobbering_loop:
; GCN: s_load_dword s
; GCN: global_load_dword
; GCN: global_store_dword
define amdgpu_kernel void @clobbering_loop(ptr addrspace(1) %arg, ptr addrspace(1) %out, i1 %cc) {
; CHECK-LABEL: @clobbering_loop(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    br label [[WHILE_COND:%.*]], !amdgpu.uniform !0
; CHECK:       while.cond:
; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform !0
; CHECK-NEXT:    [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4
; CHECK-NEXT:    [[I3:%.*]] = add i32 [[I2]], [[I]]
; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[OUT:%.*]], i64 1
; CHECK-NEXT:    store i32 [[I3]], ptr addrspace(1) [[I4]], align 4
; CHECK-NEXT:    tail call void @llvm.amdgcn.wave.barrier()
; CHECK-NEXT:    br i1 [[CC:%.*]], label [[WHILE_COND]], label [[END:%.*]], !amdgpu.uniform !0
; CHECK:       end:
; CHECK-NEXT:    ret void
;
bb:
  %i = load i32, ptr addrspace(1) %arg, align 4
  br label %while.cond

while.cond:
  %i1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
  %i2 = load i32, ptr addrspace(1) %i1, align 4
  %i3 = add i32 %i2, %i
  %i4 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1
  store i32 %i3, ptr addrspace(1) %i4, align 4
  tail call void @llvm.amdgcn.wave.barrier()
  br i1 %cc, label %while.cond, label %end

end:
  ret void
}

; GCN-LABEL: {{^}}clobber_by_atomic_load:
; GCN: s_load_dword s
; GCN: global_load_dword {{.*}} glc
; GCN: global_load_dword
; GCN: global_store_dword
define amdgpu_kernel void @clobber_by_atomic_load(ptr addrspace(1) %arg) {
; CHECK-LABEL: @clobber_by_atomic_load(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2, !amdgpu.uniform !0
; CHECK-NEXT:    [[VAL:%.*]] = load atomic i32, ptr addrspace(1) [[GEP]] seq_cst, align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 3, !amdgpu.uniform !0
; CHECK-NEXT:    [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4
; CHECK-NEXT:    [[I3:%.*]] = add i32 [[I2]], [[I]]
; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 4
; CHECK-NEXT:    store i32 [[I3]], ptr addrspace(1) [[I4]], align 4
; CHECK-NEXT:    ret void
;
bb:
  %i = load i32, ptr addrspace(1) %arg, align 4
  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 2
  %val = load atomic i32, ptr addrspace(1) %gep seq_cst, align 4
  %i1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 3
  %i2 = load i32, ptr addrspace(1) %i1, align 4
  %i3 = add i32 %i2, %i
  %i4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 4
  store i32 %i3, ptr addrspace(1) %i4, align 4
  ret void
}

; GCN-LABEL: {{^}}no_alias_store:
; GCN: ds_write_b32
; GCN: s_barrier
; GCN: s_load_dword s
; GCN-NOT: global_load_dword
; GCN: global_store_dword
define protected amdgpu_kernel void @no_alias_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; CHECK-LABEL: @no_alias_store(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    store i32 0, ptr addrspace(3) @LDS, align 4
; CHECK-NEXT:    fence syncscope("workgroup") release
; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT:    fence syncscope("workgroup") acquire
; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    store i32 [[LD]], ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
entry:
  store i32 0, ptr addrspace(3) @LDS, align 4
  fence syncscope("workgroup") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("workgroup") acquire
  %ld = load i32, ptr addrspace(1) %in, align 4
  store i32 %ld, ptr addrspace(1) %out, align 4
  ret void
}

; GCN-LABEL: {{^}}may_alias_store:
; GCN: global_store_dword
; GCN: s_barrier
; GCN: global_load_dword
; GCN: global_store_dword
define protected amdgpu_kernel void @may_alias_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; CHECK-LABEL: @may_alias_store(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    fence syncscope("workgroup") release
; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT:    fence syncscope("workgroup") acquire
; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4
; CHECK-NEXT:    store i32 [[LD]], ptr addrspace(1) [[OUT]], align 4
; CHECK-NEXT:    ret void
;
entry:
  store i32 0, ptr addrspace(1) %out, align 4
  fence syncscope("workgroup") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("workgroup") acquire
  %ld = load i32, ptr addrspace(1) %in, align 4
  store i32 %ld, ptr addrspace(1) %out, align 4
  ret void
}

; GCN-LABEL: {{^}}no_alias_volatile_store:
; GCN: ds_write_b32
; GCN: s_barrier
; GCN: s_load_dword s
; GCN-NOT: global_load_dword
; GCN: global_store_dword
define protected amdgpu_kernel void @no_alias_volatile_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; CHECK-LABEL: @no_alias_volatile_store(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    store volatile i32 0, ptr addrspace(3) @LDS, align 4
; CHECK-NEXT:    fence syncscope("workgroup") release
; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT:    fence syncscope("workgroup") acquire
; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    store i32 [[LD]], ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
entry:
  store volatile i32 0, ptr addrspace(3) @LDS, align 4
  fence syncscope("workgroup") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("workgroup") acquire
  %ld = load i32, ptr addrspace(1) %in, align 4
  store i32 %ld, ptr addrspace(1) %out, align 4
  ret void
}

; GCN-LABEL: {{^}}no_alias_atomic_rmw_relaxed:
; GCN: ds_add_u32
; GCN: s_load_dword s
; GCN-NOT: global_load_dword
; GCN: global_store_dword
define protected amdgpu_kernel void @no_alias_atomic_rmw_relaxed(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; CHECK-LABEL: @no_alias_atomic_rmw_relaxed(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[UNUSED:%.*]] = atomicrmw add ptr addrspace(3) @LDS, i32 5 monotonic, align 4
; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    store i32 [[LD]], ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %unused = atomicrmw add ptr addrspace(3) @LDS, i32 5 monotonic
  %ld = load i32, ptr addrspace(1) %in, align 4
  store i32 %ld, ptr addrspace(1) %out, align 4
  ret void
}

; GCN-LABEL: {{^}}no_alias_atomic_cmpxchg:
; GCN: ds_cmpst_b32
; GCN: s_load_dword s
; GCN-NOT: global_load_dword
; GCN: global_store_dword
define protected amdgpu_kernel void @no_alias_atomic_cmpxchg(ptr addrspace(1) %in, ptr addrspace(1) %out, i32 %swap) {
; CHECK-LABEL: @no_alias_atomic_cmpxchg(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[UNUSED:%.*]] = cmpxchg ptr addrspace(3) @LDS, i32 7, i32 [[SWAP:%.*]] seq_cst monotonic, align 4
; CHECK-NEXT:    fence syncscope("workgroup") release
; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT:    fence syncscope("workgroup") acquire
; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    store i32 [[LD]], ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %unused = cmpxchg ptr addrspace(3) @LDS, i32 7, i32 %swap seq_cst monotonic
  fence syncscope("workgroup") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("workgroup") acquire
  %ld = load i32, ptr addrspace(1) %in, align 4
  store i32 %ld, ptr addrspace(1) %out, align 4
  ret void
}

; GCN-LABEL: {{^}}no_alias_atomic_rmw:
; GCN: ds_add_u32
; GCN: s_load_dword s
; GCN-NOT: global_load_dword
; GCN: global_store_dword
define protected amdgpu_kernel void @no_alias_atomic_rmw(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; CHECK-LABEL: @no_alias_atomic_rmw(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[UNUSED:%.*]] = atomicrmw add ptr addrspace(3) @LDS, i32 5 seq_cst, align 4
; CHECK-NEXT:    fence syncscope("workgroup") release
; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT:    fence syncscope("workgroup") acquire
; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    store i32 [[LD]], ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %unused = atomicrmw add ptr addrspace(3) @LDS, i32 5 seq_cst
  fence syncscope("workgroup") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("workgroup") acquire
  %ld = load i32, ptr addrspace(1) %in, align 4
  store i32 %ld, ptr addrspace(1) %out, align 4
  ret void
}

; GCN-LABEL: {{^}}may_alias_atomic_cmpxchg:
; GCN: global_atomic_cmpswap
; GCN: global_load_dword
; GCN: global_store_dword
define protected amdgpu_kernel void @may_alias_atomic_cmpxchg(ptr addrspace(1) %in, ptr addrspace(1) %out, i32 %swap) {
; CHECK-LABEL: @may_alias_atomic_cmpxchg(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[UNUSED:%.*]] = cmpxchg ptr addrspace(1) [[OUT:%.*]], i32 7, i32 [[SWAP:%.*]] seq_cst monotonic, align 4
; CHECK-NEXT:    fence syncscope("workgroup") release
; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT:    fence syncscope("workgroup") acquire
; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4
; CHECK-NEXT:    store i32 [[LD]], ptr addrspace(1) [[OUT]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %unused = cmpxchg ptr addrspace(1) %out, i32 7, i32 %swap seq_cst monotonic
  fence syncscope("workgroup") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("workgroup") acquire
  %ld = load i32, ptr addrspace(1) %in, align 4
  store i32 %ld, ptr addrspace(1) %out, align 4
  ret void
}

; GCN-LABEL: {{^}}may_alias_atomic_rmw:
; GCN: global_atomic_add
; GCN: global_load_dword
; GCN: global_store_dword
define protected amdgpu_kernel void @may_alias_atomic_rmw(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; CHECK-LABEL: @may_alias_atomic_rmw(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[UNUSED:%.*]] = atomicrmw add ptr addrspace(1) [[OUT:%.*]], i32 5 syncscope("agent") seq_cst, align 4
; CHECK-NEXT:    fence syncscope("workgroup") release
; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT:    fence syncscope("workgroup") acquire
; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4
; CHECK-NEXT:    store i32 [[LD]], ptr addrspace(1) [[OUT]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %unused = atomicrmw add ptr addrspace(1) %out, i32 5 syncscope("agent") seq_cst
  fence syncscope("workgroup") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("workgroup") acquire
  %ld = load i32, ptr addrspace(1) %in, align 4
  store i32 %ld, ptr addrspace(1) %out, align 4
  ret void
}

; GCN-LABEL: {{^}}no_alias_atomic_rmw_then_clobber:
; GCN: global_store_dword
; GCN: global_store_dword
; GCN: ds_add_u32
; GCN: global_load_dword
; GCN: global_store_dword
define protected amdgpu_kernel void @no_alias_atomic_rmw_then_clobber(ptr addrspace(1) %in, ptr addrspace(1) %out, ptr addrspace(1) noalias %noalias) {
; CHECK-LABEL: @no_alias_atomic_rmw_then_clobber(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    store i32 1, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    store i32 2, ptr addrspace(1) [[NOALIAS:%.*]], align 4
; CHECK-NEXT:    [[UNUSED:%.*]] = atomicrmw add ptr addrspace(3) @LDS, i32 5 seq_cst, align 4
; CHECK-NEXT:    fence syncscope("workgroup") release
; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT:    fence syncscope("workgroup") acquire
; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4
; CHECK-NEXT:    store i32 [[LD]], ptr addrspace(1) [[OUT]], align 4
; CHECK-NEXT:    ret void
;
entry:
  store i32 1, ptr addrspace(1) %out, align 4
  store i32 2, ptr addrspace(1) %noalias, align 4
  %unused = atomicrmw add ptr addrspace(3) @LDS, i32 5 seq_cst
  fence syncscope("workgroup") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("workgroup") acquire
  %ld = load i32, ptr addrspace(1) %in, align 4
  store i32 %ld, ptr addrspace(1) %out, align 4
  ret void
}

; GCN-LABEL: {{^}}no_alias_atomic_rmw_then_no_alias_store:
; GCN: global_store_dword
; GCN: ds_add_u32
; GCN: s_load_dword s
; GCN-NOT: global_load_dword
; GCN: global_store_dword
define protected amdgpu_kernel void @no_alias_atomic_rmw_then_no_alias_store(ptr addrspace(1) %in, ptr addrspace(1) %out, ptr addrspace(1) noalias %noalias) {
; CHECK-LABEL: @no_alias_atomic_rmw_then_no_alias_store(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    store i32 2, ptr addrspace(1) [[NOALIAS:%.*]], align 4
; CHECK-NEXT:    [[UNUSED:%.*]] = atomicrmw add ptr addrspace(3) @LDS, i32 5 seq_cst, align 4
; CHECK-NEXT:    fence syncscope("workgroup") release
; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT:    fence syncscope("workgroup") acquire
; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    store i32 [[LD]], ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
entry:
  store i32 2, ptr addrspace(1) %noalias, align 4
  %unused = atomicrmw add ptr addrspace(3) @LDS, i32 5 seq_cst
  fence syncscope("workgroup") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("workgroup") acquire
  %ld = load i32, ptr addrspace(1) %in, align 4
  store i32 %ld, ptr addrspace(1) %out, align 4
  ret void
}

declare void @llvm.amdgcn.s.barrier()
declare void @llvm.amdgcn.wave.barrier()