xref: /llvm-project/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll (revision 9afaf9c6c89efb22bccab39677e8dff47da91a00)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 | FileCheck %s -check-prefix=CHECK
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 | FileCheck %s -check-prefix=CHECK

; Atomic buffer load (cachepolicy=1 -> glc) stays inside the loop: each
; iteration re-issues the load until it observes the workitem id.
define amdgpu_kernel void @raw_ptr_atomic_buffer_ptr_load_i32(ptr addrspace(8) %ptr) {
; CHECK-LABEL: raw_ptr_atomic_buffer_ptr_load_i32:
; CHECK:       ; %bb.0: ; %bb
; CHECK-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; CHECK-NEXT:    s_mov_b32 s4, 0
; CHECK-NEXT:  .LBB0_1: ; %bb1
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    buffer_load_b32 v1, off, s[0:3], 0 glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT:    s_cbranch_execnz .LBB0_1
; CHECK-NEXT:  ; %bb.2: ; %bb2
; CHECK-NEXT:    s_endpgm
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  br label %bb1
bb1:
  %load = call i32 @llvm.amdgcn.raw.ptr.atomic.buffer.load.i32(ptr addrspace(8) %ptr, i32 0, i32 0, i32 1)
  %cmp = icmp eq i32 %load, %id
  br i1 %cmp, label %bb1, label %bb2
bb2:
  ret void
}
33
; Same as above but with an explicit zero voffset argument; codegen is identical.
define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_off(ptr addrspace(8) %ptr) {
; CHECK-LABEL: raw_ptr_atomic_buffer_load_i32_off:
; CHECK:       ; %bb.0: ; %bb
; CHECK-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; CHECK-NEXT:    s_mov_b32 s4, 0
; CHECK-NEXT:  .LBB1_1: ; %bb1
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    buffer_load_b32 v1, off, s[0:3], 0 glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT:    s_cbranch_execnz .LBB1_1
; CHECK-NEXT:  ; %bb.2: ; %bb2
; CHECK-NEXT:    s_endpgm
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  br label %bb1
bb1:
  %load = call i32 @llvm.amdgcn.raw.ptr.atomic.buffer.load.i32(ptr addrspace(8) %ptr, i32 0, i32 0, i32 1)
  %cmp = icmp eq i32 %load, %id
  br i1 %cmp, label %bb1, label %bb2
bb2:
  ret void
}
; Non-zero soffset (4) and voffset (4): both show up in the MUBUF encoding
; (inline soffset operand plus offset:4 immediate).
define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_soff(ptr addrspace(8) %ptr) {
; CHECK-LABEL: raw_ptr_atomic_buffer_load_i32_soff:
; CHECK:       ; %bb.0: ; %bb
; CHECK-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; CHECK-NEXT:    s_mov_b32 s4, 0
; CHECK-NEXT:  .LBB2_1: ; %bb1
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    buffer_load_b32 v1, off, s[0:3], 4 offset:4 glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT:    s_cbranch_execnz .LBB2_1
; CHECK-NEXT:  ; %bb.2: ; %bb2
; CHECK-NEXT:    s_endpgm
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  br label %bb1
bb1:
  %load = call i32 @llvm.amdgcn.raw.ptr.atomic.buffer.load.i32(ptr addrspace(8) %ptr, i32 4, i32 4, i32 1)
  %cmp = icmp eq i32 %load, %id
  br i1 %cmp, label %bb1, label %bb2
bb2:
  ret void
}
; cachepolicy=4 (dlc) instead of 1 (glc): the load carries dlc only.
define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_dlc(ptr addrspace(8) %ptr) {
; CHECK-LABEL: raw_ptr_atomic_buffer_load_i32_dlc:
; CHECK:       ; %bb.0: ; %bb
; CHECK-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; CHECK-NEXT:    s_mov_b32 s4, 0
; CHECK-NEXT:  .LBB3_1: ; %bb1
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    buffer_load_b32 v1, off, s[0:3], 0 offset:4 dlc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT:    s_cbranch_execnz .LBB3_1
; CHECK-NEXT:  ; %bb.2: ; %bb2
; CHECK-NEXT:    s_endpgm
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  br label %bb1
bb1:
  %load = call i32 @llvm.amdgcn.raw.ptr.atomic.buffer.load.i32(ptr addrspace(8) %ptr, i32 4, i32 0, i32 4)
  %cmp = icmp eq i32 %load, %id
  br i1 %cmp, label %bb1, label %bb2
bb2:
  ret void
}
118
; Negative test: the plain (non-atomic) buffer load is legal to hoist, so it
; is emitted once before the loop rather than re-issued each iteration.
define amdgpu_kernel void @raw_nonptr_atomic_buffer_load_i32(ptr addrspace(8) %ptr) {
; CHECK-LABEL: raw_nonptr_atomic_buffer_load_i32:
; CHECK:       ; %bb.0: ; %bb
; CHECK-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    buffer_load_b32 v1, off, s[0:3], 0 offset:4 glc
; CHECK-NEXT:    s_mov_b32 s0, 0
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
; CHECK-NEXT:  .LBB4_1: ; %bb1
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    s_and_b32 s1, exec_lo, vcc_lo
; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; CHECK-NEXT:    s_or_b32 s0, s1, s0
; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
; CHECK-NEXT:    s_cbranch_execnz .LBB4_1
; CHECK-NEXT:  ; %bb.2: ; %bb2
; CHECK-NEXT:    s_endpgm
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  br label %bb1
bb1:
  %load = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) %ptr, i32 4, i32 0, i32 1)
  %cmp = icmp eq i32 %load, %id
  br i1 %cmp, label %bb1, label %bb2
bb2:
  ret void
}
148
; 64-bit atomic buffer load; compared against the zero-extended workitem id.
define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i64(ptr addrspace(8) %ptr) {
; CHECK-LABEL: raw_ptr_atomic_buffer_load_i64:
; CHECK:       ; %bb.0: ; %bb
; CHECK-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; CHECK-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
; CHECK-NEXT:    s_mov_b32 s4, 0
; CHECK-NEXT:  .LBB5_1: ; %bb1
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    buffer_load_b64 v[2:3], off, s[0:3], 0 offset:4 glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1]
; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT:    s_cbranch_execnz .LBB5_1
; CHECK-NEXT:  ; %bb.2: ; %bb2
; CHECK-NEXT:    s_endpgm
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %id.zext = zext i32 %id to i64
  br label %bb1
bb1:
  %load = call i64 @llvm.amdgcn.raw.ptr.atomic.buffer.load.i64(ptr addrspace(8) %ptr, i32 4, i32 0, i32 1)
  %cmp = icmp eq i64 %load, %id.zext
  br i1 %cmp, label %bb1, label %bb2
bb2:
  ret void
}
178
; <2 x i16> result bitcast to i32 for the compare; selects a plain b32 load.
define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v2i16(ptr addrspace(8) %ptr) {
; CHECK-LABEL: raw_ptr_atomic_buffer_load_v2i16:
; CHECK:       ; %bb.0: ; %bb
; CHECK-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; CHECK-NEXT:    s_mov_b32 s4, 0
; CHECK-NEXT:  .LBB6_1: ; %bb1
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    buffer_load_b32 v1, off, s[0:3], 0 glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT:    s_cbranch_execnz .LBB6_1
; CHECK-NEXT:  ; %bb.2: ; %bb2
; CHECK-NEXT:    s_endpgm
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  br label %bb1
bb1:
  %load = call <2 x i16> @llvm.amdgcn.raw.ptr.atomic.buffer.load.v2i16(ptr addrspace(8) %ptr, i32 0, i32 0, i32 1)
  %bitcast = bitcast <2 x i16> %load to i32
  %cmp = icmp eq i32 %bitcast, %id
  br i1 %cmp, label %bb1, label %bb2
bb2:
  ret void
}
208
; <4 x i16> load with a shuffle picking elements 0 and 2; the repack is done
; with v_and/v_lshl_or before the compare.
define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) %ptr) {
; CHECK-LABEL: raw_ptr_atomic_buffer_load_v4i16:
; CHECK:       ; %bb.0: ; %bb
; CHECK-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; CHECK-NEXT:    s_mov_b32 s4, 0
; CHECK-NEXT:  .LBB7_1: ; %bb1
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; CHECK-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT:    s_cbranch_execnz .LBB7_1
; CHECK-NEXT:  ; %bb.2: ; %bb2
; CHECK-NEXT:    s_endpgm
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  br label %bb1
bb1:
  %load = call <4 x i16> @llvm.amdgcn.raw.ptr.atomic.buffer.load.v4i16(ptr addrspace(8) %ptr, i32 4, i32 0, i32 1)
  %shortened = shufflevector <4 x i16> %load, <4 x i16> poison, <2 x i32> <i32 0, i32 2>
  %bitcast = bitcast <2 x i16> %shortened to i32
  %cmp = icmp eq i32 %bitcast, %id
  br i1 %cmp, label %bb1, label %bb2
bb2:
  ret void
}
242
; <4 x i32> load; only element 3 feeds the compare, but the whole b128 load
; must stay in the loop because it is atomic.
define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i32(ptr addrspace(8) %ptr) {
; CHECK-LABEL: raw_ptr_atomic_buffer_load_v4i32:
; CHECK:       ; %bb.0: ; %bb
; CHECK-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; CHECK-NEXT:    s_mov_b32 s4, 0
; CHECK-NEXT:  .LBB8_1: ; %bb1
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    buffer_load_b128 v[1:4], off, s[0:3], 0 offset:4 glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v4, v0
; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT:    s_cbranch_execnz .LBB8_1
; CHECK-NEXT:  ; %bb.2: ; %bb2
; CHECK-NEXT:    s_endpgm
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  br label %bb1
bb1:
  %load = call <4 x i32> @llvm.amdgcn.raw.ptr.atomic.buffer.load.v4i32(ptr addrspace(8) %ptr, i32 4, i32 0, i32 1)
  %extracted = extractelement <4 x i32> %load, i32 3
  %cmp = icmp eq i32 %extracted, %id
  br i1 %cmp, label %bb1, label %bb2
bb2:
  ret void
}
272
; ptr result: the atomically-loaded pointer is then dereferenced with a flat
; load each iteration.
define amdgpu_kernel void @raw_ptr_atomic_buffer_load_ptr(ptr addrspace(8) %ptr) {
; CHECK-LABEL: raw_ptr_atomic_buffer_load_ptr:
; CHECK:       ; %bb.0: ; %bb
; CHECK-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; CHECK-NEXT:    s_mov_b32 s4, 0
; CHECK-NEXT:  .LBB9_1: ; %bb1
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_load_b32 v1, v[1:2]
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT:    s_cbranch_execnz .LBB9_1
; CHECK-NEXT:  ; %bb.2: ; %bb2
; CHECK-NEXT:    s_endpgm
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  br label %bb1
bb1:
  %load = call ptr @llvm.amdgcn.raw.ptr.atomic.buffer.load.ptr(ptr addrspace(8) %ptr, i32 4, i32 0, i32 1)
  %elem = load i32, ptr %load
  %cmp = icmp eq i32 %elem, %id
  br i1 %cmp, label %bb1, label %bb2
bb2:
  ret void
}
304
; Function Attrs: nounwind readonly
; NOTE: fixed intrinsic names — the file declared "raw.ptr.atom.buffer.load.*"
; while every call site uses "raw.ptr.atomic.buffer.load.*".
declare i32 @llvm.amdgcn.raw.ptr.atomic.buffer.load.i32(ptr addrspace(8), i32, i32, i32 immarg)
declare i64 @llvm.amdgcn.raw.ptr.atomic.buffer.load.i64(ptr addrspace(8), i32, i32, i32 immarg)
declare <2 x i16> @llvm.amdgcn.raw.ptr.atomic.buffer.load.v2i16(ptr addrspace(8), i32, i32, i32 immarg)
declare <4 x i16> @llvm.amdgcn.raw.ptr.atomic.buffer.load.v4i16(ptr addrspace(8), i32, i32, i32 immarg)
declare <4 x i32> @llvm.amdgcn.raw.ptr.atomic.buffer.load.v4i32(ptr addrspace(8), i32, i32, i32 immarg)
declare ptr @llvm.amdgcn.raw.ptr.atomic.buffer.load.ptr(ptr addrspace(8), i32, i32, i32 immarg)
declare i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8), i32, i32, i32 immarg)
declare i32 @llvm.amdgcn.workitem.id.x()