xref: /llvm-project/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
3; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx940 < %s | FileCheck -check-prefix=GFX940 %s
4; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200 %s
5
; Returning f32 buffer fadd through the struct.ptr intrinsic. %rsrc and
; %soffset are marked inreg; %val, %vindex and %voffset are not. The final
; immediate operand of the call is 0. The checks expect a single
; buffer_atomic_add_f32 using "idxen offen" and carrying glc (gfx90a),
; sc0 (gfx940) or th:TH_ATOMIC_RETURN (gfx1200), followed by a wait on the
; result before s_setpc_b64.
6define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) #0 {
7; GFX90A-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
8; GFX90A:       ; %bb.0:
9; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
11; GFX90A-NEXT:    v_mov_b32_e32 v2, v1
12; GFX90A-NEXT:    buffer_atomic_add_f32 v0, v[2:3], s[16:19], s20 idxen offen glc
13; GFX90A-NEXT:    s_waitcnt vmcnt(0)
14; GFX90A-NEXT:    s_setpc_b64 s[30:31]
15;
16; GFX940-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
17; GFX940:       ; %bb.0:
18; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19; GFX940-NEXT:    v_mov_b32_e32 v3, v2
20; GFX940-NEXT:    v_mov_b32_e32 v2, v1
21; GFX940-NEXT:    buffer_atomic_add_f32 v0, v[2:3], s[0:3], s16 idxen offen sc0
22; GFX940-NEXT:    s_waitcnt vmcnt(0)
23; GFX940-NEXT:    s_setpc_b64 s[30:31]
24;
25; GFX1200-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
26; GFX1200:       ; %bb.0:
27; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
28; GFX1200-NEXT:    s_wait_expcnt 0x0
29; GFX1200-NEXT:    s_wait_samplecnt 0x0
30; GFX1200-NEXT:    s_wait_bvhcnt 0x0
31; GFX1200-NEXT:    s_wait_kmcnt 0x0
32; GFX1200-NEXT:    buffer_atomic_add_f32 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_RETURN
33; GFX1200-NEXT:    s_wait_loadcnt 0x0
34; GFX1200-NEXT:    s_setpc_b64 s[30:31]
35  %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
36  ret float %ret
37}
38
39; Natural mapping, no voffset
; f32 case that passes the constant 0 as the voffset operand of the intrinsic
; (the function has no %voffset argument). The checks expect
; buffer_atomic_add_f32 to take only the single VGPR v1 as its address operand,
; with "idxen" and without "offen", on all three targets.
40define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) #0 {
41; GFX90A-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
42; GFX90A:       ; %bb.0:
43; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
44; GFX90A-NEXT:    buffer_atomic_add_f32 v0, v1, s[16:19], s20 idxen glc
45; GFX90A-NEXT:    s_waitcnt vmcnt(0)
46; GFX90A-NEXT:    s_setpc_b64 s[30:31]
47;
48; GFX940-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
49; GFX940:       ; %bb.0:
50; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
51; GFX940-NEXT:    buffer_atomic_add_f32 v0, v1, s[0:3], s16 idxen sc0
52; GFX940-NEXT:    s_waitcnt vmcnt(0)
53; GFX940-NEXT:    s_setpc_b64 s[30:31]
54;
55; GFX1200-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
56; GFX1200:       ; %bb.0:
57; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
58; GFX1200-NEXT:    s_wait_expcnt 0x0
59; GFX1200-NEXT:    s_wait_samplecnt 0x0
60; GFX1200-NEXT:    s_wait_bvhcnt 0x0
61; GFX1200-NEXT:    s_wait_kmcnt 0x0
62; GFX1200-NEXT:    buffer_atomic_add_f32 v0, v1, s[0:3], s16 idxen th:TH_ATOMIC_RETURN
63; GFX1200-NEXT:    s_wait_loadcnt 0x0
64; GFX1200-NEXT:    s_setpc_b64 s[30:31]
65  %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
66  ret float %ret
67}
68
; f32 case that passes 2 (rather than 0) as the final immediate operand of the
; intrinsic. The checks expect an additional modifier on the instruction:
; "glc slc" on gfx90a, "sc0 nt" on gfx940 and th:TH_ATOMIC_NT_RETURN on gfx1200.
69define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) #0 {
70; GFX90A-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
71; GFX90A:       ; %bb.0:
72; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
73; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
74; GFX90A-NEXT:    v_mov_b32_e32 v2, v1
75; GFX90A-NEXT:    buffer_atomic_add_f32 v0, v[2:3], s[16:19], s20 idxen offen glc slc
76; GFX90A-NEXT:    s_waitcnt vmcnt(0)
77; GFX90A-NEXT:    s_setpc_b64 s[30:31]
78;
79; GFX940-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
80; GFX940:       ; %bb.0:
81; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
82; GFX940-NEXT:    v_mov_b32_e32 v3, v2
83; GFX940-NEXT:    v_mov_b32_e32 v2, v1
84; GFX940-NEXT:    buffer_atomic_add_f32 v0, v[2:3], s[0:3], s16 idxen offen sc0 nt
85; GFX940-NEXT:    s_waitcnt vmcnt(0)
86; GFX940-NEXT:    s_setpc_b64 s[30:31]
87;
88; GFX1200-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
89; GFX1200:       ; %bb.0:
90; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
91; GFX1200-NEXT:    s_wait_expcnt 0x0
92; GFX1200-NEXT:    s_wait_samplecnt 0x0
93; GFX1200-NEXT:    s_wait_bvhcnt 0x0
94; GFX1200-NEXT:    s_wait_kmcnt 0x0
95; GFX1200-NEXT:    buffer_atomic_add_f32 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_NT_RETURN
96; GFX1200-NEXT:    s_wait_loadcnt 0x0
97; GFX1200-NEXT:    s_setpc_b64 s[30:31]
98  %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
99  ret float %ret
100}
101
; <2 x half> case through the .v2f16 intrinsic, with inreg %rsrc and %soffset
; and 0 as the final immediate operand. The checks expect the packed
; instruction buffer_atomic_pk_add_f16 with "idxen offen" and glc (gfx90a),
; sc0 (gfx940) or th:TH_ATOMIC_RETURN (gfx1200).
102define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) #0 {
103; GFX90A-LABEL: struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
104; GFX90A:       ; %bb.0:
105; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
106; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
107; GFX90A-NEXT:    v_mov_b32_e32 v2, v1
108; GFX90A-NEXT:    buffer_atomic_pk_add_f16 v0, v[2:3], s[16:19], s20 idxen offen glc
109; GFX90A-NEXT:    s_waitcnt vmcnt(0)
110; GFX90A-NEXT:    s_setpc_b64 s[30:31]
111;
112; GFX940-LABEL: struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
113; GFX940:       ; %bb.0:
114; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
115; GFX940-NEXT:    v_mov_b32_e32 v3, v2
116; GFX940-NEXT:    v_mov_b32_e32 v2, v1
117; GFX940-NEXT:    buffer_atomic_pk_add_f16 v0, v[2:3], s[0:3], s16 idxen offen sc0
118; GFX940-NEXT:    s_waitcnt vmcnt(0)
119; GFX940-NEXT:    s_setpc_b64 s[30:31]
120;
121; GFX1200-LABEL: struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
122; GFX1200:       ; %bb.0:
123; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
124; GFX1200-NEXT:    s_wait_expcnt 0x0
125; GFX1200-NEXT:    s_wait_samplecnt 0x0
126; GFX1200-NEXT:    s_wait_bvhcnt 0x0
127; GFX1200-NEXT:    s_wait_kmcnt 0x0
128; GFX1200-NEXT:    buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_RETURN
129; GFX1200-NEXT:    s_wait_loadcnt 0x0
130; GFX1200-NEXT:    s_setpc_b64 s[30:31]
131  %ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
132  ret <2 x half> %ret
133}
134
135; Test waterfall loop (f32): rsrc and soffset are passed in VGPRs
; f32 case where no argument is marked inreg, so %rsrc and %soffset are held in
; VGPRs in the expected output. The checks expect a loop that copies the
; resource descriptor and soffset into SGPRs with v_readfirstlane_b32, compares
; those SGPR values against the VGPR values, narrows exec to the matching lanes
; with s_and_saveexec, issues buffer_atomic_add_f32, and branches back with
; s_cbranch_execnz while lanes remain. The saved exec mask is restored after
; the loop.
136define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset) #0 {
137; GFX90A-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset:
138; GFX90A:       ; %bb.0:
139; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
140; GFX90A-NEXT:    v_mov_b32_e32 v9, v6
141; GFX90A-NEXT:    v_mov_b32_e32 v8, v5
142; GFX90A-NEXT:    v_mov_b32_e32 v5, v4
143; GFX90A-NEXT:    v_mov_b32_e32 v4, v3
144; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
145; GFX90A-NEXT:    v_mov_b32_e32 v2, v1
146; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
147; GFX90A-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
148; GFX90A-NEXT:    v_readfirstlane_b32 s8, v2
149; GFX90A-NEXT:    v_readfirstlane_b32 s9, v3
150; GFX90A-NEXT:    v_readfirstlane_b32 s10, v4
151; GFX90A-NEXT:    v_readfirstlane_b32 s11, v5
152; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[2:3]
153; GFX90A-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[4:5]
154; GFX90A-NEXT:    v_readfirstlane_b32 s12, v7
155; GFX90A-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
156; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s12, v7
157; GFX90A-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
158; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
159; GFX90A-NEXT:    s_waitcnt vmcnt(0)
160; GFX90A-NEXT:    buffer_atomic_add_f32 v0, v[8:9], s[8:11], s12 idxen offen glc
161; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
162; GFX90A-NEXT:    ; implicit-def: $vgpr7
163; GFX90A-NEXT:    ; implicit-def: $vgpr8_vgpr9
164; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
165; GFX90A-NEXT:    s_cbranch_execnz .LBB4_1
166; GFX90A-NEXT:  ; %bb.2:
167; GFX90A-NEXT:    s_mov_b64 exec, s[6:7]
168; GFX90A-NEXT:    s_waitcnt vmcnt(0)
169; GFX90A-NEXT:    s_setpc_b64 s[30:31]
170;
171; GFX940-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset:
172; GFX940:       ; %bb.0:
173; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
174; GFX940-NEXT:    v_mov_b32_e32 v9, v6
175; GFX940-NEXT:    v_mov_b32_e32 v8, v5
176; GFX940-NEXT:    v_mov_b32_e32 v5, v4
177; GFX940-NEXT:    v_mov_b32_e32 v4, v3
178; GFX940-NEXT:    v_mov_b32_e32 v3, v2
179; GFX940-NEXT:    v_mov_b32_e32 v2, v1
180; GFX940-NEXT:    s_mov_b64 s[2:3], exec
181; GFX940-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
182; GFX940-NEXT:    v_readfirstlane_b32 s4, v2
183; GFX940-NEXT:    v_readfirstlane_b32 s5, v3
184; GFX940-NEXT:    v_readfirstlane_b32 s6, v4
185; GFX940-NEXT:    v_readfirstlane_b32 s7, v5
186; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[2:3]
187; GFX940-NEXT:    v_readfirstlane_b32 s8, v7
188; GFX940-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[4:5]
189; GFX940-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
190; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v7
191; GFX940-NEXT:    s_and_b64 s[0:1], s[0:1], vcc
192; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
193; GFX940-NEXT:    s_waitcnt vmcnt(0)
194; GFX940-NEXT:    buffer_atomic_add_f32 v0, v[8:9], s[4:7], s8 idxen offen sc0
195; GFX940-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
196; GFX940-NEXT:    ; implicit-def: $vgpr7
197; GFX940-NEXT:    ; implicit-def: $vgpr8_vgpr9
198; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
199; GFX940-NEXT:    s_cbranch_execnz .LBB4_1
200; GFX940-NEXT:  ; %bb.2:
201; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
202; GFX940-NEXT:    s_waitcnt vmcnt(0)
203; GFX940-NEXT:    s_setpc_b64 s[30:31]
204;
205; GFX1200-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset:
206; GFX1200:       ; %bb.0:
207; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
208; GFX1200-NEXT:    s_wait_expcnt 0x0
209; GFX1200-NEXT:    s_wait_samplecnt 0x0
210; GFX1200-NEXT:    s_wait_bvhcnt 0x0
211; GFX1200-NEXT:    s_wait_kmcnt 0x0
212; GFX1200-NEXT:    s_mov_b32 s2, exec_lo
213; GFX1200-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
214; GFX1200-NEXT:    v_readfirstlane_b32 s4, v1
215; GFX1200-NEXT:    v_readfirstlane_b32 s5, v2
216; GFX1200-NEXT:    v_readfirstlane_b32 s6, v3
217; GFX1200-NEXT:    v_readfirstlane_b32 s7, v4
218; GFX1200-NEXT:    v_readfirstlane_b32 s3, v7
219; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
220; GFX1200-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
221; GFX1200-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
222; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
223; GFX1200-NEXT:    v_cmp_eq_u32_e64 s1, s3, v7
224; GFX1200-NEXT:    s_wait_alu 0xfffe
225; GFX1200-NEXT:    s_and_b32 s0, vcc_lo, s0
226; GFX1200-NEXT:    s_wait_alu 0xfffe
227; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
228; GFX1200-NEXT:    s_and_b32 s0, s0, s1
229; GFX1200-NEXT:    s_wait_alu 0xfffe
230; GFX1200-NEXT:    s_and_saveexec_b32 s0, s0
231; GFX1200-NEXT:    s_wait_loadcnt 0x0
232; GFX1200-NEXT:    buffer_atomic_add_f32 v0, v[5:6], s[4:7], s3 idxen offen th:TH_ATOMIC_RETURN
233; GFX1200-NEXT:    ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
234; GFX1200-NEXT:    ; implicit-def: $vgpr7
235; GFX1200-NEXT:    ; implicit-def: $vgpr5_vgpr6
236; GFX1200-NEXT:    s_wait_alu 0xfffe
237; GFX1200-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
238; GFX1200-NEXT:    s_cbranch_execnz .LBB4_1
239; GFX1200-NEXT:  ; %bb.2:
240; GFX1200-NEXT:    s_mov_b32 exec_lo, s2
241; GFX1200-NEXT:    s_wait_loadcnt 0x0
242; GFX1200-NEXT:    s_wait_alu 0xfffe
243; GFX1200-NEXT:    s_setpc_b64 s[30:31]
244  %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
245  ret float %ret
246}
247
248; Test waterfall loop (v2f16): rsrc and soffset are passed in VGPRs
; <2 x half> case where no argument is marked inreg, so %rsrc and %soffset are
; held in VGPRs in the expected output. The checks expect a loop that copies
; the resource descriptor and soffset into SGPRs with v_readfirstlane_b32,
; compares those SGPR values against the VGPR values, narrows exec to the
; matching lanes with s_and_saveexec, issues buffer_atomic_pk_add_f16, and
; branches back with s_cbranch_execnz while lanes remain. The saved exec mask
; is restored after the loop.
249define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset) #0 {
250; GFX90A-LABEL: struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset:
251; GFX90A:       ; %bb.0:
252; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
253; GFX90A-NEXT:    v_mov_b32_e32 v9, v6
254; GFX90A-NEXT:    v_mov_b32_e32 v8, v5
255; GFX90A-NEXT:    v_mov_b32_e32 v5, v4
256; GFX90A-NEXT:    v_mov_b32_e32 v4, v3
257; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
258; GFX90A-NEXT:    v_mov_b32_e32 v2, v1
259; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
260; GFX90A-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
261; GFX90A-NEXT:    v_readfirstlane_b32 s8, v2
262; GFX90A-NEXT:    v_readfirstlane_b32 s9, v3
263; GFX90A-NEXT:    v_readfirstlane_b32 s10, v4
264; GFX90A-NEXT:    v_readfirstlane_b32 s11, v5
265; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[2:3]
266; GFX90A-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[4:5]
267; GFX90A-NEXT:    v_readfirstlane_b32 s12, v7
268; GFX90A-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
269; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s12, v7
270; GFX90A-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
271; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
272; GFX90A-NEXT:    s_waitcnt vmcnt(0)
273; GFX90A-NEXT:    buffer_atomic_pk_add_f16 v0, v[8:9], s[8:11], s12 idxen offen glc
274; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
275; GFX90A-NEXT:    ; implicit-def: $vgpr7
276; GFX90A-NEXT:    ; implicit-def: $vgpr8_vgpr9
277; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
278; GFX90A-NEXT:    s_cbranch_execnz .LBB5_1
279; GFX90A-NEXT:  ; %bb.2:
280; GFX90A-NEXT:    s_mov_b64 exec, s[6:7]
281; GFX90A-NEXT:    s_waitcnt vmcnt(0)
282; GFX90A-NEXT:    s_setpc_b64 s[30:31]
283;
284; GFX940-LABEL: struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset:
285; GFX940:       ; %bb.0:
286; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
287; GFX940-NEXT:    v_mov_b32_e32 v9, v6
288; GFX940-NEXT:    v_mov_b32_e32 v8, v5
289; GFX940-NEXT:    v_mov_b32_e32 v5, v4
290; GFX940-NEXT:    v_mov_b32_e32 v4, v3
291; GFX940-NEXT:    v_mov_b32_e32 v3, v2
292; GFX940-NEXT:    v_mov_b32_e32 v2, v1
293; GFX940-NEXT:    s_mov_b64 s[2:3], exec
294; GFX940-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
295; GFX940-NEXT:    v_readfirstlane_b32 s4, v2
296; GFX940-NEXT:    v_readfirstlane_b32 s5, v3
297; GFX940-NEXT:    v_readfirstlane_b32 s6, v4
298; GFX940-NEXT:    v_readfirstlane_b32 s7, v5
299; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[2:3]
300; GFX940-NEXT:    v_readfirstlane_b32 s8, v7
301; GFX940-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[4:5]
302; GFX940-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
303; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v7
304; GFX940-NEXT:    s_and_b64 s[0:1], s[0:1], vcc
305; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
306; GFX940-NEXT:    s_waitcnt vmcnt(0)
307; GFX940-NEXT:    buffer_atomic_pk_add_f16 v0, v[8:9], s[4:7], s8 idxen offen sc0
308; GFX940-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
309; GFX940-NEXT:    ; implicit-def: $vgpr7
310; GFX940-NEXT:    ; implicit-def: $vgpr8_vgpr9
311; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
312; GFX940-NEXT:    s_cbranch_execnz .LBB5_1
313; GFX940-NEXT:  ; %bb.2:
314; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
315; GFX940-NEXT:    s_waitcnt vmcnt(0)
316; GFX940-NEXT:    s_setpc_b64 s[30:31]
317;
318; GFX1200-LABEL: struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset:
319; GFX1200:       ; %bb.0:
320; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
321; GFX1200-NEXT:    s_wait_expcnt 0x0
322; GFX1200-NEXT:    s_wait_samplecnt 0x0
323; GFX1200-NEXT:    s_wait_bvhcnt 0x0
324; GFX1200-NEXT:    s_wait_kmcnt 0x0
325; GFX1200-NEXT:    s_mov_b32 s2, exec_lo
326; GFX1200-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
327; GFX1200-NEXT:    v_readfirstlane_b32 s4, v1
328; GFX1200-NEXT:    v_readfirstlane_b32 s5, v2
329; GFX1200-NEXT:    v_readfirstlane_b32 s6, v3
330; GFX1200-NEXT:    v_readfirstlane_b32 s7, v4
331; GFX1200-NEXT:    v_readfirstlane_b32 s3, v7
332; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
333; GFX1200-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
334; GFX1200-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
335; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
336; GFX1200-NEXT:    v_cmp_eq_u32_e64 s1, s3, v7
337; GFX1200-NEXT:    s_wait_alu 0xfffe
338; GFX1200-NEXT:    s_and_b32 s0, vcc_lo, s0
339; GFX1200-NEXT:    s_wait_alu 0xfffe
340; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
341; GFX1200-NEXT:    s_and_b32 s0, s0, s1
342; GFX1200-NEXT:    s_wait_alu 0xfffe
343; GFX1200-NEXT:    s_and_saveexec_b32 s0, s0
344; GFX1200-NEXT:    s_wait_loadcnt 0x0
345; GFX1200-NEXT:    buffer_atomic_pk_add_f16 v0, v[5:6], s[4:7], s3 idxen offen th:TH_ATOMIC_RETURN
346; GFX1200-NEXT:    ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
347; GFX1200-NEXT:    ; implicit-def: $vgpr7
348; GFX1200-NEXT:    ; implicit-def: $vgpr5_vgpr6
349; GFX1200-NEXT:    s_wait_alu 0xfffe
350; GFX1200-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
351; GFX1200-NEXT:    s_cbranch_execnz .LBB5_1
352; GFX1200-NEXT:  ; %bb.2:
353; GFX1200-NEXT:    s_mov_b32 exec_lo, s2
354; GFX1200-NEXT:    s_wait_loadcnt 0x0
355; GFX1200-NEXT:    s_wait_alu 0xfffe
356; GFX1200-NEXT:    s_setpc_b64 s[30:31]
357  %ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
358  ret <2 x half> %ret
359}
360
; Declarations of the two intrinsics exercised in this file. As used by the
; call sites, the operands are, in order: the value to add, the buffer resource
; (ptr addrspace(8)), vindex, voffset, soffset, and a trailing i32 that must be
; an immediate (immarg).
361declare float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float, ptr addrspace(8), i32, i32, i32, i32 immarg)
362declare <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half>, ptr addrspace(8), i32, i32, i32, i32 immarg)
363
; Attribute group #0 is applied to every test function in this file.
364attributes #0 = { nounwind }
365