xref: /llvm-project/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s
3; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
7; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
8; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s
9; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W64 %s
10; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W32 %s
11
; Intrinsics available to the tests in this file. Both struct buffer atomics
; return an i32. The callers visible below pass the value operand first, then
; the ptr addrspace(8) buffer resource, then four i32 zeros (presumably vindex,
; offset, soffset and cache policy, in that order -- TODO confirm against the
; AMDGPU intrinsic definitions).
12declare i32 @llvm.amdgcn.workitem.id.x()
13declare i32 @llvm.amdgcn.struct.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32, i32)
14declare i32 @llvm.amdgcn.struct.ptr.buffer.atomic.sub(i32, ptr addrspace(8), i32, i32, i32, i32)
15
16; Show what the atomic optimization pass will do for struct buffers.
17
; Atomic add of the constant 5 to a struct buffer; the value returned by the
; atomic is stored to %out.
;
; The expected code below shows the shape of the optimized sequence on every
; target: the lane whose mbcnt result is 0 is the only one that issues the
; buffer atomic, and its operand is the population count of the saved exec mask
; multiplied by 5 (s_bcnt1 followed by s_mul_i32). After the branch rejoins,
; every lane rebuilds its own result from v_readfirstlane of the returned value
; plus its mbcnt lane count times 5 (v_mad_u32_u24).
;
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
18define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
19; GFX6-LABEL: add_i32_constant:
20; GFX6:       ; %bb.0: ; %entry
21; GFX6-NEXT:    s_mov_b64 s[2:3], exec
22; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
23; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
24; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
25; GFX6-NEXT:    ; implicit-def: $vgpr1
26; GFX6-NEXT:    s_and_saveexec_b64 s[0:1], vcc
27; GFX6-NEXT:    s_cbranch_execz .LBB0_2
28; GFX6-NEXT:  ; %bb.1:
29; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
30; GFX6-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
31; GFX6-NEXT:    s_mul_i32 s2, s2, 5
32; GFX6-NEXT:    v_mov_b32_e32 v1, s2
33; GFX6-NEXT:    v_mov_b32_e32 v2, 0
34; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
35; GFX6-NEXT:    buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
36; GFX6-NEXT:  .LBB0_2:
37; GFX6-NEXT:    s_or_b64 exec, exec, s[0:1]
38; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
39; GFX6-NEXT:    s_mov_b32 s3, 0xf000
40; GFX6-NEXT:    s_mov_b32 s2, -1
41; GFX6-NEXT:    s_waitcnt vmcnt(0)
42; GFX6-NEXT:    v_readfirstlane_b32 s4, v1
43; GFX6-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
44; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
45; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
46; GFX6-NEXT:    s_endpgm
47;
48; GFX8-LABEL: add_i32_constant:
49; GFX8:       ; %bb.0: ; %entry
50; GFX8-NEXT:    s_mov_b64 s[2:3], exec
51; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
52; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
53; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
54; GFX8-NEXT:    ; implicit-def: $vgpr1
55; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
56; GFX8-NEXT:    s_cbranch_execz .LBB0_2
57; GFX8-NEXT:  ; %bb.1:
58; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
59; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
60; GFX8-NEXT:    s_mul_i32 s2, s2, 5
61; GFX8-NEXT:    v_mov_b32_e32 v1, s2
62; GFX8-NEXT:    v_mov_b32_e32 v2, 0
63; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
64; GFX8-NEXT:    buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
65; GFX8-NEXT:  .LBB0_2:
66; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
67; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
68; GFX8-NEXT:    s_waitcnt vmcnt(0)
69; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
70; GFX8-NEXT:    v_mad_u32_u24 v2, v0, 5, s2
71; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
72; GFX8-NEXT:    v_mov_b32_e32 v0, s0
73; GFX8-NEXT:    v_mov_b32_e32 v1, s1
74; GFX8-NEXT:    flat_store_dword v[0:1], v2
75; GFX8-NEXT:    s_endpgm
76;
77; GFX9-LABEL: add_i32_constant:
78; GFX9:       ; %bb.0: ; %entry
79; GFX9-NEXT:    s_mov_b64 s[2:3], exec
80; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
81; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
82; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
83; GFX9-NEXT:    ; implicit-def: $vgpr1
84; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
85; GFX9-NEXT:    s_cbranch_execz .LBB0_2
86; GFX9-NEXT:  ; %bb.1:
87; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
88; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
89; GFX9-NEXT:    s_mul_i32 s2, s2, 5
90; GFX9-NEXT:    v_mov_b32_e32 v1, s2
91; GFX9-NEXT:    v_mov_b32_e32 v2, 0
92; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
93; GFX9-NEXT:    buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
94; GFX9-NEXT:  .LBB0_2:
95; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
96; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
97; GFX9-NEXT:    s_waitcnt vmcnt(0)
98; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
99; GFX9-NEXT:    v_mov_b32_e32 v2, 0
100; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
101; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
102; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
103; GFX9-NEXT:    s_endpgm
104;
105; GFX10W64-LABEL: add_i32_constant:
106; GFX10W64:       ; %bb.0: ; %entry
107; GFX10W64-NEXT:    s_mov_b64 s[2:3], exec
108; GFX10W64-NEXT:    ; implicit-def: $vgpr1
109; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
110; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
111; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
112; GFX10W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
113; GFX10W64-NEXT:    s_cbranch_execz .LBB0_2
114; GFX10W64-NEXT:  ; %bb.1:
115; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
116; GFX10W64-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
117; GFX10W64-NEXT:    v_mov_b32_e32 v2, 0
118; GFX10W64-NEXT:    s_mul_i32 s2, s2, 5
119; GFX10W64-NEXT:    v_mov_b32_e32 v1, s2
120; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
121; GFX10W64-NEXT:    buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
122; GFX10W64-NEXT:  .LBB0_2:
123; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
124; GFX10W64-NEXT:    s_or_b64 exec, exec, s[0:1]
125; GFX10W64-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
126; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
127; GFX10W64-NEXT:    v_readfirstlane_b32 s2, v1
128; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
129; GFX10W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
130; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
131; GFX10W64-NEXT:    global_store_dword v1, v0, s[0:1]
132; GFX10W64-NEXT:    s_endpgm
133;
134; GFX10W32-LABEL: add_i32_constant:
135; GFX10W32:       ; %bb.0: ; %entry
136; GFX10W32-NEXT:    s_mov_b32 s1, exec_lo
137; GFX10W32-NEXT:    ; implicit-def: $vgpr1
138; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s1, 0
139; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
140; GFX10W32-NEXT:    s_and_saveexec_b32 s0, vcc_lo
141; GFX10W32-NEXT:    s_cbranch_execz .LBB0_2
142; GFX10W32-NEXT:  ; %bb.1:
143; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
144; GFX10W32-NEXT:    s_bcnt1_i32_b32 s1, s1
145; GFX10W32-NEXT:    v_mov_b32_e32 v2, 0
146; GFX10W32-NEXT:    s_mul_i32 s1, s1, 5
147; GFX10W32-NEXT:    v_mov_b32_e32 v1, s1
148; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
149; GFX10W32-NEXT:    buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
150; GFX10W32-NEXT:  .LBB0_2:
151; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
152; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
153; GFX10W32-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
154; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
155; GFX10W32-NEXT:    v_readfirstlane_b32 s2, v1
156; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
157; GFX10W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
158; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
159; GFX10W32-NEXT:    global_store_dword v1, v0, s[0:1]
160; GFX10W32-NEXT:    s_endpgm
161;
162; GFX11W64-LABEL: add_i32_constant:
163; GFX11W64:       ; %bb.0: ; %entry
164; GFX11W64-NEXT:    s_mov_b64 s[2:3], exec
165; GFX11W64-NEXT:    s_mov_b64 s[0:1], exec
166; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
167; GFX11W64-NEXT:    ; implicit-def: $vgpr1
168; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
169; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
170; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
171; GFX11W64-NEXT:    s_cbranch_execz .LBB0_2
172; GFX11W64-NEXT:  ; %bb.1:
173; GFX11W64-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
174; GFX11W64-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
175; GFX11W64-NEXT:    v_mov_b32_e32 v2, 0
176; GFX11W64-NEXT:    s_mul_i32 s2, s2, 5
177; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
178; GFX11W64-NEXT:    v_mov_b32_e32 v1, s2
179; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
180; GFX11W64-NEXT:    buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc
181; GFX11W64-NEXT:  .LBB0_2:
182; GFX11W64-NEXT:    s_or_b64 exec, exec, s[0:1]
183; GFX11W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
184; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
185; GFX11W64-NEXT:    v_readfirstlane_b32 s2, v1
186; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
187; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
188; GFX11W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
189; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
190; GFX11W64-NEXT:    global_store_b32 v1, v0, s[0:1]
191; GFX11W64-NEXT:    s_endpgm
192;
193; GFX11W32-LABEL: add_i32_constant:
194; GFX11W32:       ; %bb.0: ; %entry
195; GFX11W32-NEXT:    s_mov_b32 s1, exec_lo
196; GFX11W32-NEXT:    s_mov_b32 s0, exec_lo
197; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s1, 0
198; GFX11W32-NEXT:    ; implicit-def: $vgpr1
199; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
200; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
201; GFX11W32-NEXT:    s_cbranch_execz .LBB0_2
202; GFX11W32-NEXT:  ; %bb.1:
203; GFX11W32-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
204; GFX11W32-NEXT:    s_bcnt1_i32_b32 s1, s1
205; GFX11W32-NEXT:    v_mov_b32_e32 v2, 0
206; GFX11W32-NEXT:    s_mul_i32 s1, s1, 5
207; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
208; GFX11W32-NEXT:    v_mov_b32_e32 v1, s1
209; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
210; GFX11W32-NEXT:    buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc
211; GFX11W32-NEXT:  .LBB0_2:
212; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
213; GFX11W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
214; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
215; GFX11W32-NEXT:    v_readfirstlane_b32 s2, v1
216; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
217; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
218; GFX11W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
219; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
220; GFX11W32-NEXT:    global_store_b32 v1, v0, s[0:1]
221; GFX11W32-NEXT:    s_endpgm
222;
223; GFX12W64-LABEL: add_i32_constant:
224; GFX12W64:       ; %bb.0: ; %entry
225; GFX12W64-NEXT:    s_mov_b64 s[2:3], exec
226; GFX12W64-NEXT:    s_mov_b64 s[0:1], exec
227; GFX12W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
228; GFX12W64-NEXT:    ; implicit-def: $vgpr1
229; GFX12W64-NEXT:    s_wait_alu 0xfffe
230; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
231; GFX12W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
232; GFX12W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
233; GFX12W64-NEXT:    s_cbranch_execz .LBB0_2
234; GFX12W64-NEXT:  ; %bb.1:
235; GFX12W64-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
236; GFX12W64-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
237; GFX12W64-NEXT:    v_mov_b32_e32 v2, 0
238; GFX12W64-NEXT:    s_wait_alu 0xfffe
239; GFX12W64-NEXT:    s_mul_i32 s2, s2, 5
240; GFX12W64-NEXT:    s_wait_alu 0xfffe
241; GFX12W64-NEXT:    v_mov_b32_e32 v1, s2
242; GFX12W64-NEXT:    s_wait_kmcnt 0x0
243; GFX12W64-NEXT:    buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
244; GFX12W64-NEXT:  .LBB0_2:
245; GFX12W64-NEXT:    s_or_b64 exec, exec, s[0:1]
246; GFX12W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
247; GFX12W64-NEXT:    s_wait_loadcnt 0x0
248; GFX12W64-NEXT:    v_readfirstlane_b32 s2, v1
249; GFX12W64-NEXT:    v_mov_b32_e32 v1, 0
250; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
251; GFX12W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
252; GFX12W64-NEXT:    s_wait_kmcnt 0x0
253; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
254; GFX12W64-NEXT:    s_endpgm
255;
256; GFX12W32-LABEL: add_i32_constant:
257; GFX12W32:       ; %bb.0: ; %entry
258; GFX12W32-NEXT:    s_mov_b32 s1, exec_lo
259; GFX12W32-NEXT:    s_mov_b32 s0, exec_lo
260; GFX12W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s1, 0
261; GFX12W32-NEXT:    ; implicit-def: $vgpr1
262; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
263; GFX12W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
264; GFX12W32-NEXT:    s_cbranch_execz .LBB0_2
265; GFX12W32-NEXT:  ; %bb.1:
266; GFX12W32-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
267; GFX12W32-NEXT:    s_wait_alu 0xfffe
268; GFX12W32-NEXT:    s_bcnt1_i32_b32 s1, s1
269; GFX12W32-NEXT:    s_wait_alu 0xfffe
270; GFX12W32-NEXT:    s_mul_i32 s1, s1, 5
271; GFX12W32-NEXT:    s_wait_alu 0xfffe
272; GFX12W32-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
273; GFX12W32-NEXT:    s_wait_kmcnt 0x0
274; GFX12W32-NEXT:    buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
275; GFX12W32-NEXT:  .LBB0_2:
276; GFX12W32-NEXT:    s_wait_alu 0xfffe
277; GFX12W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
278; GFX12W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
279; GFX12W32-NEXT:    s_wait_loadcnt 0x0
280; GFX12W32-NEXT:    v_readfirstlane_b32 s2, v1
281; GFX12W32-NEXT:    v_mov_b32_e32 v1, 0
282; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
283; GFX12W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
284; GFX12W32-NEXT:    s_wait_kmcnt 0x0
285; GFX12W32-NEXT:    global_store_b32 v1, v0, s[0:1]
286; GFX12W32-NEXT:    s_endpgm
287entry:
  ; The added value is the immediate 5; the four trailing i32 operands are all
  ; zero.
288  %old = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.add(i32 5, ptr addrspace(8) %inout, i32 0, i32 0, i32 0, i32 0)
  ; Storing the returned value keeps the result of the atomic live.
289  store i32 %old, ptr addrspace(1) %out
290  ret void
291}
292
; Atomic add of the kernel argument %additive to a struct buffer; the value
; returned by the atomic is stored to %out.
;
; The expected code below follows the same single-lane shape as the constant
; case, except that the multiplier is loaded from the kernel arguments instead
; of being an immediate: the atomic operand is the loaded argument times the
; population count of the saved exec mask (s_bcnt1 followed by s_mul_i32), and
; each lane rebuilds its result as the v_readfirstlane value plus the argument
; times its mbcnt lane count (v_mul_lo_u32 plus an add on the older targets, a
; single 64-bit multiply-add on the newer ones).
;
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
293define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(8) %inout, i32 %additive) {
294; GFX6-LABEL: add_i32_uniform:
295; GFX6:       ; %bb.0: ; %entry
296; GFX6-NEXT:    s_mov_b64 s[2:3], exec
297; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x11
298; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
299; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
300; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
301; GFX6-NEXT:    ; implicit-def: $vgpr1
302; GFX6-NEXT:    s_and_saveexec_b64 s[0:1], vcc
303; GFX6-NEXT:    s_cbranch_execz .LBB1_2
304; GFX6-NEXT:  ; %bb.1:
305; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
306; GFX6-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
307; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
308; GFX6-NEXT:    s_mul_i32 s2, s6, s2
309; GFX6-NEXT:    v_mov_b32_e32 v1, s2
310; GFX6-NEXT:    v_mov_b32_e32 v2, 0
311; GFX6-NEXT:    buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
312; GFX6-NEXT:  .LBB1_2:
313; GFX6-NEXT:    s_or_b64 exec, exec, s[0:1]
314; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
315; GFX6-NEXT:    s_mov_b32 s3, 0xf000
316; GFX6-NEXT:    s_mov_b32 s2, -1
317; GFX6-NEXT:    s_waitcnt vmcnt(0)
318; GFX6-NEXT:    v_readfirstlane_b32 s4, v1
319; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
320; GFX6-NEXT:    v_mul_lo_u32 v0, s6, v0
321; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
322; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
323; GFX6-NEXT:    s_endpgm
324;
325; GFX8-LABEL: add_i32_uniform:
326; GFX8:       ; %bb.0: ; %entry
327; GFX8-NEXT:    s_load_dword s6, s[4:5], 0x44
328; GFX8-NEXT:    s_mov_b64 s[2:3], exec
329; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
330; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
331; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
332; GFX8-NEXT:    ; implicit-def: $vgpr1
333; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
334; GFX8-NEXT:    s_cbranch_execz .LBB1_2
335; GFX8-NEXT:  ; %bb.1:
336; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
337; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
338; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
339; GFX8-NEXT:    s_mul_i32 s2, s6, s2
340; GFX8-NEXT:    v_mov_b32_e32 v1, s2
341; GFX8-NEXT:    v_mov_b32_e32 v2, 0
342; GFX8-NEXT:    buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
343; GFX8-NEXT:  .LBB1_2:
344; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
345; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
346; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
347; GFX8-NEXT:    v_mul_lo_u32 v0, s6, v0
348; GFX8-NEXT:    s_waitcnt vmcnt(0)
349; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
350; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s2, v0
351; GFX8-NEXT:    v_mov_b32_e32 v0, s0
352; GFX8-NEXT:    v_mov_b32_e32 v1, s1
353; GFX8-NEXT:    flat_store_dword v[0:1], v2
354; GFX8-NEXT:    s_endpgm
355;
356; GFX9-LABEL: add_i32_uniform:
357; GFX9:       ; %bb.0: ; %entry
358; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x44
359; GFX9-NEXT:    s_mov_b64 s[2:3], exec
360; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
361; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
362; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
363; GFX9-NEXT:    ; implicit-def: $vgpr1
364; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
365; GFX9-NEXT:    s_cbranch_execz .LBB1_2
366; GFX9-NEXT:  ; %bb.1:
367; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
368; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
369; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
370; GFX9-NEXT:    s_mul_i32 s2, s6, s2
371; GFX9-NEXT:    v_mov_b32_e32 v1, s2
372; GFX9-NEXT:    v_mov_b32_e32 v2, 0
373; GFX9-NEXT:    buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
374; GFX9-NEXT:  .LBB1_2:
375; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
376; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
377; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
378; GFX9-NEXT:    v_mul_lo_u32 v0, s6, v0
379; GFX9-NEXT:    s_waitcnt vmcnt(0)
380; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
381; GFX9-NEXT:    v_mov_b32_e32 v2, 0
382; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
383; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
384; GFX9-NEXT:    s_endpgm
385;
386; GFX10W64-LABEL: add_i32_uniform:
387; GFX10W64:       ; %bb.0: ; %entry
388; GFX10W64-NEXT:    s_load_dword s6, s[4:5], 0x44
389; GFX10W64-NEXT:    s_mov_b64 s[2:3], exec
390; GFX10W64-NEXT:    ; implicit-def: $vgpr1
391; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
392; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
393; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
394; GFX10W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
395; GFX10W64-NEXT:    s_cbranch_execz .LBB1_2
396; GFX10W64-NEXT:  ; %bb.1:
397; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
398; GFX10W64-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
399; GFX10W64-NEXT:    v_mov_b32_e32 v2, 0
400; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
401; GFX10W64-NEXT:    s_mul_i32 s2, s6, s2
402; GFX10W64-NEXT:    v_mov_b32_e32 v1, s2
403; GFX10W64-NEXT:    buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
404; GFX10W64-NEXT:  .LBB1_2:
405; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
406; GFX10W64-NEXT:    s_or_b64 exec, exec, s[0:1]
407; GFX10W64-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
408; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
409; GFX10W64-NEXT:    v_readfirstlane_b32 s2, v1
410; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
411; GFX10W64-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3]
412; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
413; GFX10W64-NEXT:    global_store_dword v1, v0, s[0:1]
414; GFX10W64-NEXT:    s_endpgm
415;
416; GFX10W32-LABEL: add_i32_uniform:
417; GFX10W32:       ; %bb.0: ; %entry
418; GFX10W32-NEXT:    s_load_dword s0, s[4:5], 0x44
419; GFX10W32-NEXT:    s_mov_b32 s2, exec_lo
420; GFX10W32-NEXT:    ; implicit-def: $vgpr1
421; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
422; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
423; GFX10W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
424; GFX10W32-NEXT:    s_cbranch_execz .LBB1_2
425; GFX10W32-NEXT:  ; %bb.1:
426; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
427; GFX10W32-NEXT:    s_bcnt1_i32_b32 s2, s2
428; GFX10W32-NEXT:    v_mov_b32_e32 v2, 0
429; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
430; GFX10W32-NEXT:    s_mul_i32 s2, s0, s2
431; GFX10W32-NEXT:    v_mov_b32_e32 v1, s2
432; GFX10W32-NEXT:    buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
433; GFX10W32-NEXT:  .LBB1_2:
434; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
435; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
436; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
437; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
438; GFX10W32-NEXT:    s_mov_b32 null, 0
439; GFX10W32-NEXT:    v_readfirstlane_b32 s4, v1
440; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
441; GFX10W32-NEXT:    v_mad_u64_u32 v[0:1], s0, s0, v0, s[4:5]
442; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
443; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
444; GFX10W32-NEXT:    s_endpgm
445;
446; GFX11W64-LABEL: add_i32_uniform:
447; GFX11W64:       ; %bb.0: ; %entry
448; GFX11W64-NEXT:    s_load_b32 s6, s[4:5], 0x44
449; GFX11W64-NEXT:    s_mov_b64 s[2:3], exec
450; GFX11W64-NEXT:    s_mov_b64 s[0:1], exec
451; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
452; GFX11W64-NEXT:    ; implicit-def: $vgpr1
453; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
454; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
455; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
456; GFX11W64-NEXT:    s_cbranch_execz .LBB1_2
457; GFX11W64-NEXT:  ; %bb.1:
458; GFX11W64-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
459; GFX11W64-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
460; GFX11W64-NEXT:    v_mov_b32_e32 v2, 0
461; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
462; GFX11W64-NEXT:    s_mul_i32 s2, s6, s2
463; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
464; GFX11W64-NEXT:    v_mov_b32_e32 v1, s2
465; GFX11W64-NEXT:    buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc
466; GFX11W64-NEXT:  .LBB1_2:
467; GFX11W64-NEXT:    s_or_b64 exec, exec, s[0:1]
468; GFX11W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
469; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
470; GFX11W64-NEXT:    v_readfirstlane_b32 s2, v1
471; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
472; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
473; GFX11W64-NEXT:    v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3]
474; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
475; GFX11W64-NEXT:    global_store_b32 v0, v1, s[0:1]
476; GFX11W64-NEXT:    s_endpgm
477;
478; GFX11W32-LABEL: add_i32_uniform:
479; GFX11W32:       ; %bb.0: ; %entry
480; GFX11W32-NEXT:    s_load_b32 s0, s[4:5], 0x44
481; GFX11W32-NEXT:    s_mov_b32 s2, exec_lo
482; GFX11W32-NEXT:    s_mov_b32 s1, exec_lo
483; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
484; GFX11W32-NEXT:    ; implicit-def: $vgpr1
485; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
486; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
487; GFX11W32-NEXT:    s_cbranch_execz .LBB1_2
488; GFX11W32-NEXT:  ; %bb.1:
489; GFX11W32-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
490; GFX11W32-NEXT:    s_bcnt1_i32_b32 s2, s2
491; GFX11W32-NEXT:    v_mov_b32_e32 v2, 0
492; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
493; GFX11W32-NEXT:    s_mul_i32 s2, s0, s2
494; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
495; GFX11W32-NEXT:    v_mov_b32_e32 v1, s2
496; GFX11W32-NEXT:    buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc
497; GFX11W32-NEXT:  .LBB1_2:
498; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
499; GFX11W32-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
500; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
501; GFX11W32-NEXT:    v_readfirstlane_b32 s4, v1
502; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
503; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
504; GFX11W32-NEXT:    v_mad_u64_u32 v[1:2], null, s0, v0, s[4:5]
505; GFX11W32-NEXT:    v_mov_b32_e32 v0, 0
506; GFX11W32-NEXT:    global_store_b32 v0, v1, s[2:3]
507; GFX11W32-NEXT:    s_endpgm
508;
509; GFX12W64-LABEL: add_i32_uniform:
510; GFX12W64:       ; %bb.0: ; %entry
511; GFX12W64-NEXT:    s_load_b32 s6, s[4:5], 0x44
512; GFX12W64-NEXT:    s_mov_b64 s[2:3], exec
513; GFX12W64-NEXT:    s_mov_b64 s[0:1], exec
514; GFX12W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
515; GFX12W64-NEXT:    ; implicit-def: $vgpr1
516; GFX12W64-NEXT:    s_wait_alu 0xfffe
517; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
518; GFX12W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
519; GFX12W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
520; GFX12W64-NEXT:    s_cbranch_execz .LBB1_2
521; GFX12W64-NEXT:  ; %bb.1:
522; GFX12W64-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
523; GFX12W64-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
524; GFX12W64-NEXT:    v_mov_b32_e32 v2, 0
525; GFX12W64-NEXT:    s_wait_kmcnt 0x0
526; GFX12W64-NEXT:    s_wait_alu 0xfffe
527; GFX12W64-NEXT:    s_mul_i32 s2, s6, s2
528; GFX12W64-NEXT:    s_wait_alu 0xfffe
529; GFX12W64-NEXT:    v_mov_b32_e32 v1, s2
530; GFX12W64-NEXT:    buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
531; GFX12W64-NEXT:  .LBB1_2:
532; GFX12W64-NEXT:    s_or_b64 exec, exec, s[0:1]
533; GFX12W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
534; GFX12W64-NEXT:    s_wait_loadcnt 0x0
535; GFX12W64-NEXT:    v_readfirstlane_b32 s2, v1
536; GFX12W64-NEXT:    s_wait_kmcnt 0x0
537; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
538; GFX12W64-NEXT:    v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3]
539; GFX12W64-NEXT:    v_mov_b32_e32 v1, 0
540; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
541; GFX12W64-NEXT:    s_endpgm
542;
543; GFX12W32-LABEL: add_i32_uniform:
544; GFX12W32:       ; %bb.0: ; %entry
545; GFX12W32-NEXT:    s_load_b32 s0, s[4:5], 0x44
546; GFX12W32-NEXT:    s_mov_b32 s2, exec_lo
547; GFX12W32-NEXT:    s_mov_b32 s1, exec_lo
548; GFX12W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
549; GFX12W32-NEXT:    ; implicit-def: $vgpr1
550; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
551; GFX12W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
552; GFX12W32-NEXT:    s_cbranch_execz .LBB1_2
553; GFX12W32-NEXT:  ; %bb.1:
554; GFX12W32-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
555; GFX12W32-NEXT:    s_wait_alu 0xfffe
556; GFX12W32-NEXT:    s_bcnt1_i32_b32 s2, s2
557; GFX12W32-NEXT:    s_wait_kmcnt 0x0
558; GFX12W32-NEXT:    s_wait_alu 0xfffe
559; GFX12W32-NEXT:    s_mul_i32 s2, s0, s2
560; GFX12W32-NEXT:    s_wait_alu 0xfffe
561; GFX12W32-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
562; GFX12W32-NEXT:    buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
563; GFX12W32-NEXT:  .LBB1_2:
564; GFX12W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
565; GFX12W32-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
566; GFX12W32-NEXT:    s_wait_loadcnt 0x0
567; GFX12W32-NEXT:    v_readfirstlane_b32 s4, v1
568; GFX12W32-NEXT:    s_wait_kmcnt 0x0
569; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
570; GFX12W32-NEXT:    v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5]
571; GFX12W32-NEXT:    v_mov_b32_e32 v1, 0
572; GFX12W32-NEXT:    global_store_b32 v1, v0, s[2:3]
573; GFX12W32-NEXT:    s_endpgm
574entry:
  ; The added value is the kernel argument %additive; the four trailing i32
  ; operands are all zero.
575  %old = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.add(i32 %additive, ptr addrspace(8) %inout, i32 0, i32 0, i32 0, i32 0)
  ; Storing the returned value keeps the result of the atomic live.
576  store i32 %old, ptr addrspace(1) %out
577  ret void
578}
579
580define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
581; GFX6-LABEL: add_i32_varying_vdata:
582; GFX6:       ; %bb.0: ; %entry
583; GFX6-NEXT:    s_mov_b64 s[0:1], exec
584; GFX6-NEXT:    s_mov_b32 s2, 0
585; GFX6-NEXT:    ; implicit-def: $vgpr1
586; GFX6-NEXT:  .LBB2_1: ; %ComputeLoop
587; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
588; GFX6-NEXT:    s_ff1_i32_b64 s3, s[0:1]
589; GFX6-NEXT:    s_mov_b32 m0, s3
590; GFX6-NEXT:    v_readlane_b32 s8, v0, s3
591; GFX6-NEXT:    v_writelane_b32 v1, s2, m0
592; GFX6-NEXT:    s_lshl_b64 s[6:7], 1, s3
593; GFX6-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
594; GFX6-NEXT:    v_cmp_ne_u64_e64 s[6:7], s[0:1], 0
595; GFX6-NEXT:    s_and_b64 vcc, exec, s[6:7]
596; GFX6-NEXT:    s_add_i32 s2, s2, s8
597; GFX6-NEXT:    s_cbranch_vccnz .LBB2_1
598; GFX6-NEXT:  ; %bb.2: ; %ComputeEnd
599; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
600; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
601; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
602; GFX6-NEXT:    ; implicit-def: $vgpr0
603; GFX6-NEXT:    s_and_saveexec_b64 s[0:1], vcc
604; GFX6-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
605; GFX6-NEXT:    s_cbranch_execz .LBB2_4
606; GFX6-NEXT:  ; %bb.3:
607; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
608; GFX6-NEXT:    v_mov_b32_e32 v0, s2
609; GFX6-NEXT:    v_mov_b32_e32 v2, 0
610; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
611; GFX6-NEXT:    buffer_atomic_add v0, v2, s[8:11], 0 idxen glc
612; GFX6-NEXT:  .LBB2_4:
613; GFX6-NEXT:    s_or_b64 exec, exec, s[0:1]
614; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
615; GFX6-NEXT:    s_mov_b32 s3, 0xf000
616; GFX6-NEXT:    s_mov_b32 s2, -1
617; GFX6-NEXT:    s_waitcnt vmcnt(0)
618; GFX6-NEXT:    v_readfirstlane_b32 s4, v0
619; GFX6-NEXT:    s_waitcnt expcnt(0)
620; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v1
621; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
622; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
623; GFX6-NEXT:    s_endpgm
624;
625; GFX8-LABEL: add_i32_varying_vdata:
626; GFX8:       ; %bb.0: ; %entry
627; GFX8-NEXT:    s_mov_b64 s[0:1], exec
628; GFX8-NEXT:    s_mov_b32 s2, 0
629; GFX8-NEXT:    ; implicit-def: $vgpr1
630; GFX8-NEXT:  .LBB2_1: ; %ComputeLoop
631; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
632; GFX8-NEXT:    s_ff1_i32_b64 s3, s[0:1]
633; GFX8-NEXT:    s_mov_b32 m0, s3
634; GFX8-NEXT:    v_readlane_b32 s8, v0, s3
635; GFX8-NEXT:    s_lshl_b64 s[6:7], 1, s3
636; GFX8-NEXT:    v_writelane_b32 v1, s2, m0
637; GFX8-NEXT:    s_add_i32 s2, s2, s8
638; GFX8-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
639; GFX8-NEXT:    s_cmp_lg_u64 s[0:1], 0
640; GFX8-NEXT:    s_cbranch_scc1 .LBB2_1
641; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
642; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
643; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
644; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
645; GFX8-NEXT:    ; implicit-def: $vgpr0
646; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
647; GFX8-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
648; GFX8-NEXT:    s_cbranch_execz .LBB2_4
649; GFX8-NEXT:  ; %bb.3:
650; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
651; GFX8-NEXT:    v_mov_b32_e32 v0, s2
652; GFX8-NEXT:    v_mov_b32_e32 v2, 0
653; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
654; GFX8-NEXT:    buffer_atomic_add v0, v2, s[8:11], 0 idxen glc
655; GFX8-NEXT:  .LBB2_4:
656; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
657; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
658; GFX8-NEXT:    s_waitcnt vmcnt(0)
659; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
660; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s2, v1
661; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
662; GFX8-NEXT:    v_mov_b32_e32 v0, s0
663; GFX8-NEXT:    v_mov_b32_e32 v1, s1
664; GFX8-NEXT:    flat_store_dword v[0:1], v2
665; GFX8-NEXT:    s_endpgm
666;
667; GFX9-LABEL: add_i32_varying_vdata:
668; GFX9:       ; %bb.0: ; %entry
669; GFX9-NEXT:    s_mov_b64 s[0:1], exec
670; GFX9-NEXT:    s_mov_b32 s2, 0
671; GFX9-NEXT:    ; implicit-def: $vgpr1
672; GFX9-NEXT:  .LBB2_1: ; %ComputeLoop
673; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
674; GFX9-NEXT:    s_ff1_i32_b64 s3, s[0:1]
675; GFX9-NEXT:    s_mov_b32 m0, s3
676; GFX9-NEXT:    v_readlane_b32 s8, v0, s3
677; GFX9-NEXT:    s_lshl_b64 s[6:7], 1, s3
678; GFX9-NEXT:    v_writelane_b32 v1, s2, m0
679; GFX9-NEXT:    s_add_i32 s2, s2, s8
680; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
681; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
682; GFX9-NEXT:    s_cbranch_scc1 .LBB2_1
683; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
684; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
685; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
686; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
687; GFX9-NEXT:    ; implicit-def: $vgpr0
688; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
689; GFX9-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
690; GFX9-NEXT:    s_cbranch_execz .LBB2_4
691; GFX9-NEXT:  ; %bb.3:
692; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
693; GFX9-NEXT:    v_mov_b32_e32 v0, s2
694; GFX9-NEXT:    v_mov_b32_e32 v2, 0
695; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
696; GFX9-NEXT:    buffer_atomic_add v0, v2, s[8:11], 0 idxen glc
697; GFX9-NEXT:  .LBB2_4:
698; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
699; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
700; GFX9-NEXT:    s_waitcnt vmcnt(0)
701; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
702; GFX9-NEXT:    v_mov_b32_e32 v2, 0
703; GFX9-NEXT:    v_add_u32_e32 v0, s2, v1
704; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
705; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
706; GFX9-NEXT:    s_endpgm
707;
708; GFX10W64-LABEL: add_i32_varying_vdata:
709; GFX10W64:       ; %bb.0: ; %entry
710; GFX10W64-NEXT:    s_mov_b64 s[0:1], exec
711; GFX10W64-NEXT:    s_mov_b32 s2, 0
712; GFX10W64-NEXT:    ; implicit-def: $vgpr1
713; GFX10W64-NEXT:  .LBB2_1: ; %ComputeLoop
714; GFX10W64-NEXT:    ; =>This Inner Loop Header: Depth=1
715; GFX10W64-NEXT:    s_ff1_i32_b64 s3, s[0:1]
716; GFX10W64-NEXT:    v_readlane_b32 s8, v0, s3
717; GFX10W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
718; GFX10W64-NEXT:    v_writelane_b32 v1, s2, s3
719; GFX10W64-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
720; GFX10W64-NEXT:    s_add_i32 s2, s2, s8
721; GFX10W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
722; GFX10W64-NEXT:    s_cbranch_scc1 .LBB2_1
723; GFX10W64-NEXT:  ; %bb.2: ; %ComputeEnd
724; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
725; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
726; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
727; GFX10W64-NEXT:    ; implicit-def: $vgpr0
728; GFX10W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
729; GFX10W64-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
730; GFX10W64-NEXT:    s_cbranch_execz .LBB2_4
731; GFX10W64-NEXT:  ; %bb.3:
732; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
733; GFX10W64-NEXT:    v_mov_b32_e32 v0, s2
734; GFX10W64-NEXT:    v_mov_b32_e32 v2, 0
735; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
736; GFX10W64-NEXT:    buffer_atomic_add v0, v2, s[8:11], 0 idxen glc
737; GFX10W64-NEXT:  .LBB2_4:
738; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
739; GFX10W64-NEXT:    s_or_b64 exec, exec, s[0:1]
740; GFX10W64-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
741; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
742; GFX10W64-NEXT:    v_readfirstlane_b32 s2, v0
743; GFX10W64-NEXT:    v_mov_b32_e32 v0, 0
744; GFX10W64-NEXT:    v_add_nc_u32_e32 v1, s2, v1
745; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
746; GFX10W64-NEXT:    global_store_dword v0, v1, s[0:1]
747; GFX10W64-NEXT:    s_endpgm
748;
749; GFX10W32-LABEL: add_i32_varying_vdata:
750; GFX10W32:       ; %bb.0: ; %entry
751; GFX10W32-NEXT:    s_mov_b32 s1, exec_lo
752; GFX10W32-NEXT:    s_mov_b32 s0, 0
753; GFX10W32-NEXT:    ; implicit-def: $vgpr1
754; GFX10W32-NEXT:  .LBB2_1: ; %ComputeLoop
755; GFX10W32-NEXT:    ; =>This Inner Loop Header: Depth=1
756; GFX10W32-NEXT:    s_ff1_i32_b32 s2, s1
757; GFX10W32-NEXT:    v_readlane_b32 s3, v0, s2
758; GFX10W32-NEXT:    s_lshl_b32 s6, 1, s2
759; GFX10W32-NEXT:    v_writelane_b32 v1, s0, s2
760; GFX10W32-NEXT:    s_andn2_b32 s1, s1, s6
761; GFX10W32-NEXT:    s_add_i32 s0, s0, s3
762; GFX10W32-NEXT:    s_cmp_lg_u32 s1, 0
763; GFX10W32-NEXT:    s_cbranch_scc1 .LBB2_1
764; GFX10W32-NEXT:  ; %bb.2: ; %ComputeEnd
765; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
766; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
767; GFX10W32-NEXT:    ; implicit-def: $vgpr0
768; GFX10W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
769; GFX10W32-NEXT:    s_xor_b32 s1, exec_lo, s1
770; GFX10W32-NEXT:    s_cbranch_execz .LBB2_4
771; GFX10W32-NEXT:  ; %bb.3:
772; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
773; GFX10W32-NEXT:    v_mov_b32_e32 v0, s0
774; GFX10W32-NEXT:    v_mov_b32_e32 v2, 0
775; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
776; GFX10W32-NEXT:    buffer_atomic_add v0, v2, s[8:11], 0 idxen glc
777; GFX10W32-NEXT:  .LBB2_4:
778; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
779; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
780; GFX10W32-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
781; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
782; GFX10W32-NEXT:    v_readfirstlane_b32 s2, v0
783; GFX10W32-NEXT:    v_mov_b32_e32 v0, 0
784; GFX10W32-NEXT:    v_add_nc_u32_e32 v1, s2, v1
785; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
786; GFX10W32-NEXT:    global_store_dword v0, v1, s[0:1]
787; GFX10W32-NEXT:    s_endpgm
788;
789; GFX11W64-LABEL: add_i32_varying_vdata:
790; GFX11W64:       ; %bb.0: ; %entry
791; GFX11W64-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
792; GFX11W64-NEXT:    s_mov_b64 s[0:1], exec
793; GFX11W64-NEXT:    s_mov_b32 s2, 0
794; GFX11W64-NEXT:    ; implicit-def: $vgpr0
795; GFX11W64-NEXT:  .LBB2_1: ; %ComputeLoop
796; GFX11W64-NEXT:    ; =>This Inner Loop Header: Depth=1
797; GFX11W64-NEXT:    s_ctz_i32_b64 s3, s[0:1]
798; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
799; GFX11W64-NEXT:    v_readlane_b32 s8, v1, s3
800; GFX11W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
801; GFX11W64-NEXT:    v_writelane_b32 v0, s2, s3
802; GFX11W64-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
803; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
804; GFX11W64-NEXT:    s_add_i32 s2, s2, s8
805; GFX11W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
806; GFX11W64-NEXT:    s_cbranch_scc1 .LBB2_1
807; GFX11W64-NEXT:  ; %bb.2: ; %ComputeEnd
808; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
809; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
810; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
811; GFX11W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
812; GFX11W64-NEXT:    ; implicit-def: $vgpr1
813; GFX11W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
814; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
815; GFX11W64-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
816; GFX11W64-NEXT:    s_cbranch_execz .LBB2_4
817; GFX11W64-NEXT:  ; %bb.3:
818; GFX11W64-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
819; GFX11W64-NEXT:    v_mov_b32_e32 v1, s2
820; GFX11W64-NEXT:    v_mov_b32_e32 v2, 0
821; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
822; GFX11W64-NEXT:    buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc
823; GFX11W64-NEXT:  .LBB2_4:
824; GFX11W64-NEXT:    s_or_b64 exec, exec, s[0:1]
825; GFX11W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
826; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
827; GFX11W64-NEXT:    v_readfirstlane_b32 s2, v1
828; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
829; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
830; GFX11W64-NEXT:    v_add_nc_u32_e32 v0, s2, v0
831; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
832; GFX11W64-NEXT:    global_store_b32 v1, v0, s[0:1]
833; GFX11W64-NEXT:    s_endpgm
834;
835; GFX11W32-LABEL: add_i32_varying_vdata:
836; GFX11W32:       ; %bb.0: ; %entry
837; GFX11W32-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
838; GFX11W32-NEXT:    s_mov_b32 s1, exec_lo
839; GFX11W32-NEXT:    s_mov_b32 s0, 0
840; GFX11W32-NEXT:    ; implicit-def: $vgpr0
841; GFX11W32-NEXT:  .LBB2_1: ; %ComputeLoop
842; GFX11W32-NEXT:    ; =>This Inner Loop Header: Depth=1
843; GFX11W32-NEXT:    s_ctz_i32_b32 s2, s1
844; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
845; GFX11W32-NEXT:    v_readlane_b32 s3, v1, s2
846; GFX11W32-NEXT:    s_lshl_b32 s6, 1, s2
847; GFX11W32-NEXT:    v_writelane_b32 v0, s0, s2
848; GFX11W32-NEXT:    s_and_not1_b32 s1, s1, s6
849; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
850; GFX11W32-NEXT:    s_add_i32 s0, s0, s3
851; GFX11W32-NEXT:    s_cmp_lg_u32 s1, 0
852; GFX11W32-NEXT:    s_cbranch_scc1 .LBB2_1
853; GFX11W32-NEXT:  ; %bb.2: ; %ComputeEnd
854; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
855; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
856; GFX11W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
857; GFX11W32-NEXT:    ; implicit-def: $vgpr1
858; GFX11W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
859; GFX11W32-NEXT:    s_xor_b32 s1, exec_lo, s1
860; GFX11W32-NEXT:    s_cbranch_execz .LBB2_4
861; GFX11W32-NEXT:  ; %bb.3:
862; GFX11W32-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
863; GFX11W32-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, 0
864; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
865; GFX11W32-NEXT:    buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc
866; GFX11W32-NEXT:  .LBB2_4:
867; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
868; GFX11W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
869; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
870; GFX11W32-NEXT:    v_readfirstlane_b32 s2, v1
871; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
872; GFX11W32-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
873; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
874; GFX11W32-NEXT:    global_store_b32 v1, v0, s[0:1]
875; GFX11W32-NEXT:    s_endpgm
876;
877; GFX12W64-LABEL: add_i32_varying_vdata:
878; GFX12W64:       ; %bb.0: ; %entry
879; GFX12W64-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
880; GFX12W64-NEXT:    s_mov_b64 s[0:1], exec
881; GFX12W64-NEXT:    s_mov_b32 s2, 0
882; GFX12W64-NEXT:    ; implicit-def: $vgpr0
883; GFX12W64-NEXT:  .LBB2_1: ; %ComputeLoop
884; GFX12W64-NEXT:    ; =>This Inner Loop Header: Depth=1
885; GFX12W64-NEXT:    s_ctz_i32_b64 s3, s[0:1]
886; GFX12W64-NEXT:    s_wait_alu 0xfffe
887; GFX12W64-NEXT:    v_readlane_b32 s8, v1, s3
888; GFX12W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
889; GFX12W64-NEXT:    v_writelane_b32 v0, s2, s3
890; GFX12W64-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
891; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
892; GFX12W64-NEXT:    s_add_co_i32 s2, s2, s8
893; GFX12W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
894; GFX12W64-NEXT:    s_cbranch_scc1 .LBB2_1
895; GFX12W64-NEXT:  ; %bb.2: ; %ComputeEnd
896; GFX12W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
897; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
898; GFX12W64-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
899; GFX12W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
900; GFX12W64-NEXT:    ; implicit-def: $vgpr1
901; GFX12W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
902; GFX12W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
903; GFX12W64-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
904; GFX12W64-NEXT:    s_cbranch_execz .LBB2_4
905; GFX12W64-NEXT:  ; %bb.3:
906; GFX12W64-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
907; GFX12W64-NEXT:    v_mov_b32_e32 v2, 0
908; GFX12W64-NEXT:    s_wait_alu 0xfffe
909; GFX12W64-NEXT:    v_mov_b32_e32 v1, s2
910; GFX12W64-NEXT:    s_wait_kmcnt 0x0
911; GFX12W64-NEXT:    buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
912; GFX12W64-NEXT:  .LBB2_4:
913; GFX12W64-NEXT:    s_or_b64 exec, exec, s[0:1]
914; GFX12W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
915; GFX12W64-NEXT:    s_wait_loadcnt 0x0
916; GFX12W64-NEXT:    v_readfirstlane_b32 s2, v1
917; GFX12W64-NEXT:    v_mov_b32_e32 v1, 0
918; GFX12W64-NEXT:    s_wait_alu 0xfffe
919; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
920; GFX12W64-NEXT:    v_add_nc_u32_e32 v0, s2, v0
921; GFX12W64-NEXT:    s_wait_kmcnt 0x0
922; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
923; GFX12W64-NEXT:    s_endpgm
924;
925; GFX12W32-LABEL: add_i32_varying_vdata:
926; GFX12W32:       ; %bb.0: ; %entry
927; GFX12W32-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
928; GFX12W32-NEXT:    s_mov_b32 s1, exec_lo
929; GFX12W32-NEXT:    s_mov_b32 s0, 0
930; GFX12W32-NEXT:    ; implicit-def: $vgpr0
931; GFX12W32-NEXT:  .LBB2_1: ; %ComputeLoop
932; GFX12W32-NEXT:    ; =>This Inner Loop Header: Depth=1
933; GFX12W32-NEXT:    s_wait_alu 0xfffe
934; GFX12W32-NEXT:    s_ctz_i32_b32 s2, s1
935; GFX12W32-NEXT:    s_wait_alu 0xfffe
936; GFX12W32-NEXT:    v_readlane_b32 s3, v1, s2
937; GFX12W32-NEXT:    s_lshl_b32 s6, 1, s2
938; GFX12W32-NEXT:    v_writelane_b32 v0, s0, s2
939; GFX12W32-NEXT:    s_and_not1_b32 s1, s1, s6
940; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
941; GFX12W32-NEXT:    s_add_co_i32 s0, s0, s3
942; GFX12W32-NEXT:    s_wait_alu 0xfffe
943; GFX12W32-NEXT:    s_cmp_lg_u32 s1, 0
944; GFX12W32-NEXT:    s_cbranch_scc1 .LBB2_1
945; GFX12W32-NEXT:  ; %bb.2: ; %ComputeEnd
946; GFX12W32-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
947; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
948; GFX12W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
949; GFX12W32-NEXT:    ; implicit-def: $vgpr1
950; GFX12W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
951; GFX12W32-NEXT:    s_wait_alu 0xfffe
952; GFX12W32-NEXT:    s_xor_b32 s1, exec_lo, s1
953; GFX12W32-NEXT:    s_cbranch_execz .LBB2_4
954; GFX12W32-NEXT:  ; %bb.3:
955; GFX12W32-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
956; GFX12W32-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0
957; GFX12W32-NEXT:    s_wait_kmcnt 0x0
958; GFX12W32-NEXT:    buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
959; GFX12W32-NEXT:  .LBB2_4:
960; GFX12W32-NEXT:    s_wait_alu 0xfffe
961; GFX12W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
962; GFX12W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
963; GFX12W32-NEXT:    s_wait_loadcnt 0x0
964; GFX12W32-NEXT:    v_readfirstlane_b32 s2, v1
965; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
966; GFX12W32-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
967; GFX12W32-NEXT:    s_wait_kmcnt 0x0
968; GFX12W32-NEXT:    global_store_b32 v1, v0, s[0:1]
969; GFX12W32-NEXT:    s_endpgm
970entry:
971  %lane = call i32 @llvm.amdgcn.workitem.id.x()
972  %old = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.add(i32 %lane, ptr addrspace(8) %inout, i32 0, i32 0, i32 0, i32 0)
973  store i32 %old, ptr addrspace(1) %out
974  ret void
975}
976
; Test: struct buffer atomic add where the workitem id is passed as the third
; intrinsic operand (the buffer index, going by the test name) and the value
; being added is the constant 1.
; Every check block below expects a single buffer_atomic_add /
; buffer_atomic_add_u32 with "idxen" that uses v0 as its index operand
; (masked with 0x3ff first on GFX11/GFX12). None of the check blocks contains
; a ComputeLoop or an mbcnt/readfirstlane sequence, i.e. the atomic is
; expected to be emitted once per lane as written.
; NOTE(review): the check lines are generated by
; utils/update_llc_test_checks.py (see the NOTE at the top of the file);
; regenerate them rather than editing them by hand.
977define amdgpu_kernel void @add_i32_varying_vindex(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
978; GFX6-LABEL: add_i32_varying_vindex:
979; GFX6:       ; %bb.0: ; %entry
980; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xd
981; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
982; GFX6-NEXT:    v_mov_b32_e32 v1, 1
983; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
984; GFX6-NEXT:    buffer_atomic_add v1, v0, s[0:3], 0 idxen glc
985; GFX6-NEXT:    s_mov_b32 s7, 0xf000
986; GFX6-NEXT:    s_mov_b32 s6, -1
987; GFX6-NEXT:    s_waitcnt vmcnt(0)
988; GFX6-NEXT:    buffer_store_dword v1, off, s[4:7], 0
989; GFX6-NEXT:    s_endpgm
990;
991; GFX8-LABEL: add_i32_varying_vindex:
992; GFX8:       ; %bb.0: ; %entry
993; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
994; GFX8-NEXT:    v_mov_b32_e32 v2, 1
995; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
996; GFX8-NEXT:    buffer_atomic_add v2, v0, s[0:3], 0 idxen glc
997; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
998; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
999; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1000; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1001; GFX8-NEXT:    s_waitcnt vmcnt(0)
1002; GFX8-NEXT:    flat_store_dword v[0:1], v2
1003; GFX8-NEXT:    s_endpgm
1004;
1005; GFX9-LABEL: add_i32_varying_vindex:
1006; GFX9:       ; %bb.0: ; %entry
1007; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
1008; GFX9-NEXT:    v_mov_b32_e32 v1, 1
1009; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1010; GFX9-NEXT:    buffer_atomic_add v1, v0, s[0:3], 0 idxen glc
1011; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1012; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1013; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1014; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1015; GFX9-NEXT:    s_endpgm
1016;
1017; GFX10-LABEL: add_i32_varying_vindex:
1018; GFX10:       ; %bb.0: ; %entry
1019; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
1020; GFX10-NEXT:    v_mov_b32_e32 v1, 1
1021; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1022; GFX10-NEXT:    buffer_atomic_add v1, v0, s[0:3], 0 idxen glc
1023; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
1024; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1025; GFX10-NEXT:    v_mov_b32_e32 v0, 0
1026; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1027; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
1028; GFX10-NEXT:    s_endpgm
1029;
1030; GFX11W64-LABEL: add_i32_varying_vindex:
1031; GFX11W64:       ; %bb.0: ; %entry
1032; GFX11W64-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
1033; GFX11W64-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1034; GFX11W64-NEXT:    v_mov_b32_e32 v1, 1
1035; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
1036; GFX11W64-NEXT:    buffer_atomic_add_u32 v1, v0, s[0:3], 0 idxen glc
1037; GFX11W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1038; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
1039; GFX11W64-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1040; GFX11W64-NEXT:    global_store_b32 v0, v1, s[0:1]
1041; GFX11W64-NEXT:    s_endpgm
1042;
1043; GFX11W32-LABEL: add_i32_varying_vindex:
1044; GFX11W32:       ; %bb.0: ; %entry
1045; GFX11W32-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
1046; GFX11W32-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
1047; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
1048; GFX11W32-NEXT:    buffer_atomic_add_u32 v1, v0, s[0:3], 0 idxen glc
1049; GFX11W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1050; GFX11W32-NEXT:    v_mov_b32_e32 v0, 0
1051; GFX11W32-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1052; GFX11W32-NEXT:    global_store_b32 v0, v1, s[0:1]
1053; GFX11W32-NEXT:    s_endpgm
1054;
1055; GFX12W64-LABEL: add_i32_varying_vindex:
1056; GFX12W64:       ; %bb.0: ; %entry
1057; GFX12W64-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
1058; GFX12W64-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1059; GFX12W64-NEXT:    v_mov_b32_e32 v1, 1
1060; GFX12W64-NEXT:    s_wait_kmcnt 0x0
1061; GFX12W64-NEXT:    buffer_atomic_add_u32 v1, v0, s[0:3], null idxen th:TH_ATOMIC_RETURN
1062; GFX12W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1063; GFX12W64-NEXT:    v_mov_b32_e32 v0, 0
1064; GFX12W64-NEXT:    s_wait_loadcnt 0x0
1065; GFX12W64-NEXT:    s_wait_kmcnt 0x0
1066; GFX12W64-NEXT:    global_store_b32 v0, v1, s[0:1]
1067; GFX12W64-NEXT:    s_endpgm
1068;
1069; GFX12W32-LABEL: add_i32_varying_vindex:
1070; GFX12W32:       ; %bb.0: ; %entry
1071; GFX12W32-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
1072; GFX12W32-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
1073; GFX12W32-NEXT:    s_wait_kmcnt 0x0
1074; GFX12W32-NEXT:    buffer_atomic_add_u32 v1, v0, s[0:3], null idxen th:TH_ATOMIC_RETURN
1075; GFX12W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1076; GFX12W32-NEXT:    v_mov_b32_e32 v0, 0
1077; GFX12W32-NEXT:    s_wait_loadcnt 0x0
1078; GFX12W32-NEXT:    s_wait_kmcnt 0x0
1079; GFX12W32-NEXT:    global_store_b32 v0, v1, s[0:1]
1080; GFX12W32-NEXT:    s_endpgm
1081entry:
  ; %lane is the x workitem id. It is passed as the third operand of the
  ; atomic add; the first operand (value to add) is the constant 1 and the
  ; remaining integer operands are 0.
1082  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1083  %old = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.add(i32 1, ptr addrspace(8) %inout, i32 %lane, i32 0, i32 0, i32 0)
  ; The value returned by the atomic is written to %out.
1084  store i32 %old, ptr addrspace(1) %out
1085  ret void
1086}
1087
; Test: struct buffer atomic add where the workitem id is passed as the fourth
; intrinsic operand (the per-lane offset, going by the test name); the value
; being added is the constant 1 and the third operand is the constant 0.
; Every check block below expects a single buffer_atomic_add /
; buffer_atomic_add_u32 with "idxen offen" whose address operand is the
; register pair v[0:1]: v0 is set to zero and v1 receives the incoming v0
; (masked with 0x3ff on GFX11/GFX12). None of the check blocks contains a
; ComputeLoop or an mbcnt/readfirstlane sequence, i.e. the atomic is expected
; to be emitted once per lane as written.
; On GFX12 the zero already held in v0 is also used as the address register of
; the final global_store_b32, so no second "v_mov_b32 v0, 0" appears there.
; NOTE(review): the check lines are generated by
; utils/update_llc_test_checks.py (see the NOTE at the top of the file);
; regenerate them rather than editing them by hand.
1088define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
1089; GFX6-LABEL: add_i32_varying_offset:
1090; GFX6:       ; %bb.0: ; %entry
1091; GFX6-NEXT:    v_mov_b32_e32 v1, v0
1092; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xd
1093; GFX6-NEXT:    s_mov_b32 s6, 0
1094; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
1095; GFX6-NEXT:    v_mov_b32_e32 v0, s6
1096; GFX6-NEXT:    v_mov_b32_e32 v2, 1
1097; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1098; GFX6-NEXT:    buffer_atomic_add v2, v[0:1], s[0:3], 0 idxen offen glc
1099; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1100; GFX6-NEXT:    s_mov_b32 s6, -1
1101; GFX6-NEXT:    s_waitcnt vmcnt(0)
1102; GFX6-NEXT:    buffer_store_dword v2, off, s[4:7], 0
1103; GFX6-NEXT:    s_endpgm
1104;
1105; GFX8-LABEL: add_i32_varying_offset:
1106; GFX8:       ; %bb.0: ; %entry
1107; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
1108; GFX8-NEXT:    s_mov_b32 s6, 0
1109; GFX8-NEXT:    v_mov_b32_e32 v1, v0
1110; GFX8-NEXT:    v_mov_b32_e32 v0, s6
1111; GFX8-NEXT:    v_mov_b32_e32 v2, 1
1112; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1113; GFX8-NEXT:    buffer_atomic_add v2, v[0:1], s[0:3], 0 idxen offen glc
1114; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1115; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1116; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1117; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1118; GFX8-NEXT:    s_waitcnt vmcnt(0)
1119; GFX8-NEXT:    flat_store_dword v[0:1], v2
1120; GFX8-NEXT:    s_endpgm
1121;
1122; GFX9-LABEL: add_i32_varying_offset:
1123; GFX9:       ; %bb.0: ; %entry
1124; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
1125; GFX9-NEXT:    s_mov_b32 s6, 0
1126; GFX9-NEXT:    v_mov_b32_e32 v1, v0
1127; GFX9-NEXT:    v_mov_b32_e32 v0, s6
1128; GFX9-NEXT:    v_mov_b32_e32 v2, 1
1129; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1130; GFX9-NEXT:    buffer_atomic_add v2, v[0:1], s[0:3], 0 idxen offen glc
1131; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1132; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1133; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1134; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
1135; GFX9-NEXT:    s_endpgm
1136;
1137; GFX10-LABEL: add_i32_varying_offset:
1138; GFX10:       ; %bb.0: ; %entry
1139; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
1140; GFX10-NEXT:    s_mov_b32 s6, 0
1141; GFX10-NEXT:    v_mov_b32_e32 v1, v0
1142; GFX10-NEXT:    v_mov_b32_e32 v0, s6
1143; GFX10-NEXT:    v_mov_b32_e32 v2, 1
1144; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1145; GFX10-NEXT:    buffer_atomic_add v2, v[0:1], s[0:3], 0 idxen offen glc
1146; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
1147; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1148; GFX10-NEXT:    v_mov_b32_e32 v0, 0
1149; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1150; GFX10-NEXT:    global_store_dword v0, v2, s[0:1]
1151; GFX10-NEXT:    s_endpgm
1152;
1153; GFX11W64-LABEL: add_i32_varying_offset:
1154; GFX11W64:       ; %bb.0: ; %entry
1155; GFX11W64-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
1156; GFX11W64-NEXT:    s_mov_b32 s6, 0
1157; GFX11W64-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
1158; GFX11W64-NEXT:    v_mov_b32_e32 v0, s6
1159; GFX11W64-NEXT:    v_mov_b32_e32 v2, 1
1160; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
1161; GFX11W64-NEXT:    buffer_atomic_add_u32 v2, v[0:1], s[0:3], 0 idxen offen glc
1162; GFX11W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1163; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
1164; GFX11W64-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1165; GFX11W64-NEXT:    global_store_b32 v0, v2, s[0:1]
1166; GFX11W64-NEXT:    s_endpgm
1167;
1168; GFX11W32-LABEL: add_i32_varying_offset:
1169; GFX11W32:       ; %bb.0: ; %entry
1170; GFX11W32-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
1171; GFX11W32-NEXT:    s_mov_b32 s6, 0
1172; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1173; GFX11W32-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_and_b32 v1, 0x3ff, v0
1174; GFX11W32-NEXT:    v_mov_b32_e32 v2, 1
1175; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
1176; GFX11W32-NEXT:    buffer_atomic_add_u32 v2, v[0:1], s[0:3], 0 idxen offen glc
1177; GFX11W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1178; GFX11W32-NEXT:    v_mov_b32_e32 v0, 0
1179; GFX11W32-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1180; GFX11W32-NEXT:    global_store_b32 v0, v2, s[0:1]
1181; GFX11W32-NEXT:    s_endpgm
1182;
1183; GFX12W64-LABEL: add_i32_varying_offset:
1184; GFX12W64:       ; %bb.0: ; %entry
1185; GFX12W64-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
1186; GFX12W64-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
1187; GFX12W64-NEXT:    v_mov_b32_e32 v0, 0
1188; GFX12W64-NEXT:    v_mov_b32_e32 v2, 1
1189; GFX12W64-NEXT:    s_wait_kmcnt 0x0
1190; GFX12W64-NEXT:    buffer_atomic_add_u32 v2, v[0:1], s[0:3], null idxen offen th:TH_ATOMIC_RETURN
1191; GFX12W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1192; GFX12W64-NEXT:    s_wait_loadcnt 0x0
1193; GFX12W64-NEXT:    s_wait_kmcnt 0x0
1194; GFX12W64-NEXT:    global_store_b32 v0, v2, s[0:1]
1195; GFX12W64-NEXT:    s_endpgm
1196;
1197; GFX12W32-LABEL: add_i32_varying_offset:
1198; GFX12W32:       ; %bb.0: ; %entry
1199; GFX12W32-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
1200; GFX12W32-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, 0x3ff, v0
1201; GFX12W32-NEXT:    v_mov_b32_e32 v2, 1
1202; GFX12W32-NEXT:    s_wait_kmcnt 0x0
1203; GFX12W32-NEXT:    buffer_atomic_add_u32 v2, v[0:1], s[0:3], null idxen offen th:TH_ATOMIC_RETURN
1204; GFX12W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1205; GFX12W32-NEXT:    s_wait_loadcnt 0x0
1206; GFX12W32-NEXT:    s_wait_kmcnt 0x0
1207; GFX12W32-NEXT:    global_store_b32 v0, v2, s[0:1]
1208; GFX12W32-NEXT:    s_endpgm
1209entry:
  ; %lane is the x workitem id. It is passed as the fourth operand of the
  ; atomic add; the first operand (value to add) is the constant 1 and the
  ; remaining integer operands are 0.
1210  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1211  %old = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.add(i32 1, ptr addrspace(8) %inout, i32 0, i32 %lane, i32 0, i32 0)
  ; The value returned by the atomic is written to %out.
1212  store i32 %old, ptr addrspace(1) %out
1213  ret void
1214}
1215
1216define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
1217; GFX6-LABEL: sub_i32_constant:
1218; GFX6:       ; %bb.0: ; %entry
1219; GFX6-NEXT:    s_mov_b64 s[2:3], exec
1220; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1221; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
1222; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1223; GFX6-NEXT:    ; implicit-def: $vgpr1
1224; GFX6-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1225; GFX6-NEXT:    s_cbranch_execz .LBB5_2
1226; GFX6-NEXT:  ; %bb.1:
1227; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
1228; GFX6-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1229; GFX6-NEXT:    s_mul_i32 s2, s2, 5
1230; GFX6-NEXT:    v_mov_b32_e32 v1, s2
1231; GFX6-NEXT:    v_mov_b32_e32 v2, 0
1232; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1233; GFX6-NEXT:    buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
1234; GFX6-NEXT:  .LBB5_2:
1235; GFX6-NEXT:    s_or_b64 exec, exec, s[0:1]
1236; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1237; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1238; GFX6-NEXT:    s_mov_b32 s2, -1
1239; GFX6-NEXT:    s_waitcnt vmcnt(0)
1240; GFX6-NEXT:    v_readfirstlane_b32 s4, v1
1241; GFX6-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1242; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
1243; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1244; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1245; GFX6-NEXT:    s_endpgm
1246;
1247; GFX8-LABEL: sub_i32_constant:
1248; GFX8:       ; %bb.0: ; %entry
1249; GFX8-NEXT:    s_mov_b64 s[2:3], exec
1250; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1251; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1252; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1253; GFX8-NEXT:    ; implicit-def: $vgpr1
1254; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1255; GFX8-NEXT:    s_cbranch_execz .LBB5_2
1256; GFX8-NEXT:  ; %bb.1:
1257; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
1258; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1259; GFX8-NEXT:    s_mul_i32 s2, s2, 5
1260; GFX8-NEXT:    v_mov_b32_e32 v1, s2
1261; GFX8-NEXT:    v_mov_b32_e32 v2, 0
1262; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1263; GFX8-NEXT:    buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
1264; GFX8-NEXT:  .LBB5_2:
1265; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
1266; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1267; GFX8-NEXT:    s_waitcnt vmcnt(0)
1268; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
1269; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1270; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s2, v0
1271; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1272; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1273; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1274; GFX8-NEXT:    flat_store_dword v[0:1], v2
1275; GFX8-NEXT:    s_endpgm
1276;
1277; GFX9-LABEL: sub_i32_constant:
1278; GFX9:       ; %bb.0: ; %entry
1279; GFX9-NEXT:    s_mov_b64 s[2:3], exec
1280; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1281; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1282; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1283; GFX9-NEXT:    ; implicit-def: $vgpr1
1284; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1285; GFX9-NEXT:    s_cbranch_execz .LBB5_2
1286; GFX9-NEXT:  ; %bb.1:
1287; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
1288; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1289; GFX9-NEXT:    s_mul_i32 s2, s2, 5
1290; GFX9-NEXT:    v_mov_b32_e32 v1, s2
1291; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1292; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1293; GFX9-NEXT:    buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
1294; GFX9-NEXT:  .LBB5_2:
1295; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
1296; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1297; GFX9-NEXT:    s_waitcnt vmcnt(0)
1298; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
1299; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1300; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1301; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
1302; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1303; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
1304; GFX9-NEXT:    s_endpgm
1305;
1306; GFX10W64-LABEL: sub_i32_constant:
1307; GFX10W64:       ; %bb.0: ; %entry
1308; GFX10W64-NEXT:    s_mov_b64 s[2:3], exec
1309; GFX10W64-NEXT:    ; implicit-def: $vgpr1
1310; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1311; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1312; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1313; GFX10W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1314; GFX10W64-NEXT:    s_cbranch_execz .LBB5_2
1315; GFX10W64-NEXT:  ; %bb.1:
1316; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
1317; GFX10W64-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1318; GFX10W64-NEXT:    v_mov_b32_e32 v2, 0
1319; GFX10W64-NEXT:    s_mul_i32 s2, s2, 5
1320; GFX10W64-NEXT:    v_mov_b32_e32 v1, s2
1321; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1322; GFX10W64-NEXT:    buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
1323; GFX10W64-NEXT:  .LBB5_2:
1324; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
1325; GFX10W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1326; GFX10W64-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1327; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
1328; GFX10W64-NEXT:    v_readfirstlane_b32 s2, v1
1329; GFX10W64-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1330; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
1331; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1332; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1333; GFX10W64-NEXT:    global_store_dword v1, v0, s[0:1]
1334; GFX10W64-NEXT:    s_endpgm
1335;
1336; GFX10W32-LABEL: sub_i32_constant:
1337; GFX10W32:       ; %bb.0: ; %entry
1338; GFX10W32-NEXT:    s_mov_b32 s1, exec_lo
1339; GFX10W32-NEXT:    ; implicit-def: $vgpr1
1340; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s1, 0
1341; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1342; GFX10W32-NEXT:    s_and_saveexec_b32 s0, vcc_lo
1343; GFX10W32-NEXT:    s_cbranch_execz .LBB5_2
1344; GFX10W32-NEXT:  ; %bb.1:
1345; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
1346; GFX10W32-NEXT:    s_bcnt1_i32_b32 s1, s1
1347; GFX10W32-NEXT:    v_mov_b32_e32 v2, 0
1348; GFX10W32-NEXT:    s_mul_i32 s1, s1, 5
1349; GFX10W32-NEXT:    v_mov_b32_e32 v1, s1
1350; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1351; GFX10W32-NEXT:    buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
1352; GFX10W32-NEXT:  .LBB5_2:
1353; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
1354; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1355; GFX10W32-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1356; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
1357; GFX10W32-NEXT:    v_readfirstlane_b32 s2, v1
1358; GFX10W32-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1359; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
1360; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1361; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1362; GFX10W32-NEXT:    global_store_dword v1, v0, s[0:1]
1363; GFX10W32-NEXT:    s_endpgm
1364;
1365; GFX11W64-LABEL: sub_i32_constant:
1366; GFX11W64:       ; %bb.0: ; %entry
1367; GFX11W64-NEXT:    s_mov_b64 s[2:3], exec
1368; GFX11W64-NEXT:    s_mov_b64 s[0:1], exec
1369; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1370; GFX11W64-NEXT:    ; implicit-def: $vgpr1
1371; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1372; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1373; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
1374; GFX11W64-NEXT:    s_cbranch_execz .LBB5_2
1375; GFX11W64-NEXT:  ; %bb.1:
1376; GFX11W64-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
1377; GFX11W64-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1378; GFX11W64-NEXT:    v_mov_b32_e32 v2, 0
1379; GFX11W64-NEXT:    s_mul_i32 s2, s2, 5
1380; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1381; GFX11W64-NEXT:    v_mov_b32_e32 v1, s2
1382; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
1383; GFX11W64-NEXT:    buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc
1384; GFX11W64-NEXT:  .LBB5_2:
1385; GFX11W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1386; GFX11W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1387; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
1388; GFX11W64-NEXT:    v_readfirstlane_b32 s2, v1
1389; GFX11W64-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1390; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
1391; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1392; GFX11W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1393; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
1394; GFX11W64-NEXT:    global_store_b32 v1, v0, s[0:1]
1395; GFX11W64-NEXT:    s_endpgm
1396;
1397; GFX11W32-LABEL: sub_i32_constant:
1398; GFX11W32:       ; %bb.0: ; %entry
1399; GFX11W32-NEXT:    s_mov_b32 s1, exec_lo
1400; GFX11W32-NEXT:    s_mov_b32 s0, exec_lo
1401; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s1, 0
1402; GFX11W32-NEXT:    ; implicit-def: $vgpr1
1403; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1404; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
1405; GFX11W32-NEXT:    s_cbranch_execz .LBB5_2
1406; GFX11W32-NEXT:  ; %bb.1:
1407; GFX11W32-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
1408; GFX11W32-NEXT:    s_bcnt1_i32_b32 s1, s1
1409; GFX11W32-NEXT:    v_mov_b32_e32 v2, 0
1410; GFX11W32-NEXT:    s_mul_i32 s1, s1, 5
1411; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1412; GFX11W32-NEXT:    v_mov_b32_e32 v1, s1
1413; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
1414; GFX11W32-NEXT:    buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc
1415; GFX11W32-NEXT:  .LBB5_2:
1416; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1417; GFX11W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1418; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
1419; GFX11W32-NEXT:    v_readfirstlane_b32 s2, v1
1420; GFX11W32-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1421; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
1422; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1423; GFX11W32-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1424; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
1425; GFX11W32-NEXT:    global_store_b32 v1, v0, s[0:1]
1426; GFX11W32-NEXT:    s_endpgm
1427;
1428; GFX12W64-LABEL: sub_i32_constant:
1429; GFX12W64:       ; %bb.0: ; %entry
1430; GFX12W64-NEXT:    s_mov_b64 s[2:3], exec
1431; GFX12W64-NEXT:    s_mov_b64 s[0:1], exec
1432; GFX12W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1433; GFX12W64-NEXT:    ; implicit-def: $vgpr1
1434; GFX12W64-NEXT:    s_wait_alu 0xfffe
1435; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1436; GFX12W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1437; GFX12W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
1438; GFX12W64-NEXT:    s_cbranch_execz .LBB5_2
1439; GFX12W64-NEXT:  ; %bb.1:
1440; GFX12W64-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
1441; GFX12W64-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1442; GFX12W64-NEXT:    v_mov_b32_e32 v2, 0
1443; GFX12W64-NEXT:    s_wait_alu 0xfffe
1444; GFX12W64-NEXT:    s_mul_i32 s2, s2, 5
1445; GFX12W64-NEXT:    s_wait_alu 0xfffe
1446; GFX12W64-NEXT:    v_mov_b32_e32 v1, s2
1447; GFX12W64-NEXT:    s_wait_kmcnt 0x0
1448; GFX12W64-NEXT:    buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
1449; GFX12W64-NEXT:  .LBB5_2:
1450; GFX12W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1451; GFX12W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1452; GFX12W64-NEXT:    s_wait_loadcnt 0x0
1453; GFX12W64-NEXT:    v_readfirstlane_b32 s2, v1
1454; GFX12W64-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1455; GFX12W64-NEXT:    v_mov_b32_e32 v1, 0
1456; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1457; GFX12W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1458; GFX12W64-NEXT:    s_wait_kmcnt 0x0
1459; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
1460; GFX12W64-NEXT:    s_endpgm
1461;
1462; GFX12W32-LABEL: sub_i32_constant:
1463; GFX12W32:       ; %bb.0: ; %entry
1464; GFX12W32-NEXT:    s_mov_b32 s1, exec_lo
1465; GFX12W32-NEXT:    s_mov_b32 s0, exec_lo
1466; GFX12W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s1, 0
1467; GFX12W32-NEXT:    ; implicit-def: $vgpr1
1468; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1469; GFX12W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
1470; GFX12W32-NEXT:    s_cbranch_execz .LBB5_2
1471; GFX12W32-NEXT:  ; %bb.1:
1472; GFX12W32-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
1473; GFX12W32-NEXT:    s_wait_alu 0xfffe
1474; GFX12W32-NEXT:    s_bcnt1_i32_b32 s1, s1
1475; GFX12W32-NEXT:    s_wait_alu 0xfffe
1476; GFX12W32-NEXT:    s_mul_i32 s1, s1, 5
1477; GFX12W32-NEXT:    s_wait_alu 0xfffe
1478; GFX12W32-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
1479; GFX12W32-NEXT:    s_wait_kmcnt 0x0
1480; GFX12W32-NEXT:    buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
1481; GFX12W32-NEXT:  .LBB5_2:
1482; GFX12W32-NEXT:    s_wait_alu 0xfffe
1483; GFX12W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1484; GFX12W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1485; GFX12W32-NEXT:    s_wait_loadcnt 0x0
1486; GFX12W32-NEXT:    v_readfirstlane_b32 s2, v1
1487; GFX12W32-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1488; GFX12W32-NEXT:    v_mov_b32_e32 v1, 0
1489; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1490; GFX12W32-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1491; GFX12W32-NEXT:    s_wait_kmcnt 0x0
1492; GFX12W32-NEXT:    global_store_b32 v1, v0, s[0:1]
1493; GFX12W32-NEXT:    s_endpgm
1494entry:
1495  %old = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.sub(i32 5, ptr addrspace(8) %inout, i32 0, i32 0, i32 0, i32 0)
1496  store i32 %old, ptr addrspace(1) %out
1497  ret void
1498}
1499
; sub_i32_uniform: struct-buffer atomic sub whose data operand is the i32
; kernel argument %subitive; the value returned by the atomic is stored to %out.
;
; What the autogenerated assertions below show for every checked target:
;   * %subitive is fetched with a scalar load (s_load_dword / s_load_b32) and
;     multiplied (s_mul_i32) by the population count of a copy of exec
;     (s_bcnt1_i32_b64 for wave64, s_bcnt1_i32_b32 for wave32).
;   * exec is narrowed to the lane whose v_mbcnt result is 0, and that lane
;     issues the one buffer_atomic_sub carrying the combined value
;     (returning form: glc, or th:TH_ATOMIC_RETURN on the gfx1200 runs).
;   * Once exec is restored, the returned value is read with
;     v_readfirstlane_b32 and each lane subtracts %subitive * mbcnt from it
;     before the store to %out.
; The assertions are generated by utils/update_llc_test_checks.py; regenerate
; them with that script instead of editing them by hand.
1500define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(8) %inout, i32 %subitive) {
1501; GFX6-LABEL: sub_i32_uniform:
1502; GFX6:       ; %bb.0: ; %entry
1503; GFX6-NEXT:    s_mov_b64 s[2:3], exec
1504; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x11
1505; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1506; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
1507; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1508; GFX6-NEXT:    ; implicit-def: $vgpr1
1509; GFX6-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1510; GFX6-NEXT:    s_cbranch_execz .LBB6_2
1511; GFX6-NEXT:  ; %bb.1:
1512; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
1513; GFX6-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1514; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1515; GFX6-NEXT:    s_mul_i32 s2, s6, s2
1516; GFX6-NEXT:    v_mov_b32_e32 v1, s2
1517; GFX6-NEXT:    v_mov_b32_e32 v2, 0
1518; GFX6-NEXT:    buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
1519; GFX6-NEXT:  .LBB6_2:
1520; GFX6-NEXT:    s_or_b64 exec, exec, s[0:1]
1521; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1522; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1523; GFX6-NEXT:    s_mov_b32 s2, -1
1524; GFX6-NEXT:    s_waitcnt vmcnt(0)
1525; GFX6-NEXT:    v_readfirstlane_b32 s4, v1
1526; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1527; GFX6-NEXT:    v_mul_lo_u32 v0, s6, v0
1528; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
1529; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1530; GFX6-NEXT:    s_endpgm
1531;
1532; GFX8-LABEL: sub_i32_uniform:
1533; GFX8:       ; %bb.0: ; %entry
1534; GFX8-NEXT:    s_load_dword s6, s[4:5], 0x44
1535; GFX8-NEXT:    s_mov_b64 s[2:3], exec
1536; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1537; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1538; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1539; GFX8-NEXT:    ; implicit-def: $vgpr1
1540; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1541; GFX8-NEXT:    s_cbranch_execz .LBB6_2
1542; GFX8-NEXT:  ; %bb.1:
1543; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
1544; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1545; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1546; GFX8-NEXT:    s_mul_i32 s2, s6, s2
1547; GFX8-NEXT:    v_mov_b32_e32 v1, s2
1548; GFX8-NEXT:    v_mov_b32_e32 v2, 0
1549; GFX8-NEXT:    buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
1550; GFX8-NEXT:  .LBB6_2:
1551; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
1552; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1553; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1554; GFX8-NEXT:    v_mul_lo_u32 v0, s6, v0
1555; GFX8-NEXT:    s_waitcnt vmcnt(0)
1556; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
1557; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s2, v0
1558; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1559; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1560; GFX8-NEXT:    flat_store_dword v[0:1], v2
1561; GFX8-NEXT:    s_endpgm
1562;
1563; GFX9-LABEL: sub_i32_uniform:
1564; GFX9:       ; %bb.0: ; %entry
1565; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x44
1566; GFX9-NEXT:    s_mov_b64 s[2:3], exec
1567; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1568; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1569; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1570; GFX9-NEXT:    ; implicit-def: $vgpr1
1571; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1572; GFX9-NEXT:    s_cbranch_execz .LBB6_2
1573; GFX9-NEXT:  ; %bb.1:
1574; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
1575; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1576; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1577; GFX9-NEXT:    s_mul_i32 s2, s6, s2
1578; GFX9-NEXT:    v_mov_b32_e32 v1, s2
1579; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1580; GFX9-NEXT:    buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
1581; GFX9-NEXT:  .LBB6_2:
1582; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
1583; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1584; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1585; GFX9-NEXT:    v_mul_lo_u32 v0, s6, v0
1586; GFX9-NEXT:    s_waitcnt vmcnt(0)
1587; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
1588; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1589; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
1590; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
1591; GFX9-NEXT:    s_endpgm
1592;
1593; GFX10W64-LABEL: sub_i32_uniform:
1594; GFX10W64:       ; %bb.0: ; %entry
1595; GFX10W64-NEXT:    s_load_dword s6, s[4:5], 0x44
1596; GFX10W64-NEXT:    s_mov_b64 s[2:3], exec
1597; GFX10W64-NEXT:    ; implicit-def: $vgpr1
1598; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1599; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1600; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1601; GFX10W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1602; GFX10W64-NEXT:    s_cbranch_execz .LBB6_2
1603; GFX10W64-NEXT:  ; %bb.1:
1604; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
1605; GFX10W64-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1606; GFX10W64-NEXT:    v_mov_b32_e32 v2, 0
1607; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1608; GFX10W64-NEXT:    s_mul_i32 s2, s6, s2
1609; GFX10W64-NEXT:    v_mov_b32_e32 v1, s2
1610; GFX10W64-NEXT:    buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
1611; GFX10W64-NEXT:  .LBB6_2:
1612; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
1613; GFX10W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1614; GFX10W64-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1615; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1616; GFX10W64-NEXT:    v_mul_lo_u32 v0, s6, v0
1617; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
1618; GFX10W64-NEXT:    v_readfirstlane_b32 s2, v1
1619; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
1620; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1621; GFX10W64-NEXT:    global_store_dword v1, v0, s[0:1]
1622; GFX10W64-NEXT:    s_endpgm
1623;
1624; GFX10W32-LABEL: sub_i32_uniform:
1625; GFX10W32:       ; %bb.0: ; %entry
1626; GFX10W32-NEXT:    s_load_dword s0, s[4:5], 0x44
1627; GFX10W32-NEXT:    s_mov_b32 s2, exec_lo
1628; GFX10W32-NEXT:    ; implicit-def: $vgpr1
1629; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1630; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1631; GFX10W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
1632; GFX10W32-NEXT:    s_cbranch_execz .LBB6_2
1633; GFX10W32-NEXT:  ; %bb.1:
1634; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
1635; GFX10W32-NEXT:    s_bcnt1_i32_b32 s2, s2
1636; GFX10W32-NEXT:    v_mov_b32_e32 v2, 0
1637; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1638; GFX10W32-NEXT:    s_mul_i32 s2, s0, s2
1639; GFX10W32-NEXT:    v_mov_b32_e32 v1, s2
1640; GFX10W32-NEXT:    buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
1641; GFX10W32-NEXT:  .LBB6_2:
1642; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
1643; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
1644; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
1645; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1646; GFX10W32-NEXT:    v_mul_lo_u32 v0, s0, v0
1647; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
1648; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
1649; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
1650; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1651; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
1652; GFX10W32-NEXT:    s_endpgm
1653;
1654; GFX11W64-LABEL: sub_i32_uniform:
1655; GFX11W64:       ; %bb.0: ; %entry
1656; GFX11W64-NEXT:    s_load_b32 s6, s[4:5], 0x44
1657; GFX11W64-NEXT:    s_mov_b64 s[2:3], exec
1658; GFX11W64-NEXT:    s_mov_b64 s[0:1], exec
1659; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1660; GFX11W64-NEXT:    ; implicit-def: $vgpr1
1661; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1662; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1663; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
1664; GFX11W64-NEXT:    s_cbranch_execz .LBB6_2
1665; GFX11W64-NEXT:  ; %bb.1:
1666; GFX11W64-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
1667; GFX11W64-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1668; GFX11W64-NEXT:    v_mov_b32_e32 v2, 0
1669; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
1670; GFX11W64-NEXT:    s_mul_i32 s2, s6, s2
1671; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1672; GFX11W64-NEXT:    v_mov_b32_e32 v1, s2
1673; GFX11W64-NEXT:    buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc
1674; GFX11W64-NEXT:  .LBB6_2:
1675; GFX11W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1676; GFX11W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1677; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
1678; GFX11W64-NEXT:    v_mul_lo_u32 v0, s6, v0
1679; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
1680; GFX11W64-NEXT:    v_readfirstlane_b32 s2, v1
1681; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
1682; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1683; GFX11W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1684; GFX11W64-NEXT:    global_store_b32 v1, v0, s[0:1]
1685; GFX11W64-NEXT:    s_endpgm
1686;
1687; GFX11W32-LABEL: sub_i32_uniform:
1688; GFX11W32:       ; %bb.0: ; %entry
1689; GFX11W32-NEXT:    s_load_b32 s0, s[4:5], 0x44
1690; GFX11W32-NEXT:    s_mov_b32 s2, exec_lo
1691; GFX11W32-NEXT:    s_mov_b32 s1, exec_lo
1692; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1693; GFX11W32-NEXT:    ; implicit-def: $vgpr1
1694; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1695; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
1696; GFX11W32-NEXT:    s_cbranch_execz .LBB6_2
1697; GFX11W32-NEXT:  ; %bb.1:
1698; GFX11W32-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
1699; GFX11W32-NEXT:    s_bcnt1_i32_b32 s2, s2
1700; GFX11W32-NEXT:    v_mov_b32_e32 v2, 0
1701; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
1702; GFX11W32-NEXT:    s_mul_i32 s2, s0, s2
1703; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1704; GFX11W32-NEXT:    v_mov_b32_e32 v1, s2
1705; GFX11W32-NEXT:    buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc
1706; GFX11W32-NEXT:  .LBB6_2:
1707; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
1708; GFX11W32-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
1709; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
1710; GFX11W32-NEXT:    v_mul_lo_u32 v0, s0, v0
1711; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
1712; GFX11W32-NEXT:    v_readfirstlane_b32 s0, v1
1713; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
1714; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1715; GFX11W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1716; GFX11W32-NEXT:    global_store_b32 v1, v0, s[2:3]
1717; GFX11W32-NEXT:    s_endpgm
1718;
1719; GFX12W64-LABEL: sub_i32_uniform:
1720; GFX12W64:       ; %bb.0: ; %entry
1721; GFX12W64-NEXT:    s_load_b32 s6, s[4:5], 0x44
1722; GFX12W64-NEXT:    s_mov_b64 s[2:3], exec
1723; GFX12W64-NEXT:    s_mov_b64 s[0:1], exec
1724; GFX12W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1725; GFX12W64-NEXT:    ; implicit-def: $vgpr1
1726; GFX12W64-NEXT:    s_wait_alu 0xfffe
1727; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1728; GFX12W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1729; GFX12W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
1730; GFX12W64-NEXT:    s_cbranch_execz .LBB6_2
1731; GFX12W64-NEXT:  ; %bb.1:
1732; GFX12W64-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
1733; GFX12W64-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1734; GFX12W64-NEXT:    v_mov_b32_e32 v2, 0
1735; GFX12W64-NEXT:    s_wait_kmcnt 0x0
1736; GFX12W64-NEXT:    s_wait_alu 0xfffe
1737; GFX12W64-NEXT:    s_mul_i32 s2, s6, s2
1738; GFX12W64-NEXT:    s_wait_alu 0xfffe
1739; GFX12W64-NEXT:    v_mov_b32_e32 v1, s2
1740; GFX12W64-NEXT:    buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
1741; GFX12W64-NEXT:  .LBB6_2:
1742; GFX12W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1743; GFX12W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1744; GFX12W64-NEXT:    s_wait_kmcnt 0x0
1745; GFX12W64-NEXT:    v_mul_lo_u32 v0, s6, v0
1746; GFX12W64-NEXT:    s_wait_loadcnt 0x0
1747; GFX12W64-NEXT:    v_readfirstlane_b32 s2, v1
1748; GFX12W64-NEXT:    v_mov_b32_e32 v1, 0
1749; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1750; GFX12W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1751; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
1752; GFX12W64-NEXT:    s_endpgm
1753;
1754; GFX12W32-LABEL: sub_i32_uniform:
1755; GFX12W32:       ; %bb.0: ; %entry
1756; GFX12W32-NEXT:    s_load_b32 s0, s[4:5], 0x44
1757; GFX12W32-NEXT:    s_mov_b32 s2, exec_lo
1758; GFX12W32-NEXT:    s_mov_b32 s1, exec_lo
1759; GFX12W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1760; GFX12W32-NEXT:    ; implicit-def: $vgpr1
1761; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1762; GFX12W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
1763; GFX12W32-NEXT:    s_cbranch_execz .LBB6_2
1764; GFX12W32-NEXT:  ; %bb.1:
1765; GFX12W32-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
1766; GFX12W32-NEXT:    s_wait_alu 0xfffe
1767; GFX12W32-NEXT:    s_bcnt1_i32_b32 s2, s2
1768; GFX12W32-NEXT:    s_wait_kmcnt 0x0
1769; GFX12W32-NEXT:    s_wait_alu 0xfffe
1770; GFX12W32-NEXT:    s_mul_i32 s2, s0, s2
1771; GFX12W32-NEXT:    s_wait_alu 0xfffe
1772; GFX12W32-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
1773; GFX12W32-NEXT:    buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
1774; GFX12W32-NEXT:  .LBB6_2:
1775; GFX12W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
1776; GFX12W32-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
1777; GFX12W32-NEXT:    s_wait_kmcnt 0x0
1778; GFX12W32-NEXT:    v_mul_lo_u32 v0, s0, v0
1779; GFX12W32-NEXT:    s_wait_loadcnt 0x0
1780; GFX12W32-NEXT:    v_readfirstlane_b32 s0, v1
1781; GFX12W32-NEXT:    v_mov_b32_e32 v1, 0
1782; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1783; GFX12W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1784; GFX12W32-NEXT:    global_store_b32 v1, v0, s[2:3]
1785; GFX12W32-NEXT:    s_endpgm
1786entry:
; The data operand is the kernel argument %subitive; the four trailing i32
; operands of the intrinsic are all 0.
1787  %old = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.sub(i32 %subitive, ptr addrspace(8) %inout, i32 0, i32 0, i32 0, i32 0)
1788  store i32 %old, ptr addrspace(1) %out
1789  ret void
1790}
1791
1792define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
1793; GFX6-LABEL: sub_i32_varying_vdata:
1794; GFX6:       ; %bb.0: ; %entry
1795; GFX6-NEXT:    s_mov_b64 s[0:1], exec
1796; GFX6-NEXT:    s_mov_b32 s2, 0
1797; GFX6-NEXT:    ; implicit-def: $vgpr1
1798; GFX6-NEXT:  .LBB7_1: ; %ComputeLoop
1799; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
1800; GFX6-NEXT:    s_ff1_i32_b64 s3, s[0:1]
1801; GFX6-NEXT:    s_mov_b32 m0, s3
1802; GFX6-NEXT:    v_readlane_b32 s8, v0, s3
1803; GFX6-NEXT:    v_writelane_b32 v1, s2, m0
1804; GFX6-NEXT:    s_lshl_b64 s[6:7], 1, s3
1805; GFX6-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
1806; GFX6-NEXT:    v_cmp_ne_u64_e64 s[6:7], s[0:1], 0
1807; GFX6-NEXT:    s_and_b64 vcc, exec, s[6:7]
1808; GFX6-NEXT:    s_add_i32 s2, s2, s8
1809; GFX6-NEXT:    s_cbranch_vccnz .LBB7_1
1810; GFX6-NEXT:  ; %bb.2: ; %ComputeEnd
1811; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
1812; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
1813; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1814; GFX6-NEXT:    ; implicit-def: $vgpr0
1815; GFX6-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1816; GFX6-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
1817; GFX6-NEXT:    s_cbranch_execz .LBB7_4
1818; GFX6-NEXT:  ; %bb.3:
1819; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
1820; GFX6-NEXT:    v_mov_b32_e32 v0, s2
1821; GFX6-NEXT:    v_mov_b32_e32 v2, 0
1822; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1823; GFX6-NEXT:    buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc
1824; GFX6-NEXT:  .LBB7_4:
1825; GFX6-NEXT:    s_or_b64 exec, exec, s[0:1]
1826; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1827; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1828; GFX6-NEXT:    s_mov_b32 s2, -1
1829; GFX6-NEXT:    s_waitcnt vmcnt(0)
1830; GFX6-NEXT:    v_readfirstlane_b32 s4, v0
1831; GFX6-NEXT:    s_waitcnt expcnt(0)
1832; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v1
1833; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1834; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1835; GFX6-NEXT:    s_endpgm
1836;
1837; GFX8-LABEL: sub_i32_varying_vdata:
1838; GFX8:       ; %bb.0: ; %entry
1839; GFX8-NEXT:    s_mov_b64 s[0:1], exec
1840; GFX8-NEXT:    s_mov_b32 s2, 0
1841; GFX8-NEXT:    ; implicit-def: $vgpr1
1842; GFX8-NEXT:  .LBB7_1: ; %ComputeLoop
1843; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
1844; GFX8-NEXT:    s_ff1_i32_b64 s3, s[0:1]
1845; GFX8-NEXT:    s_mov_b32 m0, s3
1846; GFX8-NEXT:    v_readlane_b32 s8, v0, s3
1847; GFX8-NEXT:    s_lshl_b64 s[6:7], 1, s3
1848; GFX8-NEXT:    v_writelane_b32 v1, s2, m0
1849; GFX8-NEXT:    s_add_i32 s2, s2, s8
1850; GFX8-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
1851; GFX8-NEXT:    s_cmp_lg_u64 s[0:1], 0
1852; GFX8-NEXT:    s_cbranch_scc1 .LBB7_1
1853; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
1854; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1855; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1856; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1857; GFX8-NEXT:    ; implicit-def: $vgpr0
1858; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1859; GFX8-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
1860; GFX8-NEXT:    s_cbranch_execz .LBB7_4
1861; GFX8-NEXT:  ; %bb.3:
1862; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
1863; GFX8-NEXT:    v_mov_b32_e32 v0, s2
1864; GFX8-NEXT:    v_mov_b32_e32 v2, 0
1865; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1866; GFX8-NEXT:    buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc
1867; GFX8-NEXT:  .LBB7_4:
1868; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
1869; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1870; GFX8-NEXT:    s_waitcnt vmcnt(0)
1871; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
1872; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s2, v1
1873; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1874; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1875; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1876; GFX8-NEXT:    flat_store_dword v[0:1], v2
1877; GFX8-NEXT:    s_endpgm
1878;
1879; GFX9-LABEL: sub_i32_varying_vdata:
1880; GFX9:       ; %bb.0: ; %entry
1881; GFX9-NEXT:    s_mov_b64 s[0:1], exec
1882; GFX9-NEXT:    s_mov_b32 s2, 0
1883; GFX9-NEXT:    ; implicit-def: $vgpr1
1884; GFX9-NEXT:  .LBB7_1: ; %ComputeLoop
1885; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
1886; GFX9-NEXT:    s_ff1_i32_b64 s3, s[0:1]
1887; GFX9-NEXT:    s_mov_b32 m0, s3
1888; GFX9-NEXT:    v_readlane_b32 s8, v0, s3
1889; GFX9-NEXT:    s_lshl_b64 s[6:7], 1, s3
1890; GFX9-NEXT:    v_writelane_b32 v1, s2, m0
1891; GFX9-NEXT:    s_add_i32 s2, s2, s8
1892; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
1893; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
1894; GFX9-NEXT:    s_cbranch_scc1 .LBB7_1
1895; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
1896; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1897; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1898; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1899; GFX9-NEXT:    ; implicit-def: $vgpr0
1900; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1901; GFX9-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
1902; GFX9-NEXT:    s_cbranch_execz .LBB7_4
1903; GFX9-NEXT:  ; %bb.3:
1904; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
1905; GFX9-NEXT:    v_mov_b32_e32 v0, s2
1906; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1907; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1908; GFX9-NEXT:    buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc
1909; GFX9-NEXT:  .LBB7_4:
1910; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
1911; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1912; GFX9-NEXT:    s_waitcnt vmcnt(0)
1913; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
1914; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1915; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v1
1916; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1917; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
1918; GFX9-NEXT:    s_endpgm
1919;
1920; GFX10W64-LABEL: sub_i32_varying_vdata:
1921; GFX10W64:       ; %bb.0: ; %entry
1922; GFX10W64-NEXT:    s_mov_b64 s[0:1], exec
1923; GFX10W64-NEXT:    s_mov_b32 s2, 0
1924; GFX10W64-NEXT:    ; implicit-def: $vgpr1
1925; GFX10W64-NEXT:  .LBB7_1: ; %ComputeLoop
1926; GFX10W64-NEXT:    ; =>This Inner Loop Header: Depth=1
1927; GFX10W64-NEXT:    s_ff1_i32_b64 s3, s[0:1]
1928; GFX10W64-NEXT:    v_readlane_b32 s8, v0, s3
1929; GFX10W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
1930; GFX10W64-NEXT:    v_writelane_b32 v1, s2, s3
1931; GFX10W64-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
1932; GFX10W64-NEXT:    s_add_i32 s2, s2, s8
1933; GFX10W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
1934; GFX10W64-NEXT:    s_cbranch_scc1 .LBB7_1
1935; GFX10W64-NEXT:  ; %bb.2: ; %ComputeEnd
1936; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1937; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1938; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1939; GFX10W64-NEXT:    ; implicit-def: $vgpr0
1940; GFX10W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1941; GFX10W64-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
1942; GFX10W64-NEXT:    s_cbranch_execz .LBB7_4
1943; GFX10W64-NEXT:  ; %bb.3:
1944; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
1945; GFX10W64-NEXT:    v_mov_b32_e32 v0, s2
1946; GFX10W64-NEXT:    v_mov_b32_e32 v2, 0
1947; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1948; GFX10W64-NEXT:    buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc
1949; GFX10W64-NEXT:  .LBB7_4:
1950; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
1951; GFX10W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1952; GFX10W64-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1953; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
1954; GFX10W64-NEXT:    v_readfirstlane_b32 s2, v0
1955; GFX10W64-NEXT:    v_mov_b32_e32 v0, 0
1956; GFX10W64-NEXT:    v_sub_nc_u32_e32 v1, s2, v1
1957; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1958; GFX10W64-NEXT:    global_store_dword v0, v1, s[0:1]
1959; GFX10W64-NEXT:    s_endpgm
1960;
1961; GFX10W32-LABEL: sub_i32_varying_vdata:
1962; GFX10W32:       ; %bb.0: ; %entry
1963; GFX10W32-NEXT:    s_mov_b32 s1, exec_lo
1964; GFX10W32-NEXT:    s_mov_b32 s0, 0
1965; GFX10W32-NEXT:    ; implicit-def: $vgpr1
1966; GFX10W32-NEXT:  .LBB7_1: ; %ComputeLoop
1967; GFX10W32-NEXT:    ; =>This Inner Loop Header: Depth=1
1968; GFX10W32-NEXT:    s_ff1_i32_b32 s2, s1
1969; GFX10W32-NEXT:    v_readlane_b32 s3, v0, s2
1970; GFX10W32-NEXT:    s_lshl_b32 s6, 1, s2
1971; GFX10W32-NEXT:    v_writelane_b32 v1, s0, s2
1972; GFX10W32-NEXT:    s_andn2_b32 s1, s1, s6
1973; GFX10W32-NEXT:    s_add_i32 s0, s0, s3
1974; GFX10W32-NEXT:    s_cmp_lg_u32 s1, 0
1975; GFX10W32-NEXT:    s_cbranch_scc1 .LBB7_1
1976; GFX10W32-NEXT:  ; %bb.2: ; %ComputeEnd
1977; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1978; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1979; GFX10W32-NEXT:    ; implicit-def: $vgpr0
1980; GFX10W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
1981; GFX10W32-NEXT:    s_xor_b32 s1, exec_lo, s1
1982; GFX10W32-NEXT:    s_cbranch_execz .LBB7_4
1983; GFX10W32-NEXT:  ; %bb.3:
1984; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
1985; GFX10W32-NEXT:    v_mov_b32_e32 v0, s0
1986; GFX10W32-NEXT:    v_mov_b32_e32 v2, 0
1987; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1988; GFX10W32-NEXT:    buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc
1989; GFX10W32-NEXT:  .LBB7_4:
1990; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
1991; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
1992; GFX10W32-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1993; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
1994; GFX10W32-NEXT:    v_readfirstlane_b32 s2, v0
1995; GFX10W32-NEXT:    v_mov_b32_e32 v0, 0
1996; GFX10W32-NEXT:    v_sub_nc_u32_e32 v1, s2, v1
1997; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1998; GFX10W32-NEXT:    global_store_dword v0, v1, s[0:1]
1999; GFX10W32-NEXT:    s_endpgm
2000;
2001; GFX11W64-LABEL: sub_i32_varying_vdata:
2002; GFX11W64:       ; %bb.0: ; %entry
2003; GFX11W64-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
2004; GFX11W64-NEXT:    s_mov_b64 s[0:1], exec
2005; GFX11W64-NEXT:    s_mov_b32 s2, 0
2006; GFX11W64-NEXT:    ; implicit-def: $vgpr0
2007; GFX11W64-NEXT:  .LBB7_1: ; %ComputeLoop
2008; GFX11W64-NEXT:    ; =>This Inner Loop Header: Depth=1
2009; GFX11W64-NEXT:    s_ctz_i32_b64 s3, s[0:1]
2010; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
2011; GFX11W64-NEXT:    v_readlane_b32 s8, v1, s3
2012; GFX11W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
2013; GFX11W64-NEXT:    v_writelane_b32 v0, s2, s3
2014; GFX11W64-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
2015; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2016; GFX11W64-NEXT:    s_add_i32 s2, s2, s8
2017; GFX11W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
2018; GFX11W64-NEXT:    s_cbranch_scc1 .LBB7_1
2019; GFX11W64-NEXT:  ; %bb.2: ; %ComputeEnd
2020; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
2021; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2022; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
2023; GFX11W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
2024; GFX11W64-NEXT:    ; implicit-def: $vgpr1
2025; GFX11W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
2026; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2027; GFX11W64-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
2028; GFX11W64-NEXT:    s_cbranch_execz .LBB7_4
2029; GFX11W64-NEXT:  ; %bb.3:
2030; GFX11W64-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
2031; GFX11W64-NEXT:    v_mov_b32_e32 v1, s2
2032; GFX11W64-NEXT:    v_mov_b32_e32 v2, 0
2033; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
2034; GFX11W64-NEXT:    buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc
2035; GFX11W64-NEXT:  .LBB7_4:
2036; GFX11W64-NEXT:    s_or_b64 exec, exec, s[0:1]
2037; GFX11W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2038; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
2039; GFX11W64-NEXT:    v_readfirstlane_b32 s2, v1
2040; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
2041; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2042; GFX11W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
2043; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
2044; GFX11W64-NEXT:    global_store_b32 v1, v0, s[0:1]
2045; GFX11W64-NEXT:    s_endpgm
2046;
2047; GFX11W32-LABEL: sub_i32_varying_vdata:
2048; GFX11W32:       ; %bb.0: ; %entry
2049; GFX11W32-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
2050; GFX11W32-NEXT:    s_mov_b32 s1, exec_lo
2051; GFX11W32-NEXT:    s_mov_b32 s0, 0
2052; GFX11W32-NEXT:    ; implicit-def: $vgpr0
2053; GFX11W32-NEXT:  .LBB7_1: ; %ComputeLoop
2054; GFX11W32-NEXT:    ; =>This Inner Loop Header: Depth=1
2055; GFX11W32-NEXT:    s_ctz_i32_b32 s2, s1
2056; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
2057; GFX11W32-NEXT:    v_readlane_b32 s3, v1, s2
2058; GFX11W32-NEXT:    s_lshl_b32 s6, 1, s2
2059; GFX11W32-NEXT:    v_writelane_b32 v0, s0, s2
2060; GFX11W32-NEXT:    s_and_not1_b32 s1, s1, s6
2061; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2062; GFX11W32-NEXT:    s_add_i32 s0, s0, s3
2063; GFX11W32-NEXT:    s_cmp_lg_u32 s1, 0
2064; GFX11W32-NEXT:    s_cbranch_scc1 .LBB7_1
2065; GFX11W32-NEXT:  ; %bb.2: ; %ComputeEnd
2066; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
2067; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
2068; GFX11W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
2069; GFX11W32-NEXT:    ; implicit-def: $vgpr1
2070; GFX11W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
2071; GFX11W32-NEXT:    s_xor_b32 s1, exec_lo, s1
2072; GFX11W32-NEXT:    s_cbranch_execz .LBB7_4
2073; GFX11W32-NEXT:  ; %bb.3:
2074; GFX11W32-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
2075; GFX11W32-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, 0
2076; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
2077; GFX11W32-NEXT:    buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc
2078; GFX11W32-NEXT:  .LBB7_4:
2079; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
2080; GFX11W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2081; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
2082; GFX11W32-NEXT:    v_readfirstlane_b32 s2, v1
2083; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
2084; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2085; GFX11W32-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
2086; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
2087; GFX11W32-NEXT:    global_store_b32 v1, v0, s[0:1]
2088; GFX11W32-NEXT:    s_endpgm
2089;
2090; GFX12W64-LABEL: sub_i32_varying_vdata:
2091; GFX12W64:       ; %bb.0: ; %entry
2092; GFX12W64-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
2093; GFX12W64-NEXT:    s_mov_b64 s[0:1], exec
2094; GFX12W64-NEXT:    s_mov_b32 s2, 0
2095; GFX12W64-NEXT:    ; implicit-def: $vgpr0
2096; GFX12W64-NEXT:  .LBB7_1: ; %ComputeLoop
2097; GFX12W64-NEXT:    ; =>This Inner Loop Header: Depth=1
2098; GFX12W64-NEXT:    s_ctz_i32_b64 s3, s[0:1]
2099; GFX12W64-NEXT:    s_wait_alu 0xfffe
2100; GFX12W64-NEXT:    v_readlane_b32 s8, v1, s3
2101; GFX12W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
2102; GFX12W64-NEXT:    v_writelane_b32 v0, s2, s3
2103; GFX12W64-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
2104; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2105; GFX12W64-NEXT:    s_add_co_i32 s2, s2, s8
2106; GFX12W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
2107; GFX12W64-NEXT:    s_cbranch_scc1 .LBB7_1
2108; GFX12W64-NEXT:  ; %bb.2: ; %ComputeEnd
2109; GFX12W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
2110; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2111; GFX12W64-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
2112; GFX12W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
2113; GFX12W64-NEXT:    ; implicit-def: $vgpr1
2114; GFX12W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
2115; GFX12W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2116; GFX12W64-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
2117; GFX12W64-NEXT:    s_cbranch_execz .LBB7_4
2118; GFX12W64-NEXT:  ; %bb.3:
2119; GFX12W64-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
2120; GFX12W64-NEXT:    v_mov_b32_e32 v2, 0
2121; GFX12W64-NEXT:    s_wait_alu 0xfffe
2122; GFX12W64-NEXT:    v_mov_b32_e32 v1, s2
2123; GFX12W64-NEXT:    s_wait_kmcnt 0x0
2124; GFX12W64-NEXT:    buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
2125; GFX12W64-NEXT:  .LBB7_4:
2126; GFX12W64-NEXT:    s_or_b64 exec, exec, s[0:1]
2127; GFX12W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2128; GFX12W64-NEXT:    s_wait_loadcnt 0x0
2129; GFX12W64-NEXT:    v_readfirstlane_b32 s2, v1
2130; GFX12W64-NEXT:    v_mov_b32_e32 v1, 0
2131; GFX12W64-NEXT:    s_wait_alu 0xfffe
2132; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2133; GFX12W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
2134; GFX12W64-NEXT:    s_wait_kmcnt 0x0
2135; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
2136; GFX12W64-NEXT:    s_endpgm
2137;
2138; GFX12W32-LABEL: sub_i32_varying_vdata:
2139; GFX12W32:       ; %bb.0: ; %entry
2140; GFX12W32-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
2141; GFX12W32-NEXT:    s_mov_b32 s1, exec_lo
2142; GFX12W32-NEXT:    s_mov_b32 s0, 0
2143; GFX12W32-NEXT:    ; implicit-def: $vgpr0
2144; GFX12W32-NEXT:  .LBB7_1: ; %ComputeLoop
2145; GFX12W32-NEXT:    ; =>This Inner Loop Header: Depth=1
2146; GFX12W32-NEXT:    s_wait_alu 0xfffe
2147; GFX12W32-NEXT:    s_ctz_i32_b32 s2, s1
2148; GFX12W32-NEXT:    s_wait_alu 0xfffe
2149; GFX12W32-NEXT:    v_readlane_b32 s3, v1, s2
2150; GFX12W32-NEXT:    s_lshl_b32 s6, 1, s2
2151; GFX12W32-NEXT:    v_writelane_b32 v0, s0, s2
2152; GFX12W32-NEXT:    s_and_not1_b32 s1, s1, s6
2153; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2154; GFX12W32-NEXT:    s_add_co_i32 s0, s0, s3
2155; GFX12W32-NEXT:    s_wait_alu 0xfffe
2156; GFX12W32-NEXT:    s_cmp_lg_u32 s1, 0
2157; GFX12W32-NEXT:    s_cbranch_scc1 .LBB7_1
2158; GFX12W32-NEXT:  ; %bb.2: ; %ComputeEnd
2159; GFX12W32-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
2160; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2161; GFX12W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
2162; GFX12W32-NEXT:    ; implicit-def: $vgpr1
2163; GFX12W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
2164; GFX12W32-NEXT:    s_wait_alu 0xfffe
2165; GFX12W32-NEXT:    s_xor_b32 s1, exec_lo, s1
2166; GFX12W32-NEXT:    s_cbranch_execz .LBB7_4
2167; GFX12W32-NEXT:  ; %bb.3:
2168; GFX12W32-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
2169; GFX12W32-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0
2170; GFX12W32-NEXT:    s_wait_kmcnt 0x0
2171; GFX12W32-NEXT:    buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
2172; GFX12W32-NEXT:  .LBB7_4:
2173; GFX12W32-NEXT:    s_wait_alu 0xfffe
2174; GFX12W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
2175; GFX12W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2176; GFX12W32-NEXT:    s_wait_loadcnt 0x0
2177; GFX12W32-NEXT:    v_readfirstlane_b32 s2, v1
2178; GFX12W32-NEXT:    v_mov_b32_e32 v1, 0
2179; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2180; GFX12W32-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
2181; GFX12W32-NEXT:    s_wait_kmcnt 0x0
2182; GFX12W32-NEXT:    global_store_b32 v1, v0, s[0:1]
2183; GFX12W32-NEXT:    s_endpgm
2184entry:
2185  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2186  %old = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.sub(i32 %lane, ptr addrspace(8) %inout, i32 0, i32 0, i32 0, i32 0)
2187  store i32 %old, ptr addrspace(1) %out
2188  ret void
2189}
2190
; sub_i32_varying_vindex
; Kernel that issues a struct-buffer atomic sub on %inout where the data
; operand is the constant 1 and the third operand (the buffer index, going by
; the test name and the "idxen" addressing in the checks) is the workitem id.
; The value returned by the atomic is stored to %out.
; For every checked target the expected code is one returning
; buffer_atomic_sub with idxen and no %ComputeLoop / v_mbcnt / v_readfirstlane
; sequence, i.e. the atomic is expected to be emitted per lane as written
; (presumably the atomic optimizer does not rewrite a divergent index; confirm
; against the pass before relying on this).
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
2191define amdgpu_kernel void @sub_i32_varying_vindex(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
2192; GFX6-LABEL: sub_i32_varying_vindex:
2193; GFX6:       ; %bb.0: ; %entry
2194; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xd
2195; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
2196; GFX6-NEXT:    v_mov_b32_e32 v1, 1
2197; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2198; GFX6-NEXT:    buffer_atomic_sub v1, v0, s[0:3], 0 idxen glc
2199; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2200; GFX6-NEXT:    s_mov_b32 s6, -1
2201; GFX6-NEXT:    s_waitcnt vmcnt(0)
2202; GFX6-NEXT:    buffer_store_dword v1, off, s[4:7], 0
2203; GFX6-NEXT:    s_endpgm
2204;
2205; GFX8-LABEL: sub_i32_varying_vindex:
2206; GFX8:       ; %bb.0: ; %entry
2207; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
2208; GFX8-NEXT:    v_mov_b32_e32 v2, 1
2209; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2210; GFX8-NEXT:    buffer_atomic_sub v2, v0, s[0:3], 0 idxen glc
2211; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2212; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2213; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2214; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2215; GFX8-NEXT:    s_waitcnt vmcnt(0)
2216; GFX8-NEXT:    flat_store_dword v[0:1], v2
2217; GFX8-NEXT:    s_endpgm
2218;
2219; GFX9-LABEL: sub_i32_varying_vindex:
2220; GFX9:       ; %bb.0: ; %entry
2221; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
2222; GFX9-NEXT:    v_mov_b32_e32 v1, 1
2223; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2224; GFX9-NEXT:    buffer_atomic_sub v1, v0, s[0:3], 0 idxen glc
2225; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2226; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2227; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2228; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2229; GFX9-NEXT:    s_endpgm
2230;
2231; GFX10-LABEL: sub_i32_varying_vindex:
2232; GFX10:       ; %bb.0: ; %entry
2233; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
2234; GFX10-NEXT:    v_mov_b32_e32 v1, 1
2235; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2236; GFX10-NEXT:    buffer_atomic_sub v1, v0, s[0:3], 0 idxen glc
2237; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
2238; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2239; GFX10-NEXT:    v_mov_b32_e32 v0, 0
2240; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2241; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
2242; GFX10-NEXT:    s_endpgm
2243;
2244; GFX11W64-LABEL: sub_i32_varying_vindex:
2245; GFX11W64:       ; %bb.0: ; %entry
2246; GFX11W64-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
2247; GFX11W64-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2248; GFX11W64-NEXT:    v_mov_b32_e32 v1, 1
2249; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
2250; GFX11W64-NEXT:    buffer_atomic_sub_u32 v1, v0, s[0:3], 0 idxen glc
2251; GFX11W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2252; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
2253; GFX11W64-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2254; GFX11W64-NEXT:    global_store_b32 v0, v1, s[0:1]
2255; GFX11W64-NEXT:    s_endpgm
2256;
2257; GFX11W32-LABEL: sub_i32_varying_vindex:
2258; GFX11W32:       ; %bb.0: ; %entry
2259; GFX11W32-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
2260; GFX11W32-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
2261; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
2262; GFX11W32-NEXT:    buffer_atomic_sub_u32 v1, v0, s[0:3], 0 idxen glc
2263; GFX11W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2264; GFX11W32-NEXT:    v_mov_b32_e32 v0, 0
2265; GFX11W32-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2266; GFX11W32-NEXT:    global_store_b32 v0, v1, s[0:1]
2267; GFX11W32-NEXT:    s_endpgm
2268;
2269; GFX12W64-LABEL: sub_i32_varying_vindex:
2270; GFX12W64:       ; %bb.0: ; %entry
2271; GFX12W64-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
2272; GFX12W64-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2273; GFX12W64-NEXT:    v_mov_b32_e32 v1, 1
2274; GFX12W64-NEXT:    s_wait_kmcnt 0x0
2275; GFX12W64-NEXT:    buffer_atomic_sub_u32 v1, v0, s[0:3], null idxen th:TH_ATOMIC_RETURN
2276; GFX12W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2277; GFX12W64-NEXT:    v_mov_b32_e32 v0, 0
2278; GFX12W64-NEXT:    s_wait_loadcnt 0x0
2279; GFX12W64-NEXT:    s_wait_kmcnt 0x0
2280; GFX12W64-NEXT:    global_store_b32 v0, v1, s[0:1]
2281; GFX12W64-NEXT:    s_endpgm
2282;
2283; GFX12W32-LABEL: sub_i32_varying_vindex:
2284; GFX12W32:       ; %bb.0: ; %entry
2285; GFX12W32-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
2286; GFX12W32-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
2287; GFX12W32-NEXT:    s_wait_kmcnt 0x0
2288; GFX12W32-NEXT:    buffer_atomic_sub_u32 v1, v0, s[0:3], null idxen th:TH_ATOMIC_RETURN
2289; GFX12W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2290; GFX12W32-NEXT:    v_mov_b32_e32 v0, 0
2291; GFX12W32-NEXT:    s_wait_loadcnt 0x0
2292; GFX12W32-NEXT:    s_wait_kmcnt 0x0
2293; GFX12W32-NEXT:    global_store_b32 v0, v1, s[0:1]
2294; GFX12W32-NEXT:    s_endpgm
2295entry:
  ; The workitem id is the only per-lane value in this kernel.
2296  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; Data operand is the constant 1; %lane is passed as the third operand and
  ; the remaining three operands are 0.
2297  %old = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.sub(i32 1, ptr addrspace(8) %inout, i32 %lane, i32 0, i32 0, i32 0)
  ; Store the value returned by the atomic so the returning form is required.
2298  store i32 %old, ptr addrspace(1) %out
2299  ret void
2300}
2301
; sub_i32_varying_offset
; Kernel that issues a struct-buffer atomic sub on %inout where the data
; operand is the constant 1, the third operand is 0 and the fourth operand
; (the per-lane offset, going by the test name and the "offen" addressing in
; the checks) is the workitem id. The value returned by the atomic is stored
; to %out.
; For every checked target the expected code is one returning
; buffer_atomic_sub with "idxen offen" taking the register pair v[0:1]
; (v0 = 0, v1 = lane id) and no %ComputeLoop / v_mbcnt / v_readfirstlane
; sequence, i.e. the atomic is expected to be emitted per lane as written
; (presumably the atomic optimizer does not rewrite a divergent offset;
; confirm against the pass before relying on this).
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
2302define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
2303; GFX6-LABEL: sub_i32_varying_offset:
2304; GFX6:       ; %bb.0: ; %entry
2305; GFX6-NEXT:    v_mov_b32_e32 v1, v0
2306; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xd
2307; GFX6-NEXT:    s_mov_b32 s6, 0
2308; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
2309; GFX6-NEXT:    v_mov_b32_e32 v0, s6
2310; GFX6-NEXT:    v_mov_b32_e32 v2, 1
2311; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2312; GFX6-NEXT:    buffer_atomic_sub v2, v[0:1], s[0:3], 0 idxen offen glc
2313; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2314; GFX6-NEXT:    s_mov_b32 s6, -1
2315; GFX6-NEXT:    s_waitcnt vmcnt(0)
2316; GFX6-NEXT:    buffer_store_dword v2, off, s[4:7], 0
2317; GFX6-NEXT:    s_endpgm
2318;
2319; GFX8-LABEL: sub_i32_varying_offset:
2320; GFX8:       ; %bb.0: ; %entry
2321; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
2322; GFX8-NEXT:    s_mov_b32 s6, 0
2323; GFX8-NEXT:    v_mov_b32_e32 v1, v0
2324; GFX8-NEXT:    v_mov_b32_e32 v0, s6
2325; GFX8-NEXT:    v_mov_b32_e32 v2, 1
2326; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2327; GFX8-NEXT:    buffer_atomic_sub v2, v[0:1], s[0:3], 0 idxen offen glc
2328; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2329; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2330; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2331; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2332; GFX8-NEXT:    s_waitcnt vmcnt(0)
2333; GFX8-NEXT:    flat_store_dword v[0:1], v2
2334; GFX8-NEXT:    s_endpgm
2335;
2336; GFX9-LABEL: sub_i32_varying_offset:
2337; GFX9:       ; %bb.0: ; %entry
2338; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
2339; GFX9-NEXT:    s_mov_b32 s6, 0
2340; GFX9-NEXT:    v_mov_b32_e32 v1, v0
2341; GFX9-NEXT:    v_mov_b32_e32 v0, s6
2342; GFX9-NEXT:    v_mov_b32_e32 v2, 1
2343; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2344; GFX9-NEXT:    buffer_atomic_sub v2, v[0:1], s[0:3], 0 idxen offen glc
2345; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2346; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2347; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2348; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
2349; GFX9-NEXT:    s_endpgm
2350;
2351; GFX10-LABEL: sub_i32_varying_offset:
2352; GFX10:       ; %bb.0: ; %entry
2353; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
2354; GFX10-NEXT:    s_mov_b32 s6, 0
2355; GFX10-NEXT:    v_mov_b32_e32 v1, v0
2356; GFX10-NEXT:    v_mov_b32_e32 v0, s6
2357; GFX10-NEXT:    v_mov_b32_e32 v2, 1
2358; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2359; GFX10-NEXT:    buffer_atomic_sub v2, v[0:1], s[0:3], 0 idxen offen glc
2360; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
2361; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2362; GFX10-NEXT:    v_mov_b32_e32 v0, 0
2363; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2364; GFX10-NEXT:    global_store_dword v0, v2, s[0:1]
2365; GFX10-NEXT:    s_endpgm
2366;
2367; GFX11W64-LABEL: sub_i32_varying_offset:
2368; GFX11W64:       ; %bb.0: ; %entry
2369; GFX11W64-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
2370; GFX11W64-NEXT:    s_mov_b32 s6, 0
2371; GFX11W64-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
2372; GFX11W64-NEXT:    v_mov_b32_e32 v0, s6
2373; GFX11W64-NEXT:    v_mov_b32_e32 v2, 1
2374; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
2375; GFX11W64-NEXT:    buffer_atomic_sub_u32 v2, v[0:1], s[0:3], 0 idxen offen glc
2376; GFX11W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2377; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
2378; GFX11W64-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2379; GFX11W64-NEXT:    global_store_b32 v0, v2, s[0:1]
2380; GFX11W64-NEXT:    s_endpgm
2381;
2382; GFX11W32-LABEL: sub_i32_varying_offset:
2383; GFX11W32:       ; %bb.0: ; %entry
2384; GFX11W32-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
2385; GFX11W32-NEXT:    s_mov_b32 s6, 0
2386; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2387; GFX11W32-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_and_b32 v1, 0x3ff, v0
2388; GFX11W32-NEXT:    v_mov_b32_e32 v2, 1
2389; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
2390; GFX11W32-NEXT:    buffer_atomic_sub_u32 v2, v[0:1], s[0:3], 0 idxen offen glc
2391; GFX11W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2392; GFX11W32-NEXT:    v_mov_b32_e32 v0, 0
2393; GFX11W32-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2394; GFX11W32-NEXT:    global_store_b32 v0, v2, s[0:1]
2395; GFX11W32-NEXT:    s_endpgm
2396;
2397; GFX12W64-LABEL: sub_i32_varying_offset:
2398; GFX12W64:       ; %bb.0: ; %entry
2399; GFX12W64-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
2400; GFX12W64-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
2401; GFX12W64-NEXT:    v_mov_b32_e32 v0, 0
2402; GFX12W64-NEXT:    v_mov_b32_e32 v2, 1
2403; GFX12W64-NEXT:    s_wait_kmcnt 0x0
2404; GFX12W64-NEXT:    buffer_atomic_sub_u32 v2, v[0:1], s[0:3], null idxen offen th:TH_ATOMIC_RETURN
2405; GFX12W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2406; GFX12W64-NEXT:    s_wait_loadcnt 0x0
2407; GFX12W64-NEXT:    s_wait_kmcnt 0x0
2408; GFX12W64-NEXT:    global_store_b32 v0, v2, s[0:1]
2409; GFX12W64-NEXT:    s_endpgm
2410;
2411; GFX12W32-LABEL: sub_i32_varying_offset:
2412; GFX12W32:       ; %bb.0: ; %entry
2413; GFX12W32-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
2414; GFX12W32-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, 0x3ff, v0
2415; GFX12W32-NEXT:    v_mov_b32_e32 v2, 1
2416; GFX12W32-NEXT:    s_wait_kmcnt 0x0
2417; GFX12W32-NEXT:    buffer_atomic_sub_u32 v2, v[0:1], s[0:3], null idxen offen th:TH_ATOMIC_RETURN
2418; GFX12W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2419; GFX12W32-NEXT:    s_wait_loadcnt 0x0
2420; GFX12W32-NEXT:    s_wait_kmcnt 0x0
2421; GFX12W32-NEXT:    global_store_b32 v0, v2, s[0:1]
2422; GFX12W32-NEXT:    s_endpgm
2423entry:
  ; The workitem id is the only per-lane value in this kernel.
2424  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; Data operand is the constant 1; %lane is passed as the fourth operand and
  ; the third, fifth and sixth operands are 0.
2425  %old = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.sub(i32 1, ptr addrspace(8) %inout, i32 0, i32 %lane, i32 0, i32 0)
  ; Store the value returned by the atomic so the returning form is required.
2426  store i32 %old, ptr addrspace(1) %out
2427  ret void
2428}
2429;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
2430; GFX11: {{.*}}
2431; GFX12: {{.*}}
2432