xref: /llvm-project/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s
3; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
7; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
8; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s
9; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W64 %s
10; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W32 %s
11
; Intrinsics referenced by the tests in this file.
; In the calls visible below, the raw buffer atomic add receives the value to
; add as its first operand and the ptr addrspace(8) kernel argument %inout as
; its second; the three trailing i32 operands are always passed as 0.
; NOTE(review): workitem.id.x and the atomic sub are not called in the first two
; kernels; presumably the later (varying / sub) tests use them - confirm.
12declare i32 @llvm.amdgcn.workitem.id.x()
13declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32)
14declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.sub(i32, ptr addrspace(8), i32, i32, i32)
15
16; Show what the atomic optimization pass will do for raw buffers.
17
; add_i32_constant
;
; The kernel performs a raw buffer atomic add of the constant 5 on %inout and
; stores the value returned by the atomic (%old) to %out.
;
; What the autogenerated assertions below expect, on every tested target:
;   * exec is copied into SGPRs and v_mbcnt_* is computed over that copy; only
;     lanes whose mbcnt result is 0 enter the block that issues the atomic
;     (s_and_saveexec or v_cmpx, followed by s_cbranch_execz).
;   * that block issues a single buffer_atomic_add whose data operand is
;     s_bcnt1(exec copy) * 5, i.e. the constant scaled by the number of bits
;     set in the copied exec mask.
;   * once exec is restored, the returned value is read with
;     v_readfirstlane_b32 and every lane computes
;     mbcnt * 5 + <returned value> (v_mad_u32_u24) before the store to %out.
; Configurations run with +wavefrontsize32 use only v_mbcnt_lo and 32-bit
; exec_lo operations; all other configurations use v_mbcnt_lo + v_mbcnt_hi
; and 64-bit exec operations.
; The assertions are regenerated by utils/update_llc_test_checks.py; do not
; edit them by hand.
18define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
19; GFX6-LABEL: add_i32_constant:
20; GFX6:       ; %bb.0: ; %entry
21; GFX6-NEXT:    s_mov_b64 s[2:3], exec
22; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
23; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
24; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
25; GFX6-NEXT:    ; implicit-def: $vgpr1
26; GFX6-NEXT:    s_and_saveexec_b64 s[0:1], vcc
27; GFX6-NEXT:    s_cbranch_execz .LBB0_2
28; GFX6-NEXT:  ; %bb.1:
29; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
30; GFX6-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
31; GFX6-NEXT:    s_mul_i32 s2, s2, 5
32; GFX6-NEXT:    v_mov_b32_e32 v1, s2
33; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
34; GFX6-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
35; GFX6-NEXT:  .LBB0_2:
36; GFX6-NEXT:    s_or_b64 exec, exec, s[0:1]
37; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
38; GFX6-NEXT:    s_mov_b32 s3, 0xf000
39; GFX6-NEXT:    s_mov_b32 s2, -1
40; GFX6-NEXT:    s_waitcnt vmcnt(0)
41; GFX6-NEXT:    v_readfirstlane_b32 s4, v1
42; GFX6-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
43; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
44; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
45; GFX6-NEXT:    s_endpgm
46;
47; GFX8-LABEL: add_i32_constant:
48; GFX8:       ; %bb.0: ; %entry
49; GFX8-NEXT:    s_mov_b64 s[2:3], exec
50; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
51; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
52; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
53; GFX8-NEXT:    ; implicit-def: $vgpr1
54; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
55; GFX8-NEXT:    s_cbranch_execz .LBB0_2
56; GFX8-NEXT:  ; %bb.1:
57; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
58; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
59; GFX8-NEXT:    s_mul_i32 s2, s2, 5
60; GFX8-NEXT:    v_mov_b32_e32 v1, s2
61; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
62; GFX8-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
63; GFX8-NEXT:  .LBB0_2:
64; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
65; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
66; GFX8-NEXT:    s_waitcnt vmcnt(0)
67; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
68; GFX8-NEXT:    v_mad_u32_u24 v2, v0, 5, s2
69; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
70; GFX8-NEXT:    v_mov_b32_e32 v0, s0
71; GFX8-NEXT:    v_mov_b32_e32 v1, s1
72; GFX8-NEXT:    flat_store_dword v[0:1], v2
73; GFX8-NEXT:    s_endpgm
74;
75; GFX9-LABEL: add_i32_constant:
76; GFX9:       ; %bb.0: ; %entry
77; GFX9-NEXT:    s_mov_b64 s[2:3], exec
78; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
79; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
80; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
81; GFX9-NEXT:    ; implicit-def: $vgpr1
82; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
83; GFX9-NEXT:    s_cbranch_execz .LBB0_2
84; GFX9-NEXT:  ; %bb.1:
85; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
86; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
87; GFX9-NEXT:    s_mul_i32 s2, s2, 5
88; GFX9-NEXT:    v_mov_b32_e32 v1, s2
89; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
90; GFX9-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
91; GFX9-NEXT:  .LBB0_2:
92; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
93; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
94; GFX9-NEXT:    s_waitcnt vmcnt(0)
95; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
96; GFX9-NEXT:    v_mov_b32_e32 v2, 0
97; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
98; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
99; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
100; GFX9-NEXT:    s_endpgm
101;
102; GFX10W64-LABEL: add_i32_constant:
103; GFX10W64:       ; %bb.0: ; %entry
104; GFX10W64-NEXT:    s_mov_b64 s[2:3], exec
105; GFX10W64-NEXT:    ; implicit-def: $vgpr1
106; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
107; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
108; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
109; GFX10W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
110; GFX10W64-NEXT:    s_cbranch_execz .LBB0_2
111; GFX10W64-NEXT:  ; %bb.1:
112; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
113; GFX10W64-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
114; GFX10W64-NEXT:    s_mul_i32 s2, s2, 5
115; GFX10W64-NEXT:    v_mov_b32_e32 v1, s2
116; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
117; GFX10W64-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
118; GFX10W64-NEXT:  .LBB0_2:
119; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
120; GFX10W64-NEXT:    s_or_b64 exec, exec, s[0:1]
121; GFX10W64-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
122; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
123; GFX10W64-NEXT:    v_readfirstlane_b32 s2, v1
124; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
125; GFX10W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
126; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
127; GFX10W64-NEXT:    global_store_dword v1, v0, s[0:1]
128; GFX10W64-NEXT:    s_endpgm
129;
130; GFX10W32-LABEL: add_i32_constant:
131; GFX10W32:       ; %bb.0: ; %entry
132; GFX10W32-NEXT:    s_mov_b32 s1, exec_lo
133; GFX10W32-NEXT:    ; implicit-def: $vgpr1
134; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s1, 0
135; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
136; GFX10W32-NEXT:    s_and_saveexec_b32 s0, vcc_lo
137; GFX10W32-NEXT:    s_cbranch_execz .LBB0_2
138; GFX10W32-NEXT:  ; %bb.1:
139; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
140; GFX10W32-NEXT:    s_bcnt1_i32_b32 s1, s1
141; GFX10W32-NEXT:    s_mul_i32 s1, s1, 5
142; GFX10W32-NEXT:    v_mov_b32_e32 v1, s1
143; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
144; GFX10W32-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
145; GFX10W32-NEXT:  .LBB0_2:
146; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
147; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
148; GFX10W32-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
149; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
150; GFX10W32-NEXT:    v_readfirstlane_b32 s2, v1
151; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
152; GFX10W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
153; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
154; GFX10W32-NEXT:    global_store_dword v1, v0, s[0:1]
155; GFX10W32-NEXT:    s_endpgm
156;
157; GFX11W64-LABEL: add_i32_constant:
158; GFX11W64:       ; %bb.0: ; %entry
159; GFX11W64-NEXT:    s_mov_b64 s[2:3], exec
160; GFX11W64-NEXT:    s_mov_b64 s[0:1], exec
161; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
162; GFX11W64-NEXT:    ; implicit-def: $vgpr1
163; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
164; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
165; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
166; GFX11W64-NEXT:    s_cbranch_execz .LBB0_2
167; GFX11W64-NEXT:  ; %bb.1:
168; GFX11W64-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
169; GFX11W64-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
170; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
171; GFX11W64-NEXT:    s_mul_i32 s2, s2, 5
172; GFX11W64-NEXT:    v_mov_b32_e32 v1, s2
173; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
174; GFX11W64-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
175; GFX11W64-NEXT:  .LBB0_2:
176; GFX11W64-NEXT:    s_or_b64 exec, exec, s[0:1]
177; GFX11W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
178; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
179; GFX11W64-NEXT:    v_readfirstlane_b32 s2, v1
180; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
181; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
182; GFX11W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
183; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
184; GFX11W64-NEXT:    global_store_b32 v1, v0, s[0:1]
185; GFX11W64-NEXT:    s_endpgm
186;
187; GFX11W32-LABEL: add_i32_constant:
188; GFX11W32:       ; %bb.0: ; %entry
189; GFX11W32-NEXT:    s_mov_b32 s1, exec_lo
190; GFX11W32-NEXT:    s_mov_b32 s0, exec_lo
191; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s1, 0
192; GFX11W32-NEXT:    ; implicit-def: $vgpr1
193; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
194; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
195; GFX11W32-NEXT:    s_cbranch_execz .LBB0_2
196; GFX11W32-NEXT:  ; %bb.1:
197; GFX11W32-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
198; GFX11W32-NEXT:    s_bcnt1_i32_b32 s1, s1
199; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
200; GFX11W32-NEXT:    s_mul_i32 s1, s1, 5
201; GFX11W32-NEXT:    v_mov_b32_e32 v1, s1
202; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
203; GFX11W32-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
204; GFX11W32-NEXT:  .LBB0_2:
205; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
206; GFX11W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
207; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
208; GFX11W32-NEXT:    v_readfirstlane_b32 s2, v1
209; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
210; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
211; GFX11W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
212; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
213; GFX11W32-NEXT:    global_store_b32 v1, v0, s[0:1]
214; GFX11W32-NEXT:    s_endpgm
215;
216; GFX12W64-LABEL: add_i32_constant:
217; GFX12W64:       ; %bb.0: ; %entry
218; GFX12W64-NEXT:    s_mov_b64 s[2:3], exec
219; GFX12W64-NEXT:    s_mov_b64 s[0:1], exec
220; GFX12W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
221; GFX12W64-NEXT:    ; implicit-def: $vgpr1
222; GFX12W64-NEXT:    s_wait_alu 0xfffe
223; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
224; GFX12W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
225; GFX12W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
226; GFX12W64-NEXT:    s_cbranch_execz .LBB0_2
227; GFX12W64-NEXT:  ; %bb.1:
228; GFX12W64-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
229; GFX12W64-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
230; GFX12W64-NEXT:    s_wait_alu 0xfffe
231; GFX12W64-NEXT:    s_mul_i32 s2, s2, 5
232; GFX12W64-NEXT:    s_wait_alu 0xfffe
233; GFX12W64-NEXT:    v_mov_b32_e32 v1, s2
234; GFX12W64-NEXT:    s_wait_kmcnt 0x0
235; GFX12W64-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
236; GFX12W64-NEXT:  .LBB0_2:
237; GFX12W64-NEXT:    s_or_b64 exec, exec, s[0:1]
238; GFX12W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
239; GFX12W64-NEXT:    s_wait_loadcnt 0x0
240; GFX12W64-NEXT:    v_readfirstlane_b32 s2, v1
241; GFX12W64-NEXT:    v_mov_b32_e32 v1, 0
242; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
243; GFX12W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
244; GFX12W64-NEXT:    s_wait_kmcnt 0x0
245; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
246; GFX12W64-NEXT:    s_endpgm
247;
248; GFX12W32-LABEL: add_i32_constant:
249; GFX12W32:       ; %bb.0: ; %entry
250; GFX12W32-NEXT:    s_mov_b32 s1, exec_lo
251; GFX12W32-NEXT:    s_mov_b32 s0, exec_lo
252; GFX12W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s1, 0
253; GFX12W32-NEXT:    ; implicit-def: $vgpr1
254; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
255; GFX12W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
256; GFX12W32-NEXT:    s_cbranch_execz .LBB0_2
257; GFX12W32-NEXT:  ; %bb.1:
258; GFX12W32-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
259; GFX12W32-NEXT:    s_wait_alu 0xfffe
260; GFX12W32-NEXT:    s_bcnt1_i32_b32 s1, s1
261; GFX12W32-NEXT:    s_wait_alu 0xfffe
262; GFX12W32-NEXT:    s_mul_i32 s1, s1, 5
263; GFX12W32-NEXT:    s_wait_alu 0xfffe
264; GFX12W32-NEXT:    v_mov_b32_e32 v1, s1
265; GFX12W32-NEXT:    s_wait_kmcnt 0x0
266; GFX12W32-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
267; GFX12W32-NEXT:  .LBB0_2:
268; GFX12W32-NEXT:    s_wait_alu 0xfffe
269; GFX12W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
270; GFX12W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
271; GFX12W32-NEXT:    s_wait_loadcnt 0x0
272; GFX12W32-NEXT:    v_readfirstlane_b32 s2, v1
273; GFX12W32-NEXT:    v_mov_b32_e32 v1, 0
274; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
275; GFX12W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
276; GFX12W32-NEXT:    s_wait_kmcnt 0x0
277; GFX12W32-NEXT:    global_store_b32 v1, v0, s[0:1]
278; GFX12W32-NEXT:    s_endpgm
279entry:
; Atomic add of the literal 5 on the buffer %inout; the three trailing i32
; operands are all zero. The value returned by the intrinsic is what gets
; written to %out.
280  %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32 5, ptr addrspace(8) %inout, i32 0, i32 0, i32 0)
281  store i32 %old, ptr addrspace(1) %out
282  ret void
283}
284
; add_i32_uniform
;
; The kernel performs a raw buffer atomic add on %inout where the addend is
; the i32 kernel argument %additive, and stores the value returned by the
; atomic (%old) to %out.
;
; What the autogenerated assertions below expect, on every tested target:
;   * a scalar is loaded from the kernel arguments up front (s_load_dword /
;     s_load_b32; presumably %additive), exec is copied into SGPRs, and
;     v_mbcnt_* is computed over that copy; only lanes whose mbcnt result is 0
;     enter the block that issues the atomic.
;   * that block issues a single buffer_atomic_add whose data operand is
;     <loaded scalar> * s_bcnt1(exec copy) (s_mul_i32).
;   * once exec is restored, the returned value is read with
;     v_readfirstlane_b32 and every lane computes
;     <loaded scalar> * mbcnt + <returned value>. The first three
;     configurations do this with v_mul_lo_u32 followed by an add; the
;     remaining ones use a single v_mad_u64_u32 / v_mad_co_u64_u32 and store
;     only the low dword of its result.
; Configurations run with +wavefrontsize32 use only v_mbcnt_lo and 32-bit
; exec_lo operations; all other configurations use v_mbcnt_lo + v_mbcnt_hi
; and 64-bit exec operations.
; The assertions are regenerated by utils/update_llc_test_checks.py; do not
; edit them by hand.
285define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(8) %inout, i32 %additive) {
286; GFX6-LABEL: add_i32_uniform:
287; GFX6:       ; %bb.0: ; %entry
288; GFX6-NEXT:    s_mov_b64 s[2:3], exec
289; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x11
290; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
291; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
292; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
293; GFX6-NEXT:    ; implicit-def: $vgpr1
294; GFX6-NEXT:    s_and_saveexec_b64 s[0:1], vcc
295; GFX6-NEXT:    s_cbranch_execz .LBB1_2
296; GFX6-NEXT:  ; %bb.1:
297; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
298; GFX6-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
299; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
300; GFX6-NEXT:    s_mul_i32 s2, s6, s2
301; GFX6-NEXT:    v_mov_b32_e32 v1, s2
302; GFX6-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
303; GFX6-NEXT:  .LBB1_2:
304; GFX6-NEXT:    s_or_b64 exec, exec, s[0:1]
305; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
306; GFX6-NEXT:    s_mov_b32 s3, 0xf000
307; GFX6-NEXT:    s_mov_b32 s2, -1
308; GFX6-NEXT:    s_waitcnt vmcnt(0)
309; GFX6-NEXT:    v_readfirstlane_b32 s4, v1
310; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
311; GFX6-NEXT:    v_mul_lo_u32 v0, s6, v0
312; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
313; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
314; GFX6-NEXT:    s_endpgm
315;
316; GFX8-LABEL: add_i32_uniform:
317; GFX8:       ; %bb.0: ; %entry
318; GFX8-NEXT:    s_load_dword s6, s[4:5], 0x44
319; GFX8-NEXT:    s_mov_b64 s[2:3], exec
320; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
321; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
322; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
323; GFX8-NEXT:    ; implicit-def: $vgpr1
324; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
325; GFX8-NEXT:    s_cbranch_execz .LBB1_2
326; GFX8-NEXT:  ; %bb.1:
327; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
328; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
329; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
330; GFX8-NEXT:    s_mul_i32 s2, s6, s2
331; GFX8-NEXT:    v_mov_b32_e32 v1, s2
332; GFX8-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
333; GFX8-NEXT:  .LBB1_2:
334; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
335; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
336; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
337; GFX8-NEXT:    v_mul_lo_u32 v0, s6, v0
338; GFX8-NEXT:    s_waitcnt vmcnt(0)
339; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
340; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s2, v0
341; GFX8-NEXT:    v_mov_b32_e32 v0, s0
342; GFX8-NEXT:    v_mov_b32_e32 v1, s1
343; GFX8-NEXT:    flat_store_dword v[0:1], v2
344; GFX8-NEXT:    s_endpgm
345;
346; GFX9-LABEL: add_i32_uniform:
347; GFX9:       ; %bb.0: ; %entry
348; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x44
349; GFX9-NEXT:    s_mov_b64 s[2:3], exec
350; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
351; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
352; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
353; GFX9-NEXT:    ; implicit-def: $vgpr1
354; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
355; GFX9-NEXT:    s_cbranch_execz .LBB1_2
356; GFX9-NEXT:  ; %bb.1:
357; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
358; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
359; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
360; GFX9-NEXT:    s_mul_i32 s2, s6, s2
361; GFX9-NEXT:    v_mov_b32_e32 v1, s2
362; GFX9-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
363; GFX9-NEXT:  .LBB1_2:
364; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
365; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
366; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
367; GFX9-NEXT:    v_mul_lo_u32 v0, s6, v0
368; GFX9-NEXT:    s_waitcnt vmcnt(0)
369; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
370; GFX9-NEXT:    v_mov_b32_e32 v2, 0
371; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
372; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
373; GFX9-NEXT:    s_endpgm
374;
375; GFX10W64-LABEL: add_i32_uniform:
376; GFX10W64:       ; %bb.0: ; %entry
377; GFX10W64-NEXT:    s_load_dword s6, s[4:5], 0x44
378; GFX10W64-NEXT:    s_mov_b64 s[2:3], exec
379; GFX10W64-NEXT:    ; implicit-def: $vgpr1
380; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
381; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
382; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
383; GFX10W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
384; GFX10W64-NEXT:    s_cbranch_execz .LBB1_2
385; GFX10W64-NEXT:  ; %bb.1:
386; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
387; GFX10W64-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
388; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
389; GFX10W64-NEXT:    s_mul_i32 s2, s6, s2
390; GFX10W64-NEXT:    v_mov_b32_e32 v1, s2
391; GFX10W64-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
392; GFX10W64-NEXT:  .LBB1_2:
393; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
394; GFX10W64-NEXT:    s_or_b64 exec, exec, s[0:1]
395; GFX10W64-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
396; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
397; GFX10W64-NEXT:    v_readfirstlane_b32 s2, v1
398; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
399; GFX10W64-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3]
400; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
401; GFX10W64-NEXT:    global_store_dword v1, v0, s[0:1]
402; GFX10W64-NEXT:    s_endpgm
403;
404; GFX10W32-LABEL: add_i32_uniform:
405; GFX10W32:       ; %bb.0: ; %entry
406; GFX10W32-NEXT:    s_load_dword s0, s[4:5], 0x44
407; GFX10W32-NEXT:    s_mov_b32 s2, exec_lo
408; GFX10W32-NEXT:    ; implicit-def: $vgpr1
409; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
410; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
411; GFX10W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
412; GFX10W32-NEXT:    s_cbranch_execz .LBB1_2
413; GFX10W32-NEXT:  ; %bb.1:
414; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
415; GFX10W32-NEXT:    s_bcnt1_i32_b32 s2, s2
416; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
417; GFX10W32-NEXT:    s_mul_i32 s2, s0, s2
418; GFX10W32-NEXT:    v_mov_b32_e32 v1, s2
419; GFX10W32-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
420; GFX10W32-NEXT:  .LBB1_2:
421; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
422; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
423; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
424; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
425; GFX10W32-NEXT:    s_mov_b32 null, 0
426; GFX10W32-NEXT:    v_readfirstlane_b32 s4, v1
427; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
428; GFX10W32-NEXT:    v_mad_u64_u32 v[0:1], s0, s0, v0, s[4:5]
429; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
430; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
431; GFX10W32-NEXT:    s_endpgm
432;
433; GFX11W64-LABEL: add_i32_uniform:
434; GFX11W64:       ; %bb.0: ; %entry
435; GFX11W64-NEXT:    s_load_b32 s6, s[4:5], 0x44
436; GFX11W64-NEXT:    s_mov_b64 s[2:3], exec
437; GFX11W64-NEXT:    s_mov_b64 s[0:1], exec
438; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
439; GFX11W64-NEXT:    ; implicit-def: $vgpr1
440; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
441; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
442; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
443; GFX11W64-NEXT:    s_cbranch_execz .LBB1_2
444; GFX11W64-NEXT:  ; %bb.1:
445; GFX11W64-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
446; GFX11W64-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
447; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
448; GFX11W64-NEXT:    s_mul_i32 s2, s6, s2
449; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
450; GFX11W64-NEXT:    v_mov_b32_e32 v1, s2
451; GFX11W64-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
452; GFX11W64-NEXT:  .LBB1_2:
453; GFX11W64-NEXT:    s_or_b64 exec, exec, s[0:1]
454; GFX11W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
455; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
456; GFX11W64-NEXT:    v_readfirstlane_b32 s2, v1
457; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
458; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
459; GFX11W64-NEXT:    v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3]
460; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
461; GFX11W64-NEXT:    global_store_b32 v0, v1, s[0:1]
462; GFX11W64-NEXT:    s_endpgm
463;
464; GFX11W32-LABEL: add_i32_uniform:
465; GFX11W32:       ; %bb.0: ; %entry
466; GFX11W32-NEXT:    s_load_b32 s0, s[4:5], 0x44
467; GFX11W32-NEXT:    s_mov_b32 s2, exec_lo
468; GFX11W32-NEXT:    s_mov_b32 s1, exec_lo
469; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
470; GFX11W32-NEXT:    ; implicit-def: $vgpr1
471; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
472; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
473; GFX11W32-NEXT:    s_cbranch_execz .LBB1_2
474; GFX11W32-NEXT:  ; %bb.1:
475; GFX11W32-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
476; GFX11W32-NEXT:    s_bcnt1_i32_b32 s2, s2
477; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
478; GFX11W32-NEXT:    s_mul_i32 s2, s0, s2
479; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
480; GFX11W32-NEXT:    v_mov_b32_e32 v1, s2
481; GFX11W32-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
482; GFX11W32-NEXT:  .LBB1_2:
483; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
484; GFX11W32-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
485; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
486; GFX11W32-NEXT:    v_readfirstlane_b32 s4, v1
487; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
488; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
489; GFX11W32-NEXT:    v_mad_u64_u32 v[1:2], null, s0, v0, s[4:5]
490; GFX11W32-NEXT:    v_mov_b32_e32 v0, 0
491; GFX11W32-NEXT:    global_store_b32 v0, v1, s[2:3]
492; GFX11W32-NEXT:    s_endpgm
493;
494; GFX12W64-LABEL: add_i32_uniform:
495; GFX12W64:       ; %bb.0: ; %entry
496; GFX12W64-NEXT:    s_load_b32 s6, s[4:5], 0x44
497; GFX12W64-NEXT:    s_mov_b64 s[2:3], exec
498; GFX12W64-NEXT:    s_mov_b64 s[0:1], exec
499; GFX12W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
500; GFX12W64-NEXT:    ; implicit-def: $vgpr1
501; GFX12W64-NEXT:    s_wait_alu 0xfffe
502; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
503; GFX12W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
504; GFX12W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
505; GFX12W64-NEXT:    s_cbranch_execz .LBB1_2
506; GFX12W64-NEXT:  ; %bb.1:
507; GFX12W64-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
508; GFX12W64-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
509; GFX12W64-NEXT:    s_wait_kmcnt 0x0
510; GFX12W64-NEXT:    s_wait_alu 0xfffe
511; GFX12W64-NEXT:    s_mul_i32 s2, s6, s2
512; GFX12W64-NEXT:    s_wait_alu 0xfffe
513; GFX12W64-NEXT:    v_mov_b32_e32 v1, s2
514; GFX12W64-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
515; GFX12W64-NEXT:  .LBB1_2:
516; GFX12W64-NEXT:    s_or_b64 exec, exec, s[0:1]
517; GFX12W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
518; GFX12W64-NEXT:    s_wait_loadcnt 0x0
519; GFX12W64-NEXT:    v_readfirstlane_b32 s2, v1
520; GFX12W64-NEXT:    s_wait_kmcnt 0x0
521; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
522; GFX12W64-NEXT:    v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3]
523; GFX12W64-NEXT:    v_mov_b32_e32 v1, 0
524; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
525; GFX12W64-NEXT:    s_endpgm
526;
527; GFX12W32-LABEL: add_i32_uniform:
528; GFX12W32:       ; %bb.0: ; %entry
529; GFX12W32-NEXT:    s_load_b32 s0, s[4:5], 0x44
530; GFX12W32-NEXT:    s_mov_b32 s2, exec_lo
531; GFX12W32-NEXT:    s_mov_b32 s1, exec_lo
532; GFX12W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
533; GFX12W32-NEXT:    ; implicit-def: $vgpr1
534; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
535; GFX12W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
536; GFX12W32-NEXT:    s_cbranch_execz .LBB1_2
537; GFX12W32-NEXT:  ; %bb.1:
538; GFX12W32-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
539; GFX12W32-NEXT:    s_wait_alu 0xfffe
540; GFX12W32-NEXT:    s_bcnt1_i32_b32 s2, s2
541; GFX12W32-NEXT:    s_wait_kmcnt 0x0
542; GFX12W32-NEXT:    s_wait_alu 0xfffe
543; GFX12W32-NEXT:    s_mul_i32 s2, s0, s2
544; GFX12W32-NEXT:    s_wait_alu 0xfffe
545; GFX12W32-NEXT:    v_mov_b32_e32 v1, s2
546; GFX12W32-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
547; GFX12W32-NEXT:  .LBB1_2:
548; GFX12W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
549; GFX12W32-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
550; GFX12W32-NEXT:    s_wait_loadcnt 0x0
551; GFX12W32-NEXT:    v_readfirstlane_b32 s4, v1
552; GFX12W32-NEXT:    s_wait_kmcnt 0x0
553; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
554; GFX12W32-NEXT:    v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5]
555; GFX12W32-NEXT:    v_mov_b32_e32 v1, 0
556; GFX12W32-NEXT:    global_store_b32 v1, v0, s[2:3]
557; GFX12W32-NEXT:    s_endpgm
558entry:
; Atomic add of the kernel argument %additive on the buffer %inout; the three
; trailing i32 operands are all zero. The value returned by the intrinsic is
; what gets written to %out.
559  %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32 %additive, ptr addrspace(8) %inout, i32 0, i32 0, i32 0)
560  store i32 %old, ptr addrspace(1) %out
561  ret void
562}
563
564define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
565; GFX6-LABEL: add_i32_varying_vdata:
566; GFX6:       ; %bb.0: ; %entry
567; GFX6-NEXT:    s_mov_b64 s[0:1], exec
568; GFX6-NEXT:    s_mov_b32 s2, 0
569; GFX6-NEXT:    ; implicit-def: $vgpr1
570; GFX6-NEXT:  .LBB2_1: ; %ComputeLoop
571; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
572; GFX6-NEXT:    s_ff1_i32_b64 s3, s[0:1]
573; GFX6-NEXT:    s_mov_b32 m0, s3
574; GFX6-NEXT:    v_readlane_b32 s8, v0, s3
575; GFX6-NEXT:    v_writelane_b32 v1, s2, m0
576; GFX6-NEXT:    s_lshl_b64 s[6:7], 1, s3
577; GFX6-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
578; GFX6-NEXT:    v_cmp_ne_u64_e64 s[6:7], s[0:1], 0
579; GFX6-NEXT:    s_and_b64 vcc, exec, s[6:7]
580; GFX6-NEXT:    s_add_i32 s2, s2, s8
581; GFX6-NEXT:    s_cbranch_vccnz .LBB2_1
582; GFX6-NEXT:  ; %bb.2: ; %ComputeEnd
583; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
584; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
585; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
586; GFX6-NEXT:    ; implicit-def: $vgpr0
587; GFX6-NEXT:    s_and_saveexec_b64 s[0:1], vcc
588; GFX6-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
589; GFX6-NEXT:    s_cbranch_execz .LBB2_4
590; GFX6-NEXT:  ; %bb.3:
591; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
592; GFX6-NEXT:    v_mov_b32_e32 v0, s2
593; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
594; GFX6-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
595; GFX6-NEXT:  .LBB2_4:
596; GFX6-NEXT:    s_or_b64 exec, exec, s[0:1]
597; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
598; GFX6-NEXT:    s_mov_b32 s3, 0xf000
599; GFX6-NEXT:    s_mov_b32 s2, -1
600; GFX6-NEXT:    s_waitcnt vmcnt(0)
601; GFX6-NEXT:    v_readfirstlane_b32 s4, v0
602; GFX6-NEXT:    s_waitcnt expcnt(0)
603; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v1
604; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
605; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
606; GFX6-NEXT:    s_endpgm
607;
608; GFX8-LABEL: add_i32_varying_vdata:
609; GFX8:       ; %bb.0: ; %entry
610; GFX8-NEXT:    s_mov_b64 s[0:1], exec
611; GFX8-NEXT:    s_mov_b32 s2, 0
612; GFX8-NEXT:    ; implicit-def: $vgpr1
613; GFX8-NEXT:  .LBB2_1: ; %ComputeLoop
614; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
615; GFX8-NEXT:    s_ff1_i32_b64 s3, s[0:1]
616; GFX8-NEXT:    s_mov_b32 m0, s3
617; GFX8-NEXT:    v_readlane_b32 s8, v0, s3
618; GFX8-NEXT:    s_lshl_b64 s[6:7], 1, s3
619; GFX8-NEXT:    v_writelane_b32 v1, s2, m0
620; GFX8-NEXT:    s_add_i32 s2, s2, s8
621; GFX8-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
622; GFX8-NEXT:    s_cmp_lg_u64 s[0:1], 0
623; GFX8-NEXT:    s_cbranch_scc1 .LBB2_1
624; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
625; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
626; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
627; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
628; GFX8-NEXT:    ; implicit-def: $vgpr0
629; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
630; GFX8-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
631; GFX8-NEXT:    s_cbranch_execz .LBB2_4
632; GFX8-NEXT:  ; %bb.3:
633; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
634; GFX8-NEXT:    v_mov_b32_e32 v0, s2
635; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
636; GFX8-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
637; GFX8-NEXT:  .LBB2_4:
638; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
639; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
640; GFX8-NEXT:    s_waitcnt vmcnt(0)
641; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
642; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s2, v1
643; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
644; GFX8-NEXT:    v_mov_b32_e32 v0, s0
645; GFX8-NEXT:    v_mov_b32_e32 v1, s1
646; GFX8-NEXT:    flat_store_dword v[0:1], v2
647; GFX8-NEXT:    s_endpgm
648;
649; GFX9-LABEL: add_i32_varying_vdata:
650; GFX9:       ; %bb.0: ; %entry
651; GFX9-NEXT:    s_mov_b64 s[0:1], exec
652; GFX9-NEXT:    s_mov_b32 s2, 0
653; GFX9-NEXT:    ; implicit-def: $vgpr1
654; GFX9-NEXT:  .LBB2_1: ; %ComputeLoop
655; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
656; GFX9-NEXT:    s_ff1_i32_b64 s3, s[0:1]
657; GFX9-NEXT:    s_mov_b32 m0, s3
658; GFX9-NEXT:    v_readlane_b32 s8, v0, s3
659; GFX9-NEXT:    s_lshl_b64 s[6:7], 1, s3
660; GFX9-NEXT:    v_writelane_b32 v1, s2, m0
661; GFX9-NEXT:    s_add_i32 s2, s2, s8
662; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
663; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
664; GFX9-NEXT:    s_cbranch_scc1 .LBB2_1
665; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
666; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
667; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
668; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
669; GFX9-NEXT:    ; implicit-def: $vgpr0
670; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
671; GFX9-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
672; GFX9-NEXT:    s_cbranch_execz .LBB2_4
673; GFX9-NEXT:  ; %bb.3:
674; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
675; GFX9-NEXT:    v_mov_b32_e32 v0, s2
676; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
677; GFX9-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
678; GFX9-NEXT:  .LBB2_4:
679; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
680; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
681; GFX9-NEXT:    s_waitcnt vmcnt(0)
682; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
683; GFX9-NEXT:    v_mov_b32_e32 v2, 0
684; GFX9-NEXT:    v_add_u32_e32 v0, s2, v1
685; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
686; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
687; GFX9-NEXT:    s_endpgm
688;
689; GFX10W64-LABEL: add_i32_varying_vdata:
690; GFX10W64:       ; %bb.0: ; %entry
691; GFX10W64-NEXT:    s_mov_b64 s[0:1], exec
692; GFX10W64-NEXT:    s_mov_b32 s2, 0
693; GFX10W64-NEXT:    ; implicit-def: $vgpr1
694; GFX10W64-NEXT:  .LBB2_1: ; %ComputeLoop
695; GFX10W64-NEXT:    ; =>This Inner Loop Header: Depth=1
696; GFX10W64-NEXT:    s_ff1_i32_b64 s3, s[0:1]
697; GFX10W64-NEXT:    v_readlane_b32 s8, v0, s3
698; GFX10W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
699; GFX10W64-NEXT:    v_writelane_b32 v1, s2, s3
700; GFX10W64-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
701; GFX10W64-NEXT:    s_add_i32 s2, s2, s8
702; GFX10W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
703; GFX10W64-NEXT:    s_cbranch_scc1 .LBB2_1
704; GFX10W64-NEXT:  ; %bb.2: ; %ComputeEnd
705; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
706; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
707; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
708; GFX10W64-NEXT:    ; implicit-def: $vgpr0
709; GFX10W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
710; GFX10W64-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
711; GFX10W64-NEXT:    s_cbranch_execz .LBB2_4
712; GFX10W64-NEXT:  ; %bb.3:
713; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
714; GFX10W64-NEXT:    v_mov_b32_e32 v0, s2
715; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
716; GFX10W64-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
717; GFX10W64-NEXT:  .LBB2_4:
718; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
719; GFX10W64-NEXT:    s_or_b64 exec, exec, s[0:1]
720; GFX10W64-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
721; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
722; GFX10W64-NEXT:    v_readfirstlane_b32 s2, v0
723; GFX10W64-NEXT:    v_mov_b32_e32 v0, 0
724; GFX10W64-NEXT:    v_add_nc_u32_e32 v1, s2, v1
725; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
726; GFX10W64-NEXT:    global_store_dword v0, v1, s[0:1]
727; GFX10W64-NEXT:    s_endpgm
728;
729; GFX10W32-LABEL: add_i32_varying_vdata:
730; GFX10W32:       ; %bb.0: ; %entry
731; GFX10W32-NEXT:    s_mov_b32 s1, exec_lo
732; GFX10W32-NEXT:    s_mov_b32 s0, 0
733; GFX10W32-NEXT:    ; implicit-def: $vgpr1
734; GFX10W32-NEXT:  .LBB2_1: ; %ComputeLoop
735; GFX10W32-NEXT:    ; =>This Inner Loop Header: Depth=1
736; GFX10W32-NEXT:    s_ff1_i32_b32 s2, s1
737; GFX10W32-NEXT:    v_readlane_b32 s3, v0, s2
738; GFX10W32-NEXT:    s_lshl_b32 s6, 1, s2
739; GFX10W32-NEXT:    v_writelane_b32 v1, s0, s2
740; GFX10W32-NEXT:    s_andn2_b32 s1, s1, s6
741; GFX10W32-NEXT:    s_add_i32 s0, s0, s3
742; GFX10W32-NEXT:    s_cmp_lg_u32 s1, 0
743; GFX10W32-NEXT:    s_cbranch_scc1 .LBB2_1
744; GFX10W32-NEXT:  ; %bb.2: ; %ComputeEnd
745; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
746; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
747; GFX10W32-NEXT:    ; implicit-def: $vgpr0
748; GFX10W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
749; GFX10W32-NEXT:    s_xor_b32 s1, exec_lo, s1
750; GFX10W32-NEXT:    s_cbranch_execz .LBB2_4
751; GFX10W32-NEXT:  ; %bb.3:
752; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
753; GFX10W32-NEXT:    v_mov_b32_e32 v0, s0
754; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
755; GFX10W32-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
756; GFX10W32-NEXT:  .LBB2_4:
757; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
758; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
759; GFX10W32-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
760; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
761; GFX10W32-NEXT:    v_readfirstlane_b32 s2, v0
762; GFX10W32-NEXT:    v_mov_b32_e32 v0, 0
763; GFX10W32-NEXT:    v_add_nc_u32_e32 v1, s2, v1
764; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
765; GFX10W32-NEXT:    global_store_dword v0, v1, s[0:1]
766; GFX10W32-NEXT:    s_endpgm
767;
768; GFX11W64-LABEL: add_i32_varying_vdata:
769; GFX11W64:       ; %bb.0: ; %entry
770; GFX11W64-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
771; GFX11W64-NEXT:    s_mov_b64 s[0:1], exec
772; GFX11W64-NEXT:    s_mov_b32 s2, 0
773; GFX11W64-NEXT:    ; implicit-def: $vgpr0
774; GFX11W64-NEXT:  .LBB2_1: ; %ComputeLoop
775; GFX11W64-NEXT:    ; =>This Inner Loop Header: Depth=1
776; GFX11W64-NEXT:    s_ctz_i32_b64 s3, s[0:1]
777; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
778; GFX11W64-NEXT:    v_readlane_b32 s8, v1, s3
779; GFX11W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
780; GFX11W64-NEXT:    v_writelane_b32 v0, s2, s3
781; GFX11W64-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
782; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
783; GFX11W64-NEXT:    s_add_i32 s2, s2, s8
784; GFX11W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
785; GFX11W64-NEXT:    s_cbranch_scc1 .LBB2_1
786; GFX11W64-NEXT:  ; %bb.2: ; %ComputeEnd
787; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
788; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
789; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
790; GFX11W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
791; GFX11W64-NEXT:    ; implicit-def: $vgpr1
792; GFX11W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
793; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
794; GFX11W64-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
795; GFX11W64-NEXT:    s_cbranch_execz .LBB2_4
796; GFX11W64-NEXT:  ; %bb.3:
797; GFX11W64-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
798; GFX11W64-NEXT:    v_mov_b32_e32 v1, s2
799; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
800; GFX11W64-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
801; GFX11W64-NEXT:  .LBB2_4:
802; GFX11W64-NEXT:    s_or_b64 exec, exec, s[0:1]
803; GFX11W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
804; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
805; GFX11W64-NEXT:    v_readfirstlane_b32 s2, v1
806; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
807; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
808; GFX11W64-NEXT:    v_add_nc_u32_e32 v0, s2, v0
809; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
810; GFX11W64-NEXT:    global_store_b32 v1, v0, s[0:1]
811; GFX11W64-NEXT:    s_endpgm
812;
813; GFX11W32-LABEL: add_i32_varying_vdata:
814; GFX11W32:       ; %bb.0: ; %entry
815; GFX11W32-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
816; GFX11W32-NEXT:    s_mov_b32 s1, exec_lo
817; GFX11W32-NEXT:    s_mov_b32 s0, 0
818; GFX11W32-NEXT:    ; implicit-def: $vgpr0
819; GFX11W32-NEXT:  .LBB2_1: ; %ComputeLoop
820; GFX11W32-NEXT:    ; =>This Inner Loop Header: Depth=1
821; GFX11W32-NEXT:    s_ctz_i32_b32 s2, s1
822; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
823; GFX11W32-NEXT:    v_readlane_b32 s3, v1, s2
824; GFX11W32-NEXT:    s_lshl_b32 s6, 1, s2
825; GFX11W32-NEXT:    v_writelane_b32 v0, s0, s2
826; GFX11W32-NEXT:    s_and_not1_b32 s1, s1, s6
827; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
828; GFX11W32-NEXT:    s_add_i32 s0, s0, s3
829; GFX11W32-NEXT:    s_cmp_lg_u32 s1, 0
830; GFX11W32-NEXT:    s_cbranch_scc1 .LBB2_1
831; GFX11W32-NEXT:  ; %bb.2: ; %ComputeEnd
832; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
833; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
834; GFX11W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
835; GFX11W32-NEXT:    ; implicit-def: $vgpr1
836; GFX11W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
837; GFX11W32-NEXT:    s_xor_b32 s1, exec_lo, s1
838; GFX11W32-NEXT:    s_cbranch_execz .LBB2_4
839; GFX11W32-NEXT:  ; %bb.3:
840; GFX11W32-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
841; GFX11W32-NEXT:    v_mov_b32_e32 v1, s0
842; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
843; GFX11W32-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
844; GFX11W32-NEXT:  .LBB2_4:
845; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
846; GFX11W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
847; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
848; GFX11W32-NEXT:    v_readfirstlane_b32 s2, v1
849; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
850; GFX11W32-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
851; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
852; GFX11W32-NEXT:    global_store_b32 v1, v0, s[0:1]
853; GFX11W32-NEXT:    s_endpgm
854;
855; GFX12W64-LABEL: add_i32_varying_vdata:
856; GFX12W64:       ; %bb.0: ; %entry
857; GFX12W64-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
858; GFX12W64-NEXT:    s_mov_b64 s[0:1], exec
859; GFX12W64-NEXT:    s_mov_b32 s2, 0
860; GFX12W64-NEXT:    ; implicit-def: $vgpr0
861; GFX12W64-NEXT:  .LBB2_1: ; %ComputeLoop
862; GFX12W64-NEXT:    ; =>This Inner Loop Header: Depth=1
863; GFX12W64-NEXT:    s_ctz_i32_b64 s3, s[0:1]
864; GFX12W64-NEXT:    s_wait_alu 0xfffe
865; GFX12W64-NEXT:    v_readlane_b32 s8, v1, s3
866; GFX12W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
867; GFX12W64-NEXT:    v_writelane_b32 v0, s2, s3
868; GFX12W64-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
869; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
870; GFX12W64-NEXT:    s_add_co_i32 s2, s2, s8
871; GFX12W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
872; GFX12W64-NEXT:    s_cbranch_scc1 .LBB2_1
873; GFX12W64-NEXT:  ; %bb.2: ; %ComputeEnd
874; GFX12W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
875; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
876; GFX12W64-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
877; GFX12W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
878; GFX12W64-NEXT:    ; implicit-def: $vgpr1
879; GFX12W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
880; GFX12W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
881; GFX12W64-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
882; GFX12W64-NEXT:    s_cbranch_execz .LBB2_4
883; GFX12W64-NEXT:  ; %bb.3:
884; GFX12W64-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
885; GFX12W64-NEXT:    s_wait_alu 0xfffe
886; GFX12W64-NEXT:    v_mov_b32_e32 v1, s2
887; GFX12W64-NEXT:    s_wait_kmcnt 0x0
888; GFX12W64-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
889; GFX12W64-NEXT:  .LBB2_4:
890; GFX12W64-NEXT:    s_or_b64 exec, exec, s[0:1]
891; GFX12W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
892; GFX12W64-NEXT:    s_wait_loadcnt 0x0
893; GFX12W64-NEXT:    v_readfirstlane_b32 s2, v1
894; GFX12W64-NEXT:    v_mov_b32_e32 v1, 0
895; GFX12W64-NEXT:    s_wait_alu 0xfffe
896; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
897; GFX12W64-NEXT:    v_add_nc_u32_e32 v0, s2, v0
898; GFX12W64-NEXT:    s_wait_kmcnt 0x0
899; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
900; GFX12W64-NEXT:    s_endpgm
901;
902; GFX12W32-LABEL: add_i32_varying_vdata:
903; GFX12W32:       ; %bb.0: ; %entry
904; GFX12W32-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
905; GFX12W32-NEXT:    s_mov_b32 s1, exec_lo
906; GFX12W32-NEXT:    s_mov_b32 s0, 0
907; GFX12W32-NEXT:    ; implicit-def: $vgpr0
908; GFX12W32-NEXT:  .LBB2_1: ; %ComputeLoop
909; GFX12W32-NEXT:    ; =>This Inner Loop Header: Depth=1
910; GFX12W32-NEXT:    s_wait_alu 0xfffe
911; GFX12W32-NEXT:    s_ctz_i32_b32 s2, s1
912; GFX12W32-NEXT:    s_wait_alu 0xfffe
913; GFX12W32-NEXT:    v_readlane_b32 s3, v1, s2
914; GFX12W32-NEXT:    s_lshl_b32 s6, 1, s2
915; GFX12W32-NEXT:    v_writelane_b32 v0, s0, s2
916; GFX12W32-NEXT:    s_and_not1_b32 s1, s1, s6
917; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
918; GFX12W32-NEXT:    s_add_co_i32 s0, s0, s3
919; GFX12W32-NEXT:    s_wait_alu 0xfffe
920; GFX12W32-NEXT:    s_cmp_lg_u32 s1, 0
921; GFX12W32-NEXT:    s_cbranch_scc1 .LBB2_1
922; GFX12W32-NEXT:  ; %bb.2: ; %ComputeEnd
923; GFX12W32-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
924; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
925; GFX12W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
926; GFX12W32-NEXT:    ; implicit-def: $vgpr1
927; GFX12W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
928; GFX12W32-NEXT:    s_wait_alu 0xfffe
929; GFX12W32-NEXT:    s_xor_b32 s1, exec_lo, s1
930; GFX12W32-NEXT:    s_cbranch_execz .LBB2_4
931; GFX12W32-NEXT:  ; %bb.3:
932; GFX12W32-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
933; GFX12W32-NEXT:    v_mov_b32_e32 v1, s0
934; GFX12W32-NEXT:    s_wait_kmcnt 0x0
935; GFX12W32-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
936; GFX12W32-NEXT:  .LBB2_4:
937; GFX12W32-NEXT:    s_wait_alu 0xfffe
938; GFX12W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
939; GFX12W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
940; GFX12W32-NEXT:    s_wait_loadcnt 0x0
941; GFX12W32-NEXT:    v_readfirstlane_b32 s2, v1
942; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
943; GFX12W32-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
944; GFX12W32-NEXT:    s_wait_kmcnt 0x0
945; GFX12W32-NEXT:    global_store_b32 v1, v0, s[0:1]
946; GFX12W32-NEXT:    s_endpgm
947entry:
948  %lane = call i32 @llvm.amdgcn.workitem.id.x()
949  %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32 %lane, ptr addrspace(8) %inout, i32 0, i32 0, i32 0)
950  store i32 %old, ptr addrspace(1) %out
951  ret void
952}
953
; add_i32_varying_offset: raw buffer atomic add of the constant 1, where the
; workitem id is passed as the third intrinsic operand (the per-lane offset,
; going by the test name -- confirm against the intrinsic definition).
; The autogenerated checks below expect, for every run configuration, a single
; buffer_atomic_add using `offen` with v0 as the offset register (masked with
; 0x3ff first on the gfx1100 and gfx1200 runs) and no mbcnt / readfirstlane
; sequence around it, i.e. the atomic is emitted once for all lanes as written.
954define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
955; GFX6-LABEL: add_i32_varying_offset:
956; GFX6:       ; %bb.0: ; %entry
957; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xd
958; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
959; GFX6-NEXT:    v_mov_b32_e32 v1, 1
960; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
961; GFX6-NEXT:    buffer_atomic_add v1, v0, s[0:3], 0 offen glc
962; GFX6-NEXT:    s_mov_b32 s7, 0xf000
963; GFX6-NEXT:    s_mov_b32 s6, -1
964; GFX6-NEXT:    s_waitcnt vmcnt(0)
965; GFX6-NEXT:    buffer_store_dword v1, off, s[4:7], 0
966; GFX6-NEXT:    s_endpgm
967;
968; GFX8-LABEL: add_i32_varying_offset:
969; GFX8:       ; %bb.0: ; %entry
970; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
971; GFX8-NEXT:    v_mov_b32_e32 v2, 1
972; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
973; GFX8-NEXT:    buffer_atomic_add v2, v0, s[0:3], 0 offen glc
974; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
975; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
976; GFX8-NEXT:    v_mov_b32_e32 v0, s0
977; GFX8-NEXT:    v_mov_b32_e32 v1, s1
978; GFX8-NEXT:    s_waitcnt vmcnt(0)
979; GFX8-NEXT:    flat_store_dword v[0:1], v2
980; GFX8-NEXT:    s_endpgm
981;
982; GFX9-LABEL: add_i32_varying_offset:
983; GFX9:       ; %bb.0: ; %entry
984; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
985; GFX9-NEXT:    v_mov_b32_e32 v1, 1
986; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
987; GFX9-NEXT:    buffer_atomic_add v1, v0, s[0:3], 0 offen glc
988; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
989; GFX9-NEXT:    v_mov_b32_e32 v0, 0
990; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
991; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
992; GFX9-NEXT:    s_endpgm
993;
994; GFX10-LABEL: add_i32_varying_offset:
995; GFX10:       ; %bb.0: ; %entry
996; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
997; GFX10-NEXT:    v_mov_b32_e32 v1, 1
998; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
999; GFX10-NEXT:    buffer_atomic_add v1, v0, s[0:3], 0 offen glc
1000; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
1001; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1002; GFX10-NEXT:    v_mov_b32_e32 v0, 0
1003; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1004; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
1005; GFX10-NEXT:    s_endpgm
1006;
1007; GFX11W64-LABEL: add_i32_varying_offset:
1008; GFX11W64:       ; %bb.0: ; %entry
1009; GFX11W64-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
1010; GFX11W64-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1011; GFX11W64-NEXT:    v_mov_b32_e32 v1, 1
1012; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
1013; GFX11W64-NEXT:    buffer_atomic_add_u32 v1, v0, s[0:3], 0 offen glc
1014; GFX11W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1015; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
1016; GFX11W64-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1017; GFX11W64-NEXT:    global_store_b32 v0, v1, s[0:1]
1018; GFX11W64-NEXT:    s_endpgm
1019;
1020; GFX11W32-LABEL: add_i32_varying_offset:
1021; GFX11W32:       ; %bb.0: ; %entry
1022; GFX11W32-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
1023; GFX11W32-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
1024; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
1025; GFX11W32-NEXT:    buffer_atomic_add_u32 v1, v0, s[0:3], 0 offen glc
1026; GFX11W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1027; GFX11W32-NEXT:    v_mov_b32_e32 v0, 0
1028; GFX11W32-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1029; GFX11W32-NEXT:    global_store_b32 v0, v1, s[0:1]
1030; GFX11W32-NEXT:    s_endpgm
1031;
1032; GFX12W64-LABEL: add_i32_varying_offset:
1033; GFX12W64:       ; %bb.0: ; %entry
1034; GFX12W64-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
1035; GFX12W64-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1036; GFX12W64-NEXT:    v_mov_b32_e32 v1, 1
1037; GFX12W64-NEXT:    s_wait_kmcnt 0x0
1038; GFX12W64-NEXT:    buffer_atomic_add_u32 v1, v0, s[0:3], null offen th:TH_ATOMIC_RETURN
1039; GFX12W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1040; GFX12W64-NEXT:    v_mov_b32_e32 v0, 0
1041; GFX12W64-NEXT:    s_wait_loadcnt 0x0
1042; GFX12W64-NEXT:    s_wait_kmcnt 0x0
1043; GFX12W64-NEXT:    global_store_b32 v0, v1, s[0:1]
1044; GFX12W64-NEXT:    s_endpgm
1045;
1046; GFX12W32-LABEL: add_i32_varying_offset:
1047; GFX12W32:       ; %bb.0: ; %entry
1048; GFX12W32-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
1049; GFX12W32-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
1050; GFX12W32-NEXT:    s_wait_kmcnt 0x0
1051; GFX12W32-NEXT:    buffer_atomic_add_u32 v1, v0, s[0:3], null offen th:TH_ATOMIC_RETURN
1052; GFX12W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1053; GFX12W32-NEXT:    v_mov_b32_e32 v0, 0
1054; GFX12W32-NEXT:    s_wait_loadcnt 0x0
1055; GFX12W32-NEXT:    s_wait_kmcnt 0x0
1056; GFX12W32-NEXT:    global_store_b32 v0, v1, s[0:1]
1057; GFX12W32-NEXT:    s_endpgm
1058entry:
  ; The added value is the constant 1; the workitem id goes into the third
  ; operand, and the two trailing operands are 0.
1059  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1060  %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32 1, ptr addrspace(8) %inout, i32 %lane, i32 0, i32 0)
1061  store i32 %old, ptr addrspace(1) %out
1062  ret void
1063}
1064
; sub_i32_constant: raw buffer atomic sub of the constant 5 with the three
; trailing intrinsic operands all 0; the returned value is stored to %out.
; The autogenerated checks below expect the same shape on every run
; configuration:
;   * exec is copied to SGPRs and v_mbcnt_lo (followed by v_mbcnt_hi where the
;     copy is 64 bits wide) over that copy produces v0;
;   * only lanes with v0 == 0 (v_cmp_eq + s_and_saveexec, or v_cmpx_eq on the
;     gfx1100 and gfx1200 runs) enter the block that issues the atomic;
;   * inside that block the atomic operand is s_bcnt1 of the exec copy
;     multiplied by 5;
;   * after exec is restored, every lane takes v_readfirstlane of the returned
;     value and subtracts 5 * v0 from it before the store.
1065define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
1066; GFX6-LABEL: sub_i32_constant:
1067; GFX6:       ; %bb.0: ; %entry
1068; GFX6-NEXT:    s_mov_b64 s[2:3], exec
1069; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1070; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
1071; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1072; GFX6-NEXT:    ; implicit-def: $vgpr1
1073; GFX6-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1074; GFX6-NEXT:    s_cbranch_execz .LBB4_2
1075; GFX6-NEXT:  ; %bb.1:
1076; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
1077; GFX6-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1078; GFX6-NEXT:    s_mul_i32 s2, s2, 5
1079; GFX6-NEXT:    v_mov_b32_e32 v1, s2
1080; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1081; GFX6-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1082; GFX6-NEXT:  .LBB4_2:
1083; GFX6-NEXT:    s_or_b64 exec, exec, s[0:1]
1084; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1085; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1086; GFX6-NEXT:    s_mov_b32 s2, -1
1087; GFX6-NEXT:    s_waitcnt vmcnt(0)
1088; GFX6-NEXT:    v_readfirstlane_b32 s4, v1
1089; GFX6-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1090; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
1091; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1092; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1093; GFX6-NEXT:    s_endpgm
1094;
1095; GFX8-LABEL: sub_i32_constant:
1096; GFX8:       ; %bb.0: ; %entry
1097; GFX8-NEXT:    s_mov_b64 s[2:3], exec
1098; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1099; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1100; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1101; GFX8-NEXT:    ; implicit-def: $vgpr1
1102; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1103; GFX8-NEXT:    s_cbranch_execz .LBB4_2
1104; GFX8-NEXT:  ; %bb.1:
1105; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
1106; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1107; GFX8-NEXT:    s_mul_i32 s2, s2, 5
1108; GFX8-NEXT:    v_mov_b32_e32 v1, s2
1109; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1110; GFX8-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1111; GFX8-NEXT:  .LBB4_2:
1112; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
1113; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1114; GFX8-NEXT:    s_waitcnt vmcnt(0)
1115; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
1116; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1117; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s2, v0
1118; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1119; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1120; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1121; GFX8-NEXT:    flat_store_dword v[0:1], v2
1122; GFX8-NEXT:    s_endpgm
1123;
1124; GFX9-LABEL: sub_i32_constant:
1125; GFX9:       ; %bb.0: ; %entry
1126; GFX9-NEXT:    s_mov_b64 s[2:3], exec
1127; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1128; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1129; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1130; GFX9-NEXT:    ; implicit-def: $vgpr1
1131; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1132; GFX9-NEXT:    s_cbranch_execz .LBB4_2
1133; GFX9-NEXT:  ; %bb.1:
1134; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
1135; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1136; GFX9-NEXT:    s_mul_i32 s2, s2, 5
1137; GFX9-NEXT:    v_mov_b32_e32 v1, s2
1138; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1139; GFX9-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1140; GFX9-NEXT:  .LBB4_2:
1141; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
1142; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1143; GFX9-NEXT:    s_waitcnt vmcnt(0)
1144; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
1145; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1146; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1147; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
1148; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1149; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
1150; GFX9-NEXT:    s_endpgm
1151;
1152; GFX10W64-LABEL: sub_i32_constant:
1153; GFX10W64:       ; %bb.0: ; %entry
1154; GFX10W64-NEXT:    s_mov_b64 s[2:3], exec
1155; GFX10W64-NEXT:    ; implicit-def: $vgpr1
1156; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1157; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1158; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1159; GFX10W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1160; GFX10W64-NEXT:    s_cbranch_execz .LBB4_2
1161; GFX10W64-NEXT:  ; %bb.1:
1162; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
1163; GFX10W64-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1164; GFX10W64-NEXT:    s_mul_i32 s2, s2, 5
1165; GFX10W64-NEXT:    v_mov_b32_e32 v1, s2
1166; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1167; GFX10W64-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1168; GFX10W64-NEXT:  .LBB4_2:
1169; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
1170; GFX10W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1171; GFX10W64-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1172; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
1173; GFX10W64-NEXT:    v_readfirstlane_b32 s2, v1
1174; GFX10W64-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1175; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
1176; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1177; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1178; GFX10W64-NEXT:    global_store_dword v1, v0, s[0:1]
1179; GFX10W64-NEXT:    s_endpgm
1180;
1181; GFX10W32-LABEL: sub_i32_constant:
1182; GFX10W32:       ; %bb.0: ; %entry
1183; GFX10W32-NEXT:    s_mov_b32 s1, exec_lo
1184; GFX10W32-NEXT:    ; implicit-def: $vgpr1
1185; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s1, 0
1186; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1187; GFX10W32-NEXT:    s_and_saveexec_b32 s0, vcc_lo
1188; GFX10W32-NEXT:    s_cbranch_execz .LBB4_2
1189; GFX10W32-NEXT:  ; %bb.1:
1190; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
1191; GFX10W32-NEXT:    s_bcnt1_i32_b32 s1, s1
1192; GFX10W32-NEXT:    s_mul_i32 s1, s1, 5
1193; GFX10W32-NEXT:    v_mov_b32_e32 v1, s1
1194; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1195; GFX10W32-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1196; GFX10W32-NEXT:  .LBB4_2:
1197; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
1198; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1199; GFX10W32-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1200; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
1201; GFX10W32-NEXT:    v_readfirstlane_b32 s2, v1
1202; GFX10W32-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1203; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
1204; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1205; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1206; GFX10W32-NEXT:    global_store_dword v1, v0, s[0:1]
1207; GFX10W32-NEXT:    s_endpgm
1208;
1209; GFX11W64-LABEL: sub_i32_constant:
1210; GFX11W64:       ; %bb.0: ; %entry
1211; GFX11W64-NEXT:    s_mov_b64 s[2:3], exec
1212; GFX11W64-NEXT:    s_mov_b64 s[0:1], exec
1213; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1214; GFX11W64-NEXT:    ; implicit-def: $vgpr1
1215; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1216; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1217; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
1218; GFX11W64-NEXT:    s_cbranch_execz .LBB4_2
1219; GFX11W64-NEXT:  ; %bb.1:
1220; GFX11W64-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
1221; GFX11W64-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1222; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1223; GFX11W64-NEXT:    s_mul_i32 s2, s2, 5
1224; GFX11W64-NEXT:    v_mov_b32_e32 v1, s2
1225; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
1226; GFX11W64-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
1227; GFX11W64-NEXT:  .LBB4_2:
1228; GFX11W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1229; GFX11W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1230; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
1231; GFX11W64-NEXT:    v_readfirstlane_b32 s2, v1
1232; GFX11W64-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1233; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
1234; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1235; GFX11W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1236; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
1237; GFX11W64-NEXT:    global_store_b32 v1, v0, s[0:1]
1238; GFX11W64-NEXT:    s_endpgm
1239;
1240; GFX11W32-LABEL: sub_i32_constant:
1241; GFX11W32:       ; %bb.0: ; %entry
1242; GFX11W32-NEXT:    s_mov_b32 s1, exec_lo
1243; GFX11W32-NEXT:    s_mov_b32 s0, exec_lo
1244; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s1, 0
1245; GFX11W32-NEXT:    ; implicit-def: $vgpr1
1246; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1247; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
1248; GFX11W32-NEXT:    s_cbranch_execz .LBB4_2
1249; GFX11W32-NEXT:  ; %bb.1:
1250; GFX11W32-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
1251; GFX11W32-NEXT:    s_bcnt1_i32_b32 s1, s1
1252; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1253; GFX11W32-NEXT:    s_mul_i32 s1, s1, 5
1254; GFX11W32-NEXT:    v_mov_b32_e32 v1, s1
1255; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
1256; GFX11W32-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
1257; GFX11W32-NEXT:  .LBB4_2:
1258; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1259; GFX11W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1260; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
1261; GFX11W32-NEXT:    v_readfirstlane_b32 s2, v1
1262; GFX11W32-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1263; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
1264; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1265; GFX11W32-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1266; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
1267; GFX11W32-NEXT:    global_store_b32 v1, v0, s[0:1]
1268; GFX11W32-NEXT:    s_endpgm
1269;
1270; GFX12W64-LABEL: sub_i32_constant:
1271; GFX12W64:       ; %bb.0: ; %entry
1272; GFX12W64-NEXT:    s_mov_b64 s[2:3], exec
1273; GFX12W64-NEXT:    s_mov_b64 s[0:1], exec
1274; GFX12W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1275; GFX12W64-NEXT:    ; implicit-def: $vgpr1
1276; GFX12W64-NEXT:    s_wait_alu 0xfffe
1277; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1278; GFX12W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1279; GFX12W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
1280; GFX12W64-NEXT:    s_cbranch_execz .LBB4_2
1281; GFX12W64-NEXT:  ; %bb.1:
1282; GFX12W64-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
1283; GFX12W64-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1284; GFX12W64-NEXT:    s_wait_alu 0xfffe
1285; GFX12W64-NEXT:    s_mul_i32 s2, s2, 5
1286; GFX12W64-NEXT:    s_wait_alu 0xfffe
1287; GFX12W64-NEXT:    v_mov_b32_e32 v1, s2
1288; GFX12W64-NEXT:    s_wait_kmcnt 0x0
1289; GFX12W64-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
1290; GFX12W64-NEXT:  .LBB4_2:
1291; GFX12W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1292; GFX12W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1293; GFX12W64-NEXT:    s_wait_loadcnt 0x0
1294; GFX12W64-NEXT:    v_readfirstlane_b32 s2, v1
1295; GFX12W64-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1296; GFX12W64-NEXT:    v_mov_b32_e32 v1, 0
1297; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1298; GFX12W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1299; GFX12W64-NEXT:    s_wait_kmcnt 0x0
1300; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
1301; GFX12W64-NEXT:    s_endpgm
1302;
1303; GFX12W32-LABEL: sub_i32_constant:
1304; GFX12W32:       ; %bb.0: ; %entry
1305; GFX12W32-NEXT:    s_mov_b32 s1, exec_lo
1306; GFX12W32-NEXT:    s_mov_b32 s0, exec_lo
1307; GFX12W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s1, 0
1308; GFX12W32-NEXT:    ; implicit-def: $vgpr1
1309; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1310; GFX12W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
1311; GFX12W32-NEXT:    s_cbranch_execz .LBB4_2
1312; GFX12W32-NEXT:  ; %bb.1:
1313; GFX12W32-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
1314; GFX12W32-NEXT:    s_wait_alu 0xfffe
1315; GFX12W32-NEXT:    s_bcnt1_i32_b32 s1, s1
1316; GFX12W32-NEXT:    s_wait_alu 0xfffe
1317; GFX12W32-NEXT:    s_mul_i32 s1, s1, 5
1318; GFX12W32-NEXT:    s_wait_alu 0xfffe
1319; GFX12W32-NEXT:    v_mov_b32_e32 v1, s1
1320; GFX12W32-NEXT:    s_wait_kmcnt 0x0
1321; GFX12W32-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
1322; GFX12W32-NEXT:  .LBB4_2:
1323; GFX12W32-NEXT:    s_wait_alu 0xfffe
1324; GFX12W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1325; GFX12W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1326; GFX12W32-NEXT:    s_wait_loadcnt 0x0
1327; GFX12W32-NEXT:    v_readfirstlane_b32 s2, v1
1328; GFX12W32-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1329; GFX12W32-NEXT:    v_mov_b32_e32 v1, 0
1330; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1331; GFX12W32-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1332; GFX12W32-NEXT:    s_wait_kmcnt 0x0
1333; GFX12W32-NEXT:    global_store_b32 v1, v0, s[0:1]
1334; GFX12W32-NEXT:    s_endpgm
1335entry:
  ; Every operand other than the buffer resource is a compile-time constant
  ; here: the subtracted value is 5 and the three trailing operands are 0.
1336  %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.sub(i32 5, ptr addrspace(8) %inout, i32 0, i32 0, i32 0)
1337  store i32 %old, ptr addrspace(1) %out
1338  ret void
1339}
1340
; sub_i32_uniform: raw-buffer atomic sub whose operand is the kernel argument
; %subitive; the value returned by the atomic is stored to %out.
; The expected code has the same shape on every target: v_mbcnt over a copy of
; exec gives each lane its index among the active lanes, only the lane whose
; index is 0 issues a single buffer_atomic_sub with %subitive multiplied by the
; active-lane count (s_bcnt1 + s_mul_i32), and afterwards every lane rebuilds
; its own result as readfirstlane(returned value) - %subitive * mbcnt.
; The check lines below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
1341define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(8) %inout, i32 %subitive) {
1342; GFX6-LABEL: sub_i32_uniform:
1343; GFX6:       ; %bb.0: ; %entry
1344; GFX6-NEXT:    s_mov_b64 s[2:3], exec
1345; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x11
1346; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1347; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
1348; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1349; GFX6-NEXT:    ; implicit-def: $vgpr1
1350; GFX6-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1351; GFX6-NEXT:    s_cbranch_execz .LBB5_2
1352; GFX6-NEXT:  ; %bb.1:
1353; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
1354; GFX6-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1355; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1356; GFX6-NEXT:    s_mul_i32 s2, s6, s2
1357; GFX6-NEXT:    v_mov_b32_e32 v1, s2
1358; GFX6-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1359; GFX6-NEXT:  .LBB5_2:
1360; GFX6-NEXT:    s_or_b64 exec, exec, s[0:1]
1361; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1362; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1363; GFX6-NEXT:    s_mov_b32 s2, -1
1364; GFX6-NEXT:    s_waitcnt vmcnt(0)
1365; GFX6-NEXT:    v_readfirstlane_b32 s4, v1
1366; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1367; GFX6-NEXT:    v_mul_lo_u32 v0, s6, v0
1368; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
1369; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1370; GFX6-NEXT:    s_endpgm
1371;
1372; GFX8-LABEL: sub_i32_uniform:
1373; GFX8:       ; %bb.0: ; %entry
1374; GFX8-NEXT:    s_load_dword s6, s[4:5], 0x44
1375; GFX8-NEXT:    s_mov_b64 s[2:3], exec
1376; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1377; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1378; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1379; GFX8-NEXT:    ; implicit-def: $vgpr1
1380; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1381; GFX8-NEXT:    s_cbranch_execz .LBB5_2
1382; GFX8-NEXT:  ; %bb.1:
1383; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
1384; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1385; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1386; GFX8-NEXT:    s_mul_i32 s2, s6, s2
1387; GFX8-NEXT:    v_mov_b32_e32 v1, s2
1388; GFX8-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1389; GFX8-NEXT:  .LBB5_2:
1390; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
1391; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1392; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1393; GFX8-NEXT:    v_mul_lo_u32 v0, s6, v0
1394; GFX8-NEXT:    s_waitcnt vmcnt(0)
1395; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
1396; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s2, v0
1397; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1398; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1399; GFX8-NEXT:    flat_store_dword v[0:1], v2
1400; GFX8-NEXT:    s_endpgm
1401;
1402; GFX9-LABEL: sub_i32_uniform:
1403; GFX9:       ; %bb.0: ; %entry
1404; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x44
1405; GFX9-NEXT:    s_mov_b64 s[2:3], exec
1406; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1407; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1408; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1409; GFX9-NEXT:    ; implicit-def: $vgpr1
1410; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1411; GFX9-NEXT:    s_cbranch_execz .LBB5_2
1412; GFX9-NEXT:  ; %bb.1:
1413; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
1414; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1415; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1416; GFX9-NEXT:    s_mul_i32 s2, s6, s2
1417; GFX9-NEXT:    v_mov_b32_e32 v1, s2
1418; GFX9-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1419; GFX9-NEXT:  .LBB5_2:
1420; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
1421; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1422; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1423; GFX9-NEXT:    v_mul_lo_u32 v0, s6, v0
1424; GFX9-NEXT:    s_waitcnt vmcnt(0)
1425; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
1426; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1427; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
1428; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
1429; GFX9-NEXT:    s_endpgm
1430;
1431; GFX10W64-LABEL: sub_i32_uniform:
1432; GFX10W64:       ; %bb.0: ; %entry
1433; GFX10W64-NEXT:    s_load_dword s6, s[4:5], 0x44
1434; GFX10W64-NEXT:    s_mov_b64 s[2:3], exec
1435; GFX10W64-NEXT:    ; implicit-def: $vgpr1
1436; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1437; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1438; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1439; GFX10W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1440; GFX10W64-NEXT:    s_cbranch_execz .LBB5_2
1441; GFX10W64-NEXT:  ; %bb.1:
1442; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
1443; GFX10W64-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1444; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1445; GFX10W64-NEXT:    s_mul_i32 s2, s6, s2
1446; GFX10W64-NEXT:    v_mov_b32_e32 v1, s2
1447; GFX10W64-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1448; GFX10W64-NEXT:  .LBB5_2:
1449; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
1450; GFX10W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1451; GFX10W64-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1452; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1453; GFX10W64-NEXT:    v_mul_lo_u32 v0, s6, v0
1454; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
1455; GFX10W64-NEXT:    v_readfirstlane_b32 s2, v1
1456; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
1457; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1458; GFX10W64-NEXT:    global_store_dword v1, v0, s[0:1]
1459; GFX10W64-NEXT:    s_endpgm
1460;
1461; GFX10W32-LABEL: sub_i32_uniform:
1462; GFX10W32:       ; %bb.0: ; %entry
1463; GFX10W32-NEXT:    s_load_dword s0, s[4:5], 0x44
1464; GFX10W32-NEXT:    s_mov_b32 s2, exec_lo
1465; GFX10W32-NEXT:    ; implicit-def: $vgpr1
1466; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1467; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1468; GFX10W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
1469; GFX10W32-NEXT:    s_cbranch_execz .LBB5_2
1470; GFX10W32-NEXT:  ; %bb.1:
1471; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
1472; GFX10W32-NEXT:    s_bcnt1_i32_b32 s2, s2
1473; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1474; GFX10W32-NEXT:    s_mul_i32 s2, s0, s2
1475; GFX10W32-NEXT:    v_mov_b32_e32 v1, s2
1476; GFX10W32-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1477; GFX10W32-NEXT:  .LBB5_2:
1478; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
1479; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
1480; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
1481; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1482; GFX10W32-NEXT:    v_mul_lo_u32 v0, s0, v0
1483; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
1484; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
1485; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
1486; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1487; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
1488; GFX10W32-NEXT:    s_endpgm
1489;
1490; GFX11W64-LABEL: sub_i32_uniform:
1491; GFX11W64:       ; %bb.0: ; %entry
1492; GFX11W64-NEXT:    s_load_b32 s6, s[4:5], 0x44
1493; GFX11W64-NEXT:    s_mov_b64 s[2:3], exec
1494; GFX11W64-NEXT:    s_mov_b64 s[0:1], exec
1495; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1496; GFX11W64-NEXT:    ; implicit-def: $vgpr1
1497; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1498; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1499; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
1500; GFX11W64-NEXT:    s_cbranch_execz .LBB5_2
1501; GFX11W64-NEXT:  ; %bb.1:
1502; GFX11W64-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
1503; GFX11W64-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1504; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
1505; GFX11W64-NEXT:    s_mul_i32 s2, s6, s2
1506; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1507; GFX11W64-NEXT:    v_mov_b32_e32 v1, s2
1508; GFX11W64-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
1509; GFX11W64-NEXT:  .LBB5_2:
1510; GFX11W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1511; GFX11W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1512; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
1513; GFX11W64-NEXT:    v_mul_lo_u32 v0, s6, v0
1514; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
1515; GFX11W64-NEXT:    v_readfirstlane_b32 s2, v1
1516; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
1517; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1518; GFX11W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1519; GFX11W64-NEXT:    global_store_b32 v1, v0, s[0:1]
1520; GFX11W64-NEXT:    s_endpgm
1521;
1522; GFX11W32-LABEL: sub_i32_uniform:
1523; GFX11W32:       ; %bb.0: ; %entry
1524; GFX11W32-NEXT:    s_load_b32 s0, s[4:5], 0x44
1525; GFX11W32-NEXT:    s_mov_b32 s2, exec_lo
1526; GFX11W32-NEXT:    s_mov_b32 s1, exec_lo
1527; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1528; GFX11W32-NEXT:    ; implicit-def: $vgpr1
1529; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1530; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
1531; GFX11W32-NEXT:    s_cbranch_execz .LBB5_2
1532; GFX11W32-NEXT:  ; %bb.1:
1533; GFX11W32-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
1534; GFX11W32-NEXT:    s_bcnt1_i32_b32 s2, s2
1535; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
1536; GFX11W32-NEXT:    s_mul_i32 s2, s0, s2
1537; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1538; GFX11W32-NEXT:    v_mov_b32_e32 v1, s2
1539; GFX11W32-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
1540; GFX11W32-NEXT:  .LBB5_2:
1541; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
1542; GFX11W32-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
1543; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
1544; GFX11W32-NEXT:    v_mul_lo_u32 v0, s0, v0
1545; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
1546; GFX11W32-NEXT:    v_readfirstlane_b32 s0, v1
1547; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
1548; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1549; GFX11W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1550; GFX11W32-NEXT:    global_store_b32 v1, v0, s[2:3]
1551; GFX11W32-NEXT:    s_endpgm
1552;
1553; GFX12W64-LABEL: sub_i32_uniform:
1554; GFX12W64:       ; %bb.0: ; %entry
1555; GFX12W64-NEXT:    s_load_b32 s6, s[4:5], 0x44
1556; GFX12W64-NEXT:    s_mov_b64 s[2:3], exec
1557; GFX12W64-NEXT:    s_mov_b64 s[0:1], exec
1558; GFX12W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1559; GFX12W64-NEXT:    ; implicit-def: $vgpr1
1560; GFX12W64-NEXT:    s_wait_alu 0xfffe
1561; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1562; GFX12W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1563; GFX12W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
1564; GFX12W64-NEXT:    s_cbranch_execz .LBB5_2
1565; GFX12W64-NEXT:  ; %bb.1:
1566; GFX12W64-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
1567; GFX12W64-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1568; GFX12W64-NEXT:    s_wait_kmcnt 0x0
1569; GFX12W64-NEXT:    s_wait_alu 0xfffe
1570; GFX12W64-NEXT:    s_mul_i32 s2, s6, s2
1571; GFX12W64-NEXT:    s_wait_alu 0xfffe
1572; GFX12W64-NEXT:    v_mov_b32_e32 v1, s2
1573; GFX12W64-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
1574; GFX12W64-NEXT:  .LBB5_2:
1575; GFX12W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1576; GFX12W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1577; GFX12W64-NEXT:    s_wait_kmcnt 0x0
1578; GFX12W64-NEXT:    v_mul_lo_u32 v0, s6, v0
1579; GFX12W64-NEXT:    s_wait_loadcnt 0x0
1580; GFX12W64-NEXT:    v_readfirstlane_b32 s2, v1
1581; GFX12W64-NEXT:    v_mov_b32_e32 v1, 0
1582; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1583; GFX12W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1584; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
1585; GFX12W64-NEXT:    s_endpgm
1586;
1587; GFX12W32-LABEL: sub_i32_uniform:
1588; GFX12W32:       ; %bb.0: ; %entry
1589; GFX12W32-NEXT:    s_load_b32 s0, s[4:5], 0x44
1590; GFX12W32-NEXT:    s_mov_b32 s2, exec_lo
1591; GFX12W32-NEXT:    s_mov_b32 s1, exec_lo
1592; GFX12W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1593; GFX12W32-NEXT:    ; implicit-def: $vgpr1
1594; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1595; GFX12W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
1596; GFX12W32-NEXT:    s_cbranch_execz .LBB5_2
1597; GFX12W32-NEXT:  ; %bb.1:
1598; GFX12W32-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
1599; GFX12W32-NEXT:    s_wait_alu 0xfffe
1600; GFX12W32-NEXT:    s_bcnt1_i32_b32 s2, s2
1601; GFX12W32-NEXT:    s_wait_kmcnt 0x0
1602; GFX12W32-NEXT:    s_wait_alu 0xfffe
1603; GFX12W32-NEXT:    s_mul_i32 s2, s0, s2
1604; GFX12W32-NEXT:    s_wait_alu 0xfffe
1605; GFX12W32-NEXT:    v_mov_b32_e32 v1, s2
1606; GFX12W32-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
1607; GFX12W32-NEXT:  .LBB5_2:
1608; GFX12W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
1609; GFX12W32-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
1610; GFX12W32-NEXT:    s_wait_kmcnt 0x0
1611; GFX12W32-NEXT:    v_mul_lo_u32 v0, s0, v0
1612; GFX12W32-NEXT:    s_wait_loadcnt 0x0
1613; GFX12W32-NEXT:    v_readfirstlane_b32 s0, v1
1614; GFX12W32-NEXT:    v_mov_b32_e32 v1, 0
1615; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1616; GFX12W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1617; GFX12W32-NEXT:    global_store_b32 v1, v0, s[2:3]
1618; GFX12W32-NEXT:    s_endpgm
1619entry:
  ; Operand is the i32 kernel argument; the three trailing i32 operands are all 0.
1620  %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.sub(i32 %subitive, ptr addrspace(8) %inout, i32 0, i32 0, i32 0)
1621  store i32 %old, ptr addrspace(1) %out
1622  ret void
1623}
1624
; sub_i32_varying_vdata: raw-buffer atomic sub whose operand is the workitem id
; (%lane); the value returned by the atomic is stored to %out.
; With the Iterative strategy selected on the RUN lines, the expected code has
; a %ComputeLoop that handles one active lane per iteration: it picks the
; lowest remaining bit of a copy of exec (s_ff1 / s_ctz), reads that lane's
; operand (v_readlane), writes the running sum accumulated so far into that
; lane (v_writelane), clears the bit, and adds the operand to the running sum.
; After the loop the lane whose v_mbcnt index is 0 issues one buffer_atomic_sub
; with the total, and every lane computes readfirstlane(returned value) minus
; the partial sum that v_writelane left in it.
; The check lines below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
1625define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
1626; GFX6-LABEL: sub_i32_varying_vdata:
1627; GFX6:       ; %bb.0: ; %entry
1628; GFX6-NEXT:    s_mov_b64 s[0:1], exec
1629; GFX6-NEXT:    s_mov_b32 s2, 0
1630; GFX6-NEXT:    ; implicit-def: $vgpr1
1631; GFX6-NEXT:  .LBB6_1: ; %ComputeLoop
1632; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
1633; GFX6-NEXT:    s_ff1_i32_b64 s3, s[0:1]
1634; GFX6-NEXT:    s_mov_b32 m0, s3
1635; GFX6-NEXT:    v_readlane_b32 s8, v0, s3
1636; GFX6-NEXT:    v_writelane_b32 v1, s2, m0
1637; GFX6-NEXT:    s_lshl_b64 s[6:7], 1, s3
1638; GFX6-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
1639; GFX6-NEXT:    v_cmp_ne_u64_e64 s[6:7], s[0:1], 0
1640; GFX6-NEXT:    s_and_b64 vcc, exec, s[6:7]
1641; GFX6-NEXT:    s_add_i32 s2, s2, s8
1642; GFX6-NEXT:    s_cbranch_vccnz .LBB6_1
1643; GFX6-NEXT:  ; %bb.2: ; %ComputeEnd
1644; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
1645; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
1646; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1647; GFX6-NEXT:    ; implicit-def: $vgpr0
1648; GFX6-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1649; GFX6-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
1650; GFX6-NEXT:    s_cbranch_execz .LBB6_4
1651; GFX6-NEXT:  ; %bb.3:
1652; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
1653; GFX6-NEXT:    v_mov_b32_e32 v0, s2
1654; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1655; GFX6-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
1656; GFX6-NEXT:  .LBB6_4:
1657; GFX6-NEXT:    s_or_b64 exec, exec, s[0:1]
1658; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1659; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1660; GFX6-NEXT:    s_mov_b32 s2, -1
1661; GFX6-NEXT:    s_waitcnt vmcnt(0)
1662; GFX6-NEXT:    v_readfirstlane_b32 s4, v0
1663; GFX6-NEXT:    s_waitcnt expcnt(0)
1664; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v1
1665; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1666; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1667; GFX6-NEXT:    s_endpgm
1668;
1669; GFX8-LABEL: sub_i32_varying_vdata:
1670; GFX8:       ; %bb.0: ; %entry
1671; GFX8-NEXT:    s_mov_b64 s[0:1], exec
1672; GFX8-NEXT:    s_mov_b32 s2, 0
1673; GFX8-NEXT:    ; implicit-def: $vgpr1
1674; GFX8-NEXT:  .LBB6_1: ; %ComputeLoop
1675; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
1676; GFX8-NEXT:    s_ff1_i32_b64 s3, s[0:1]
1677; GFX8-NEXT:    s_mov_b32 m0, s3
1678; GFX8-NEXT:    v_readlane_b32 s8, v0, s3
1679; GFX8-NEXT:    s_lshl_b64 s[6:7], 1, s3
1680; GFX8-NEXT:    v_writelane_b32 v1, s2, m0
1681; GFX8-NEXT:    s_add_i32 s2, s2, s8
1682; GFX8-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
1683; GFX8-NEXT:    s_cmp_lg_u64 s[0:1], 0
1684; GFX8-NEXT:    s_cbranch_scc1 .LBB6_1
1685; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
1686; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1687; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1688; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1689; GFX8-NEXT:    ; implicit-def: $vgpr0
1690; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1691; GFX8-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
1692; GFX8-NEXT:    s_cbranch_execz .LBB6_4
1693; GFX8-NEXT:  ; %bb.3:
1694; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
1695; GFX8-NEXT:    v_mov_b32_e32 v0, s2
1696; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1697; GFX8-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
1698; GFX8-NEXT:  .LBB6_4:
1699; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
1700; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1701; GFX8-NEXT:    s_waitcnt vmcnt(0)
1702; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
1703; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s2, v1
1704; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1705; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1706; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1707; GFX8-NEXT:    flat_store_dword v[0:1], v2
1708; GFX8-NEXT:    s_endpgm
1709;
1710; GFX9-LABEL: sub_i32_varying_vdata:
1711; GFX9:       ; %bb.0: ; %entry
1712; GFX9-NEXT:    s_mov_b64 s[0:1], exec
1713; GFX9-NEXT:    s_mov_b32 s2, 0
1714; GFX9-NEXT:    ; implicit-def: $vgpr1
1715; GFX9-NEXT:  .LBB6_1: ; %ComputeLoop
1716; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
1717; GFX9-NEXT:    s_ff1_i32_b64 s3, s[0:1]
1718; GFX9-NEXT:    s_mov_b32 m0, s3
1719; GFX9-NEXT:    v_readlane_b32 s8, v0, s3
1720; GFX9-NEXT:    s_lshl_b64 s[6:7], 1, s3
1721; GFX9-NEXT:    v_writelane_b32 v1, s2, m0
1722; GFX9-NEXT:    s_add_i32 s2, s2, s8
1723; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
1724; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
1725; GFX9-NEXT:    s_cbranch_scc1 .LBB6_1
1726; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
1727; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1728; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1729; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1730; GFX9-NEXT:    ; implicit-def: $vgpr0
1731; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1732; GFX9-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
1733; GFX9-NEXT:    s_cbranch_execz .LBB6_4
1734; GFX9-NEXT:  ; %bb.3:
1735; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
1736; GFX9-NEXT:    v_mov_b32_e32 v0, s2
1737; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1738; GFX9-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
1739; GFX9-NEXT:  .LBB6_4:
1740; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
1741; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1742; GFX9-NEXT:    s_waitcnt vmcnt(0)
1743; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
1744; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1745; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v1
1746; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1747; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
1748; GFX9-NEXT:    s_endpgm
1749;
1750; GFX10W64-LABEL: sub_i32_varying_vdata:
1751; GFX10W64:       ; %bb.0: ; %entry
1752; GFX10W64-NEXT:    s_mov_b64 s[0:1], exec
1753; GFX10W64-NEXT:    s_mov_b32 s2, 0
1754; GFX10W64-NEXT:    ; implicit-def: $vgpr1
1755; GFX10W64-NEXT:  .LBB6_1: ; %ComputeLoop
1756; GFX10W64-NEXT:    ; =>This Inner Loop Header: Depth=1
1757; GFX10W64-NEXT:    s_ff1_i32_b64 s3, s[0:1]
1758; GFX10W64-NEXT:    v_readlane_b32 s8, v0, s3
1759; GFX10W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
1760; GFX10W64-NEXT:    v_writelane_b32 v1, s2, s3
1761; GFX10W64-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
1762; GFX10W64-NEXT:    s_add_i32 s2, s2, s8
1763; GFX10W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
1764; GFX10W64-NEXT:    s_cbranch_scc1 .LBB6_1
1765; GFX10W64-NEXT:  ; %bb.2: ; %ComputeEnd
1766; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1767; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1768; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1769; GFX10W64-NEXT:    ; implicit-def: $vgpr0
1770; GFX10W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1771; GFX10W64-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
1772; GFX10W64-NEXT:    s_cbranch_execz .LBB6_4
1773; GFX10W64-NEXT:  ; %bb.3:
1774; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
1775; GFX10W64-NEXT:    v_mov_b32_e32 v0, s2
1776; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1777; GFX10W64-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
1778; GFX10W64-NEXT:  .LBB6_4:
1779; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
1780; GFX10W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1781; GFX10W64-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1782; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
1783; GFX10W64-NEXT:    v_readfirstlane_b32 s2, v0
1784; GFX10W64-NEXT:    v_mov_b32_e32 v0, 0
1785; GFX10W64-NEXT:    v_sub_nc_u32_e32 v1, s2, v1
1786; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1787; GFX10W64-NEXT:    global_store_dword v0, v1, s[0:1]
1788; GFX10W64-NEXT:    s_endpgm
1789;
1790; GFX10W32-LABEL: sub_i32_varying_vdata:
1791; GFX10W32:       ; %bb.0: ; %entry
1792; GFX10W32-NEXT:    s_mov_b32 s1, exec_lo
1793; GFX10W32-NEXT:    s_mov_b32 s0, 0
1794; GFX10W32-NEXT:    ; implicit-def: $vgpr1
1795; GFX10W32-NEXT:  .LBB6_1: ; %ComputeLoop
1796; GFX10W32-NEXT:    ; =>This Inner Loop Header: Depth=1
1797; GFX10W32-NEXT:    s_ff1_i32_b32 s2, s1
1798; GFX10W32-NEXT:    v_readlane_b32 s3, v0, s2
1799; GFX10W32-NEXT:    s_lshl_b32 s6, 1, s2
1800; GFX10W32-NEXT:    v_writelane_b32 v1, s0, s2
1801; GFX10W32-NEXT:    s_andn2_b32 s1, s1, s6
1802; GFX10W32-NEXT:    s_add_i32 s0, s0, s3
1803; GFX10W32-NEXT:    s_cmp_lg_u32 s1, 0
1804; GFX10W32-NEXT:    s_cbranch_scc1 .LBB6_1
1805; GFX10W32-NEXT:  ; %bb.2: ; %ComputeEnd
1806; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1807; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1808; GFX10W32-NEXT:    ; implicit-def: $vgpr0
1809; GFX10W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
1810; GFX10W32-NEXT:    s_xor_b32 s1, exec_lo, s1
1811; GFX10W32-NEXT:    s_cbranch_execz .LBB6_4
1812; GFX10W32-NEXT:  ; %bb.3:
1813; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
1814; GFX10W32-NEXT:    v_mov_b32_e32 v0, s0
1815; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1816; GFX10W32-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
1817; GFX10W32-NEXT:  .LBB6_4:
1818; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
1819; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
1820; GFX10W32-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1821; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
1822; GFX10W32-NEXT:    v_readfirstlane_b32 s2, v0
1823; GFX10W32-NEXT:    v_mov_b32_e32 v0, 0
1824; GFX10W32-NEXT:    v_sub_nc_u32_e32 v1, s2, v1
1825; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1826; GFX10W32-NEXT:    global_store_dword v0, v1, s[0:1]
1827; GFX10W32-NEXT:    s_endpgm
1828;
1829; GFX11W64-LABEL: sub_i32_varying_vdata:
1830; GFX11W64:       ; %bb.0: ; %entry
1831; GFX11W64-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
1832; GFX11W64-NEXT:    s_mov_b64 s[0:1], exec
1833; GFX11W64-NEXT:    s_mov_b32 s2, 0
1834; GFX11W64-NEXT:    ; implicit-def: $vgpr0
1835; GFX11W64-NEXT:  .LBB6_1: ; %ComputeLoop
1836; GFX11W64-NEXT:    ; =>This Inner Loop Header: Depth=1
1837; GFX11W64-NEXT:    s_ctz_i32_b64 s3, s[0:1]
1838; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
1839; GFX11W64-NEXT:    v_readlane_b32 s8, v1, s3
1840; GFX11W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
1841; GFX11W64-NEXT:    v_writelane_b32 v0, s2, s3
1842; GFX11W64-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
1843; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1844; GFX11W64-NEXT:    s_add_i32 s2, s2, s8
1845; GFX11W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
1846; GFX11W64-NEXT:    s_cbranch_scc1 .LBB6_1
1847; GFX11W64-NEXT:  ; %bb.2: ; %ComputeEnd
1848; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
1849; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1850; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
1851; GFX11W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
1852; GFX11W64-NEXT:    ; implicit-def: $vgpr1
1853; GFX11W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1854; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1855; GFX11W64-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
1856; GFX11W64-NEXT:    s_cbranch_execz .LBB6_4
1857; GFX11W64-NEXT:  ; %bb.3:
1858; GFX11W64-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
1859; GFX11W64-NEXT:    v_mov_b32_e32 v1, s2
1860; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
1861; GFX11W64-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
1862; GFX11W64-NEXT:  .LBB6_4:
1863; GFX11W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1864; GFX11W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1865; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
1866; GFX11W64-NEXT:    v_readfirstlane_b32 s2, v1
1867; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
1868; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1869; GFX11W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1870; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
1871; GFX11W64-NEXT:    global_store_b32 v1, v0, s[0:1]
1872; GFX11W64-NEXT:    s_endpgm
1873;
1874; GFX11W32-LABEL: sub_i32_varying_vdata:
1875; GFX11W32:       ; %bb.0: ; %entry
1876; GFX11W32-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
1877; GFX11W32-NEXT:    s_mov_b32 s1, exec_lo
1878; GFX11W32-NEXT:    s_mov_b32 s0, 0
1879; GFX11W32-NEXT:    ; implicit-def: $vgpr0
1880; GFX11W32-NEXT:  .LBB6_1: ; %ComputeLoop
1881; GFX11W32-NEXT:    ; =>This Inner Loop Header: Depth=1
1882; GFX11W32-NEXT:    s_ctz_i32_b32 s2, s1
1883; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
1884; GFX11W32-NEXT:    v_readlane_b32 s3, v1, s2
1885; GFX11W32-NEXT:    s_lshl_b32 s6, 1, s2
1886; GFX11W32-NEXT:    v_writelane_b32 v0, s0, s2
1887; GFX11W32-NEXT:    s_and_not1_b32 s1, s1, s6
1888; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1889; GFX11W32-NEXT:    s_add_i32 s0, s0, s3
1890; GFX11W32-NEXT:    s_cmp_lg_u32 s1, 0
1891; GFX11W32-NEXT:    s_cbranch_scc1 .LBB6_1
1892; GFX11W32-NEXT:  ; %bb.2: ; %ComputeEnd
1893; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
1894; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
1895; GFX11W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
1896; GFX11W32-NEXT:    ; implicit-def: $vgpr1
1897; GFX11W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
1898; GFX11W32-NEXT:    s_xor_b32 s1, exec_lo, s1
1899; GFX11W32-NEXT:    s_cbranch_execz .LBB6_4
1900; GFX11W32-NEXT:  ; %bb.3:
1901; GFX11W32-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
1902; GFX11W32-NEXT:    v_mov_b32_e32 v1, s0
1903; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
1904; GFX11W32-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
1905; GFX11W32-NEXT:  .LBB6_4:
1906; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
1907; GFX11W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1908; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
1909; GFX11W32-NEXT:    v_readfirstlane_b32 s2, v1
1910; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
1911; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1912; GFX11W32-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1913; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
1914; GFX11W32-NEXT:    global_store_b32 v1, v0, s[0:1]
1915; GFX11W32-NEXT:    s_endpgm
1916;
1917; GFX12W64-LABEL: sub_i32_varying_vdata:
1918; GFX12W64:       ; %bb.0: ; %entry
1919; GFX12W64-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
1920; GFX12W64-NEXT:    s_mov_b64 s[0:1], exec
1921; GFX12W64-NEXT:    s_mov_b32 s2, 0
1922; GFX12W64-NEXT:    ; implicit-def: $vgpr0
1923; GFX12W64-NEXT:  .LBB6_1: ; %ComputeLoop
1924; GFX12W64-NEXT:    ; =>This Inner Loop Header: Depth=1
1925; GFX12W64-NEXT:    s_ctz_i32_b64 s3, s[0:1]
1926; GFX12W64-NEXT:    s_wait_alu 0xfffe
1927; GFX12W64-NEXT:    v_readlane_b32 s8, v1, s3
1928; GFX12W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
1929; GFX12W64-NEXT:    v_writelane_b32 v0, s2, s3
1930; GFX12W64-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
1931; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1932; GFX12W64-NEXT:    s_add_co_i32 s2, s2, s8
1933; GFX12W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
1934; GFX12W64-NEXT:    s_cbranch_scc1 .LBB6_1
1935; GFX12W64-NEXT:  ; %bb.2: ; %ComputeEnd
1936; GFX12W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
1937; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1938; GFX12W64-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
1939; GFX12W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
1940; GFX12W64-NEXT:    ; implicit-def: $vgpr1
1941; GFX12W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1942; GFX12W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1943; GFX12W64-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
1944; GFX12W64-NEXT:    s_cbranch_execz .LBB6_4
1945; GFX12W64-NEXT:  ; %bb.3:
1946; GFX12W64-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
1947; GFX12W64-NEXT:    s_wait_alu 0xfffe
1948; GFX12W64-NEXT:    v_mov_b32_e32 v1, s2
1949; GFX12W64-NEXT:    s_wait_kmcnt 0x0
1950; GFX12W64-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
1951; GFX12W64-NEXT:  .LBB6_4:
1952; GFX12W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1953; GFX12W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1954; GFX12W64-NEXT:    s_wait_loadcnt 0x0
1955; GFX12W64-NEXT:    v_readfirstlane_b32 s2, v1
1956; GFX12W64-NEXT:    v_mov_b32_e32 v1, 0
1957; GFX12W64-NEXT:    s_wait_alu 0xfffe
1958; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1959; GFX12W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1960; GFX12W64-NEXT:    s_wait_kmcnt 0x0
1961; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
1962; GFX12W64-NEXT:    s_endpgm
1963;
1964; GFX12W32-LABEL: sub_i32_varying_vdata:
1965; GFX12W32:       ; %bb.0: ; %entry
1966; GFX12W32-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
1967; GFX12W32-NEXT:    s_mov_b32 s1, exec_lo
1968; GFX12W32-NEXT:    s_mov_b32 s0, 0
1969; GFX12W32-NEXT:    ; implicit-def: $vgpr0
1970; GFX12W32-NEXT:  .LBB6_1: ; %ComputeLoop
1971; GFX12W32-NEXT:    ; =>This Inner Loop Header: Depth=1
1972; GFX12W32-NEXT:    s_wait_alu 0xfffe
1973; GFX12W32-NEXT:    s_ctz_i32_b32 s2, s1
1974; GFX12W32-NEXT:    s_wait_alu 0xfffe
1975; GFX12W32-NEXT:    v_readlane_b32 s3, v1, s2
1976; GFX12W32-NEXT:    s_lshl_b32 s6, 1, s2
1977; GFX12W32-NEXT:    v_writelane_b32 v0, s0, s2
1978; GFX12W32-NEXT:    s_and_not1_b32 s1, s1, s6
1979; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1980; GFX12W32-NEXT:    s_add_co_i32 s0, s0, s3
1981; GFX12W32-NEXT:    s_wait_alu 0xfffe
1982; GFX12W32-NEXT:    s_cmp_lg_u32 s1, 0
1983; GFX12W32-NEXT:    s_cbranch_scc1 .LBB6_1
1984; GFX12W32-NEXT:  ; %bb.2: ; %ComputeEnd
1985; GFX12W32-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
1986; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1987; GFX12W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
1988; GFX12W32-NEXT:    ; implicit-def: $vgpr1
1989; GFX12W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
1990; GFX12W32-NEXT:    s_wait_alu 0xfffe
1991; GFX12W32-NEXT:    s_xor_b32 s1, exec_lo, s1
1992; GFX12W32-NEXT:    s_cbranch_execz .LBB6_4
1993; GFX12W32-NEXT:  ; %bb.3:
1994; GFX12W32-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
1995; GFX12W32-NEXT:    v_mov_b32_e32 v1, s0
1996; GFX12W32-NEXT:    s_wait_kmcnt 0x0
1997; GFX12W32-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
1998; GFX12W32-NEXT:  .LBB6_4:
1999; GFX12W32-NEXT:    s_wait_alu 0xfffe
2000; GFX12W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
2001; GFX12W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2002; GFX12W32-NEXT:    s_wait_loadcnt 0x0
2003; GFX12W32-NEXT:    v_readfirstlane_b32 s2, v1
2004; GFX12W32-NEXT:    v_mov_b32_e32 v1, 0
2005; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2006; GFX12W32-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
2007; GFX12W32-NEXT:    s_wait_kmcnt 0x0
2008; GFX12W32-NEXT:    global_store_b32 v1, v0, s[0:1]
2009; GFX12W32-NEXT:    s_endpgm
2010entry:
  ; The atomic's data operand is the workitem id; the three trailing i32 operands are all 0.
2011  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2012  %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.sub(i32 %lane, ptr addrspace(8) %inout, i32 0, i32 0, i32 0)
2013  store i32 %old, ptr addrspace(1) %out
2014  ret void
2015}
2016
2017define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
2018; GFX6-LABEL: sub_i32_varying_offset:
2019; GFX6:       ; %bb.0: ; %entry
2020; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xd
2021; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
2022; GFX6-NEXT:    v_mov_b32_e32 v1, 1
2023; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2024; GFX6-NEXT:    buffer_atomic_sub v1, v0, s[0:3], 0 offen glc
2025; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2026; GFX6-NEXT:    s_mov_b32 s6, -1
2027; GFX6-NEXT:    s_waitcnt vmcnt(0)
2028; GFX6-NEXT:    buffer_store_dword v1, off, s[4:7], 0
2029; GFX6-NEXT:    s_endpgm
2030;
2031; GFX8-LABEL: sub_i32_varying_offset:
2032; GFX8:       ; %bb.0: ; %entry
2033; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
2034; GFX8-NEXT:    v_mov_b32_e32 v2, 1
2035; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2036; GFX8-NEXT:    buffer_atomic_sub v2, v0, s[0:3], 0 offen glc
2037; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2038; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2039; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2040; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2041; GFX8-NEXT:    s_waitcnt vmcnt(0)
2042; GFX8-NEXT:    flat_store_dword v[0:1], v2
2043; GFX8-NEXT:    s_endpgm
2044;
2045; GFX9-LABEL: sub_i32_varying_offset:
2046; GFX9:       ; %bb.0: ; %entry
2047; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
2048; GFX9-NEXT:    v_mov_b32_e32 v1, 1
2049; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2050; GFX9-NEXT:    buffer_atomic_sub v1, v0, s[0:3], 0 offen glc
2051; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2052; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2053; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2054; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2055; GFX9-NEXT:    s_endpgm
2056;
2057; GFX10-LABEL: sub_i32_varying_offset:
2058; GFX10:       ; %bb.0: ; %entry
2059; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
2060; GFX10-NEXT:    v_mov_b32_e32 v1, 1
2061; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2062; GFX10-NEXT:    buffer_atomic_sub v1, v0, s[0:3], 0 offen glc
2063; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
2064; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2065; GFX10-NEXT:    v_mov_b32_e32 v0, 0
2066; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2067; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
2068; GFX10-NEXT:    s_endpgm
2069;
2070; GFX11W64-LABEL: sub_i32_varying_offset:
2071; GFX11W64:       ; %bb.0: ; %entry
2072; GFX11W64-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
2073; GFX11W64-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2074; GFX11W64-NEXT:    v_mov_b32_e32 v1, 1
2075; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
2076; GFX11W64-NEXT:    buffer_atomic_sub_u32 v1, v0, s[0:3], 0 offen glc
2077; GFX11W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2078; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
2079; GFX11W64-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2080; GFX11W64-NEXT:    global_store_b32 v0, v1, s[0:1]
2081; GFX11W64-NEXT:    s_endpgm
2082;
2083; GFX11W32-LABEL: sub_i32_varying_offset:
2084; GFX11W32:       ; %bb.0: ; %entry
2085; GFX11W32-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
2086; GFX11W32-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
2087; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
2088; GFX11W32-NEXT:    buffer_atomic_sub_u32 v1, v0, s[0:3], 0 offen glc
2089; GFX11W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2090; GFX11W32-NEXT:    v_mov_b32_e32 v0, 0
2091; GFX11W32-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2092; GFX11W32-NEXT:    global_store_b32 v0, v1, s[0:1]
2093; GFX11W32-NEXT:    s_endpgm
2094;
2095; GFX12W64-LABEL: sub_i32_varying_offset:
2096; GFX12W64:       ; %bb.0: ; %entry
2097; GFX12W64-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
2098; GFX12W64-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2099; GFX12W64-NEXT:    v_mov_b32_e32 v1, 1
2100; GFX12W64-NEXT:    s_wait_kmcnt 0x0
2101; GFX12W64-NEXT:    buffer_atomic_sub_u32 v1, v0, s[0:3], null offen th:TH_ATOMIC_RETURN
2102; GFX12W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2103; GFX12W64-NEXT:    v_mov_b32_e32 v0, 0
2104; GFX12W64-NEXT:    s_wait_loadcnt 0x0
2105; GFX12W64-NEXT:    s_wait_kmcnt 0x0
2106; GFX12W64-NEXT:    global_store_b32 v0, v1, s[0:1]
2107; GFX12W64-NEXT:    s_endpgm
2108;
2109; GFX12W32-LABEL: sub_i32_varying_offset:
2110; GFX12W32:       ; %bb.0: ; %entry
2111; GFX12W32-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
2112; GFX12W32-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
2113; GFX12W32-NEXT:    s_wait_kmcnt 0x0
2114; GFX12W32-NEXT:    buffer_atomic_sub_u32 v1, v0, s[0:3], null offen th:TH_ATOMIC_RETURN
2115; GFX12W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2116; GFX12W32-NEXT:    v_mov_b32_e32 v0, 0
2117; GFX12W32-NEXT:    s_wait_loadcnt 0x0
2118; GFX12W32-NEXT:    s_wait_kmcnt 0x0
2119; GFX12W32-NEXT:    global_store_b32 v0, v1, s[0:1]
2120; GFX12W32-NEXT:    s_endpgm
2121entry:
2122  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2123  %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.sub(i32 1, ptr addrspace(8) %inout, i32 %lane, i32 0, i32 0)
2124  store i32 %old, ptr addrspace(1) %out
2125  ret void
2126}
2127;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
2128; GFX11: {{.*}}
2129; GFX12: {{.*}}
2130