xref: /llvm-project/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s
3; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
7; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
8; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s
9; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W64 %s
10; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W32 %s
11
; Declarations of the AMDGPU intrinsics referenced by this test. The trailing
; `i32 immarg` operand of each buffer atomic has to be an immediate. The
; add_i32_constant and add_i32_uniform kernels below call the raw.ptr add
; variant; the other declarations are presumably used by kernels further down
; the file.
12declare i32 @llvm.amdgcn.workitem.id.x()
13declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32 immarg)
14declare i32 @llvm.amdgcn.struct.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32, i32 immarg)
15declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.sub(i32, ptr addrspace(8), i32, i32, i32 immarg)
16
17; Show what the atomic optimization pass will do for raw buffers.
18
; add_i32_constant - raw buffer atomic add of the constant 5 to %inout; the
; value returned by the atomic is stored to %out.
;
; What the assertions below expect from the optimizer on every target:
;   * a single buffer atomic add, executed only by the lane whose mbcnt
;     result is 0 (the branch around %bb.1 is taken by all other lanes);
;   * the atomic operand is 5 multiplied by the s_bcnt1 population count of
;     the exec mask that was copied at kernel entry;
;   * after the join, each lane stores v_readfirstlane of the atomic result
;     plus 5 times its own mbcnt value (the v_mad_u32_u24).
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
19define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
20; GFX6-LABEL: add_i32_constant:
21; GFX6:       ; %bb.0: ; %entry
22; GFX6-NEXT:    s_mov_b64 s[2:3], exec
23; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
24; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
25; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
26; GFX6-NEXT:    ; implicit-def: $vgpr1
27; GFX6-NEXT:    s_and_saveexec_b64 s[0:1], vcc
28; GFX6-NEXT:    s_cbranch_execz .LBB0_2
29; GFX6-NEXT:  ; %bb.1:
30; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
31; GFX6-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
32; GFX6-NEXT:    s_mul_i32 s2, s2, 5
33; GFX6-NEXT:    v_mov_b32_e32 v1, s2
34; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
35; GFX6-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
36; GFX6-NEXT:  .LBB0_2:
37; GFX6-NEXT:    s_or_b64 exec, exec, s[0:1]
38; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
39; GFX6-NEXT:    s_mov_b32 s3, 0xf000
40; GFX6-NEXT:    s_mov_b32 s2, -1
41; GFX6-NEXT:    s_waitcnt vmcnt(0)
42; GFX6-NEXT:    v_readfirstlane_b32 s4, v1
43; GFX6-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
44; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
45; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
46; GFX6-NEXT:    s_endpgm
47;
48; GFX8-LABEL: add_i32_constant:
49; GFX8:       ; %bb.0: ; %entry
50; GFX8-NEXT:    s_mov_b64 s[2:3], exec
51; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
52; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
53; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
54; GFX8-NEXT:    ; implicit-def: $vgpr1
55; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
56; GFX8-NEXT:    s_cbranch_execz .LBB0_2
57; GFX8-NEXT:  ; %bb.1:
58; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
59; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
60; GFX8-NEXT:    s_mul_i32 s2, s2, 5
61; GFX8-NEXT:    v_mov_b32_e32 v1, s2
62; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
63; GFX8-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
64; GFX8-NEXT:  .LBB0_2:
65; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
66; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
67; GFX8-NEXT:    s_waitcnt vmcnt(0)
68; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
69; GFX8-NEXT:    v_mad_u32_u24 v2, v0, 5, s2
70; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
71; GFX8-NEXT:    v_mov_b32_e32 v0, s0
72; GFX8-NEXT:    v_mov_b32_e32 v1, s1
73; GFX8-NEXT:    flat_store_dword v[0:1], v2
74; GFX8-NEXT:    s_endpgm
75;
76; GFX9-LABEL: add_i32_constant:
77; GFX9:       ; %bb.0: ; %entry
78; GFX9-NEXT:    s_mov_b64 s[2:3], exec
79; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
80; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
81; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
82; GFX9-NEXT:    ; implicit-def: $vgpr1
83; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
84; GFX9-NEXT:    s_cbranch_execz .LBB0_2
85; GFX9-NEXT:  ; %bb.1:
86; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
87; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
88; GFX9-NEXT:    s_mul_i32 s2, s2, 5
89; GFX9-NEXT:    v_mov_b32_e32 v1, s2
90; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
91; GFX9-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
92; GFX9-NEXT:  .LBB0_2:
93; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
94; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
95; GFX9-NEXT:    s_waitcnt vmcnt(0)
96; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
97; GFX9-NEXT:    v_mov_b32_e32 v2, 0
98; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
99; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
100; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
101; GFX9-NEXT:    s_endpgm
102;
103; GFX10W64-LABEL: add_i32_constant:
104; GFX10W64:       ; %bb.0: ; %entry
105; GFX10W64-NEXT:    s_mov_b64 s[2:3], exec
106; GFX10W64-NEXT:    ; implicit-def: $vgpr1
107; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
108; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
109; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
110; GFX10W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
111; GFX10W64-NEXT:    s_cbranch_execz .LBB0_2
112; GFX10W64-NEXT:  ; %bb.1:
113; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
114; GFX10W64-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
115; GFX10W64-NEXT:    s_mul_i32 s2, s2, 5
116; GFX10W64-NEXT:    v_mov_b32_e32 v1, s2
117; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
118; GFX10W64-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
119; GFX10W64-NEXT:  .LBB0_2:
120; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
121; GFX10W64-NEXT:    s_or_b64 exec, exec, s[0:1]
122; GFX10W64-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
123; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
124; GFX10W64-NEXT:    v_readfirstlane_b32 s2, v1
125; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
126; GFX10W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
127; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
128; GFX10W64-NEXT:    global_store_dword v1, v0, s[0:1]
129; GFX10W64-NEXT:    s_endpgm
130;
131; GFX10W32-LABEL: add_i32_constant:
132; GFX10W32:       ; %bb.0: ; %entry
133; GFX10W32-NEXT:    s_mov_b32 s1, exec_lo
134; GFX10W32-NEXT:    ; implicit-def: $vgpr1
135; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s1, 0
136; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
137; GFX10W32-NEXT:    s_and_saveexec_b32 s0, vcc_lo
138; GFX10W32-NEXT:    s_cbranch_execz .LBB0_2
139; GFX10W32-NEXT:  ; %bb.1:
140; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
141; GFX10W32-NEXT:    s_bcnt1_i32_b32 s1, s1
142; GFX10W32-NEXT:    s_mul_i32 s1, s1, 5
143; GFX10W32-NEXT:    v_mov_b32_e32 v1, s1
144; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
145; GFX10W32-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
146; GFX10W32-NEXT:  .LBB0_2:
147; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
148; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
149; GFX10W32-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
150; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
151; GFX10W32-NEXT:    v_readfirstlane_b32 s2, v1
152; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
153; GFX10W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
154; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
155; GFX10W32-NEXT:    global_store_dword v1, v0, s[0:1]
156; GFX10W32-NEXT:    s_endpgm
157;
158; GFX11W64-LABEL: add_i32_constant:
159; GFX11W64:       ; %bb.0: ; %entry
160; GFX11W64-NEXT:    s_mov_b64 s[2:3], exec
161; GFX11W64-NEXT:    s_mov_b64 s[0:1], exec
162; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
163; GFX11W64-NEXT:    ; implicit-def: $vgpr1
164; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
165; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
166; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
167; GFX11W64-NEXT:    s_cbranch_execz .LBB0_2
168; GFX11W64-NEXT:  ; %bb.1:
169; GFX11W64-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
170; GFX11W64-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
171; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
172; GFX11W64-NEXT:    s_mul_i32 s2, s2, 5
173; GFX11W64-NEXT:    v_mov_b32_e32 v1, s2
174; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
175; GFX11W64-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
176; GFX11W64-NEXT:  .LBB0_2:
177; GFX11W64-NEXT:    s_or_b64 exec, exec, s[0:1]
178; GFX11W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
179; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
180; GFX11W64-NEXT:    v_readfirstlane_b32 s2, v1
181; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
182; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
183; GFX11W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
184; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
185; GFX11W64-NEXT:    global_store_b32 v1, v0, s[0:1]
186; GFX11W64-NEXT:    s_endpgm
187;
188; GFX11W32-LABEL: add_i32_constant:
189; GFX11W32:       ; %bb.0: ; %entry
190; GFX11W32-NEXT:    s_mov_b32 s1, exec_lo
191; GFX11W32-NEXT:    s_mov_b32 s0, exec_lo
192; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s1, 0
193; GFX11W32-NEXT:    ; implicit-def: $vgpr1
194; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
195; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
196; GFX11W32-NEXT:    s_cbranch_execz .LBB0_2
197; GFX11W32-NEXT:  ; %bb.1:
198; GFX11W32-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
199; GFX11W32-NEXT:    s_bcnt1_i32_b32 s1, s1
200; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
201; GFX11W32-NEXT:    s_mul_i32 s1, s1, 5
202; GFX11W32-NEXT:    v_mov_b32_e32 v1, s1
203; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
204; GFX11W32-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
205; GFX11W32-NEXT:  .LBB0_2:
206; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
207; GFX11W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
208; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
209; GFX11W32-NEXT:    v_readfirstlane_b32 s2, v1
210; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
211; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
212; GFX11W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
213; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
214; GFX11W32-NEXT:    global_store_b32 v1, v0, s[0:1]
215; GFX11W32-NEXT:    s_endpgm
216;
217; GFX12W64-LABEL: add_i32_constant:
218; GFX12W64:       ; %bb.0: ; %entry
219; GFX12W64-NEXT:    s_mov_b64 s[2:3], exec
220; GFX12W64-NEXT:    s_mov_b64 s[0:1], exec
221; GFX12W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
222; GFX12W64-NEXT:    ; implicit-def: $vgpr1
223; GFX12W64-NEXT:    s_wait_alu 0xfffe
224; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
225; GFX12W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
226; GFX12W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
227; GFX12W64-NEXT:    s_cbranch_execz .LBB0_2
228; GFX12W64-NEXT:  ; %bb.1:
229; GFX12W64-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
230; GFX12W64-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
231; GFX12W64-NEXT:    s_wait_alu 0xfffe
232; GFX12W64-NEXT:    s_mul_i32 s2, s2, 5
233; GFX12W64-NEXT:    s_wait_alu 0xfffe
234; GFX12W64-NEXT:    v_mov_b32_e32 v1, s2
235; GFX12W64-NEXT:    s_wait_kmcnt 0x0
236; GFX12W64-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
237; GFX12W64-NEXT:  .LBB0_2:
238; GFX12W64-NEXT:    s_or_b64 exec, exec, s[0:1]
239; GFX12W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
240; GFX12W64-NEXT:    s_wait_loadcnt 0x0
241; GFX12W64-NEXT:    v_readfirstlane_b32 s2, v1
242; GFX12W64-NEXT:    v_mov_b32_e32 v1, 0
243; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
244; GFX12W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
245; GFX12W64-NEXT:    s_wait_kmcnt 0x0
246; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
247; GFX12W64-NEXT:    s_endpgm
248;
249; GFX12W32-LABEL: add_i32_constant:
250; GFX12W32:       ; %bb.0: ; %entry
251; GFX12W32-NEXT:    s_mov_b32 s1, exec_lo
252; GFX12W32-NEXT:    s_mov_b32 s0, exec_lo
253; GFX12W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s1, 0
254; GFX12W32-NEXT:    ; implicit-def: $vgpr1
255; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
256; GFX12W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
257; GFX12W32-NEXT:    s_cbranch_execz .LBB0_2
258; GFX12W32-NEXT:  ; %bb.1:
259; GFX12W32-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
260; GFX12W32-NEXT:    s_wait_alu 0xfffe
261; GFX12W32-NEXT:    s_bcnt1_i32_b32 s1, s1
262; GFX12W32-NEXT:    s_wait_alu 0xfffe
263; GFX12W32-NEXT:    s_mul_i32 s1, s1, 5
264; GFX12W32-NEXT:    s_wait_alu 0xfffe
265; GFX12W32-NEXT:    v_mov_b32_e32 v1, s1
266; GFX12W32-NEXT:    s_wait_kmcnt 0x0
267; GFX12W32-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
268; GFX12W32-NEXT:  .LBB0_2:
269; GFX12W32-NEXT:    s_wait_alu 0xfffe
270; GFX12W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
271; GFX12W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
272; GFX12W32-NEXT:    s_wait_loadcnt 0x0
273; GFX12W32-NEXT:    v_readfirstlane_b32 s2, v1
274; GFX12W32-NEXT:    v_mov_b32_e32 v1, 0
275; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
276; GFX12W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
277; GFX12W32-NEXT:    s_wait_kmcnt 0x0
278; GFX12W32-NEXT:    global_store_b32 v1, v0, s[0:1]
279; GFX12W32-NEXT:    s_endpgm
280entry:
  ; The added value is the literal 5 and every operand after the buffer
  ; pointer is 0. %old, the value the atomic returns, is what reaches %out.
281  %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32 5, ptr addrspace(8) %inout, i32 0, i32 0, i32 0)
282  store i32 %old, ptr addrspace(1) %out
283  ret void
284}
285
; add_i32_uniform - same shape as add_i32_constant, but the added value is the
; kernel argument %additive instead of a literal.
;
; What the assertions below expect from the optimizer on every target:
;   * %additive is fetched with a scalar load (s_load_dword / s_load_b32);
;   * a single buffer atomic add, executed only by the lane whose mbcnt
;     result is 0, with operand %additive multiplied (s_mul_i32) by the
;     s_bcnt1 population count of the exec mask copied at kernel entry;
;   * after the join, each lane stores v_readfirstlane of the atomic result
;     plus %additive times its own mbcnt value, emitted either as
;     v_mul_lo_u32 followed by an add or as one 64-bit v_mad, depending on
;     the target.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
286define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(8) %inout, i32 %additive) {
287; GFX6-LABEL: add_i32_uniform:
288; GFX6:       ; %bb.0: ; %entry
289; GFX6-NEXT:    s_mov_b64 s[2:3], exec
290; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x11
291; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
292; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
293; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
294; GFX6-NEXT:    ; implicit-def: $vgpr1
295; GFX6-NEXT:    s_and_saveexec_b64 s[0:1], vcc
296; GFX6-NEXT:    s_cbranch_execz .LBB1_2
297; GFX6-NEXT:  ; %bb.1:
298; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
299; GFX6-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
300; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
301; GFX6-NEXT:    s_mul_i32 s2, s6, s2
302; GFX6-NEXT:    v_mov_b32_e32 v1, s2
303; GFX6-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
304; GFX6-NEXT:  .LBB1_2:
305; GFX6-NEXT:    s_or_b64 exec, exec, s[0:1]
306; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
307; GFX6-NEXT:    s_mov_b32 s3, 0xf000
308; GFX6-NEXT:    s_mov_b32 s2, -1
309; GFX6-NEXT:    s_waitcnt vmcnt(0)
310; GFX6-NEXT:    v_readfirstlane_b32 s4, v1
311; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
312; GFX6-NEXT:    v_mul_lo_u32 v0, s6, v0
313; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
314; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
315; GFX6-NEXT:    s_endpgm
316;
317; GFX8-LABEL: add_i32_uniform:
318; GFX8:       ; %bb.0: ; %entry
319; GFX8-NEXT:    s_load_dword s6, s[4:5], 0x44
320; GFX8-NEXT:    s_mov_b64 s[2:3], exec
321; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
322; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
323; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
324; GFX8-NEXT:    ; implicit-def: $vgpr1
325; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
326; GFX8-NEXT:    s_cbranch_execz .LBB1_2
327; GFX8-NEXT:  ; %bb.1:
328; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
329; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
330; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
331; GFX8-NEXT:    s_mul_i32 s2, s6, s2
332; GFX8-NEXT:    v_mov_b32_e32 v1, s2
333; GFX8-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
334; GFX8-NEXT:  .LBB1_2:
335; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
336; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
337; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
338; GFX8-NEXT:    v_mul_lo_u32 v0, s6, v0
339; GFX8-NEXT:    s_waitcnt vmcnt(0)
340; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
341; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s2, v0
342; GFX8-NEXT:    v_mov_b32_e32 v0, s0
343; GFX8-NEXT:    v_mov_b32_e32 v1, s1
344; GFX8-NEXT:    flat_store_dword v[0:1], v2
345; GFX8-NEXT:    s_endpgm
346;
347; GFX9-LABEL: add_i32_uniform:
348; GFX9:       ; %bb.0: ; %entry
349; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x44
350; GFX9-NEXT:    s_mov_b64 s[2:3], exec
351; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
352; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
353; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
354; GFX9-NEXT:    ; implicit-def: $vgpr1
355; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
356; GFX9-NEXT:    s_cbranch_execz .LBB1_2
357; GFX9-NEXT:  ; %bb.1:
358; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
359; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
360; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
361; GFX9-NEXT:    s_mul_i32 s2, s6, s2
362; GFX9-NEXT:    v_mov_b32_e32 v1, s2
363; GFX9-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
364; GFX9-NEXT:  .LBB1_2:
365; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
366; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
367; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
368; GFX9-NEXT:    v_mul_lo_u32 v0, s6, v0
369; GFX9-NEXT:    s_waitcnt vmcnt(0)
370; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
371; GFX9-NEXT:    v_mov_b32_e32 v2, 0
372; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
373; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
374; GFX9-NEXT:    s_endpgm
375;
376; GFX10W64-LABEL: add_i32_uniform:
377; GFX10W64:       ; %bb.0: ; %entry
378; GFX10W64-NEXT:    s_load_dword s6, s[4:5], 0x44
379; GFX10W64-NEXT:    s_mov_b64 s[2:3], exec
380; GFX10W64-NEXT:    ; implicit-def: $vgpr1
381; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
382; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
383; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
384; GFX10W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
385; GFX10W64-NEXT:    s_cbranch_execz .LBB1_2
386; GFX10W64-NEXT:  ; %bb.1:
387; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
388; GFX10W64-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
389; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
390; GFX10W64-NEXT:    s_mul_i32 s2, s6, s2
391; GFX10W64-NEXT:    v_mov_b32_e32 v1, s2
392; GFX10W64-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
393; GFX10W64-NEXT:  .LBB1_2:
394; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
395; GFX10W64-NEXT:    s_or_b64 exec, exec, s[0:1]
396; GFX10W64-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
397; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
398; GFX10W64-NEXT:    v_readfirstlane_b32 s2, v1
399; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
400; GFX10W64-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3]
401; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
402; GFX10W64-NEXT:    global_store_dword v1, v0, s[0:1]
403; GFX10W64-NEXT:    s_endpgm
404;
405; GFX10W32-LABEL: add_i32_uniform:
406; GFX10W32:       ; %bb.0: ; %entry
407; GFX10W32-NEXT:    s_load_dword s0, s[4:5], 0x44
408; GFX10W32-NEXT:    s_mov_b32 s2, exec_lo
409; GFX10W32-NEXT:    ; implicit-def: $vgpr1
410; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
411; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
412; GFX10W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
413; GFX10W32-NEXT:    s_cbranch_execz .LBB1_2
414; GFX10W32-NEXT:  ; %bb.1:
415; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
416; GFX10W32-NEXT:    s_bcnt1_i32_b32 s2, s2
417; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
418; GFX10W32-NEXT:    s_mul_i32 s2, s0, s2
419; GFX10W32-NEXT:    v_mov_b32_e32 v1, s2
420; GFX10W32-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
421; GFX10W32-NEXT:  .LBB1_2:
422; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
423; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
424; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
425; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
426; GFX10W32-NEXT:    s_mov_b32 null, 0
427; GFX10W32-NEXT:    v_readfirstlane_b32 s4, v1
428; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
429; GFX10W32-NEXT:    v_mad_u64_u32 v[0:1], s0, s0, v0, s[4:5]
430; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
431; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
432; GFX10W32-NEXT:    s_endpgm
433;
434; GFX11W64-LABEL: add_i32_uniform:
435; GFX11W64:       ; %bb.0: ; %entry
436; GFX11W64-NEXT:    s_load_b32 s6, s[4:5], 0x44
437; GFX11W64-NEXT:    s_mov_b64 s[2:3], exec
438; GFX11W64-NEXT:    s_mov_b64 s[0:1], exec
439; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
440; GFX11W64-NEXT:    ; implicit-def: $vgpr1
441; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
442; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
443; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
444; GFX11W64-NEXT:    s_cbranch_execz .LBB1_2
445; GFX11W64-NEXT:  ; %bb.1:
446; GFX11W64-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
447; GFX11W64-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
448; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
449; GFX11W64-NEXT:    s_mul_i32 s2, s6, s2
450; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
451; GFX11W64-NEXT:    v_mov_b32_e32 v1, s2
452; GFX11W64-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
453; GFX11W64-NEXT:  .LBB1_2:
454; GFX11W64-NEXT:    s_or_b64 exec, exec, s[0:1]
455; GFX11W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
456; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
457; GFX11W64-NEXT:    v_readfirstlane_b32 s2, v1
458; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
459; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
460; GFX11W64-NEXT:    v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3]
461; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
462; GFX11W64-NEXT:    global_store_b32 v0, v1, s[0:1]
463; GFX11W64-NEXT:    s_endpgm
464;
465; GFX11W32-LABEL: add_i32_uniform:
466; GFX11W32:       ; %bb.0: ; %entry
467; GFX11W32-NEXT:    s_load_b32 s0, s[4:5], 0x44
468; GFX11W32-NEXT:    s_mov_b32 s2, exec_lo
469; GFX11W32-NEXT:    s_mov_b32 s1, exec_lo
470; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
471; GFX11W32-NEXT:    ; implicit-def: $vgpr1
472; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
473; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
474; GFX11W32-NEXT:    s_cbranch_execz .LBB1_2
475; GFX11W32-NEXT:  ; %bb.1:
476; GFX11W32-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
477; GFX11W32-NEXT:    s_bcnt1_i32_b32 s2, s2
478; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
479; GFX11W32-NEXT:    s_mul_i32 s2, s0, s2
480; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
481; GFX11W32-NEXT:    v_mov_b32_e32 v1, s2
482; GFX11W32-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
483; GFX11W32-NEXT:  .LBB1_2:
484; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
485; GFX11W32-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
486; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
487; GFX11W32-NEXT:    v_readfirstlane_b32 s4, v1
488; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
489; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
490; GFX11W32-NEXT:    v_mad_u64_u32 v[1:2], null, s0, v0, s[4:5]
491; GFX11W32-NEXT:    v_mov_b32_e32 v0, 0
492; GFX11W32-NEXT:    global_store_b32 v0, v1, s[2:3]
493; GFX11W32-NEXT:    s_endpgm
494;
495; GFX12W64-LABEL: add_i32_uniform:
496; GFX12W64:       ; %bb.0: ; %entry
497; GFX12W64-NEXT:    s_load_b32 s6, s[4:5], 0x44
498; GFX12W64-NEXT:    s_mov_b64 s[2:3], exec
499; GFX12W64-NEXT:    s_mov_b64 s[0:1], exec
500; GFX12W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
501; GFX12W64-NEXT:    ; implicit-def: $vgpr1
502; GFX12W64-NEXT:    s_wait_alu 0xfffe
503; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
504; GFX12W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
505; GFX12W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
506; GFX12W64-NEXT:    s_cbranch_execz .LBB1_2
507; GFX12W64-NEXT:  ; %bb.1:
508; GFX12W64-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
509; GFX12W64-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
510; GFX12W64-NEXT:    s_wait_kmcnt 0x0
511; GFX12W64-NEXT:    s_wait_alu 0xfffe
512; GFX12W64-NEXT:    s_mul_i32 s2, s6, s2
513; GFX12W64-NEXT:    s_wait_alu 0xfffe
514; GFX12W64-NEXT:    v_mov_b32_e32 v1, s2
515; GFX12W64-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
516; GFX12W64-NEXT:  .LBB1_2:
517; GFX12W64-NEXT:    s_or_b64 exec, exec, s[0:1]
518; GFX12W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
519; GFX12W64-NEXT:    s_wait_loadcnt 0x0
520; GFX12W64-NEXT:    v_readfirstlane_b32 s2, v1
521; GFX12W64-NEXT:    s_wait_kmcnt 0x0
522; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
523; GFX12W64-NEXT:    v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3]
524; GFX12W64-NEXT:    v_mov_b32_e32 v1, 0
525; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
526; GFX12W64-NEXT:    s_endpgm
527;
528; GFX12W32-LABEL: add_i32_uniform:
529; GFX12W32:       ; %bb.0: ; %entry
530; GFX12W32-NEXT:    s_load_b32 s0, s[4:5], 0x44
531; GFX12W32-NEXT:    s_mov_b32 s2, exec_lo
532; GFX12W32-NEXT:    s_mov_b32 s1, exec_lo
533; GFX12W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
534; GFX12W32-NEXT:    ; implicit-def: $vgpr1
535; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
536; GFX12W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
537; GFX12W32-NEXT:    s_cbranch_execz .LBB1_2
538; GFX12W32-NEXT:  ; %bb.1:
539; GFX12W32-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
540; GFX12W32-NEXT:    s_wait_alu 0xfffe
541; GFX12W32-NEXT:    s_bcnt1_i32_b32 s2, s2
542; GFX12W32-NEXT:    s_wait_kmcnt 0x0
543; GFX12W32-NEXT:    s_wait_alu 0xfffe
544; GFX12W32-NEXT:    s_mul_i32 s2, s0, s2
545; GFX12W32-NEXT:    s_wait_alu 0xfffe
546; GFX12W32-NEXT:    v_mov_b32_e32 v1, s2
547; GFX12W32-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
548; GFX12W32-NEXT:  .LBB1_2:
549; GFX12W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
550; GFX12W32-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
551; GFX12W32-NEXT:    s_wait_loadcnt 0x0
552; GFX12W32-NEXT:    v_readfirstlane_b32 s4, v1
553; GFX12W32-NEXT:    s_wait_kmcnt 0x0
554; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
555; GFX12W32-NEXT:    v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5]
556; GFX12W32-NEXT:    v_mov_b32_e32 v1, 0
557; GFX12W32-NEXT:    global_store_b32 v1, v0, s[2:3]
558; GFX12W32-NEXT:    s_endpgm
559entry:
  ; The added value is the kernel argument %additive and every operand after
  ; the buffer pointer is 0. %old, the value the atomic returns, is what
  ; reaches %out.
560  %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32 %additive, ptr addrspace(8) %inout, i32 0, i32 0, i32 0)
561  store i32 %old, ptr addrspace(1) %out
562  ret void
563}
564
565define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
566; GFX6-LABEL: add_i32_varying_vdata:
567; GFX6:       ; %bb.0: ; %entry
568; GFX6-NEXT:    s_mov_b64 s[0:1], exec
569; GFX6-NEXT:    s_mov_b32 s2, 0
570; GFX6-NEXT:    ; implicit-def: $vgpr1
571; GFX6-NEXT:  .LBB2_1: ; %ComputeLoop
572; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
573; GFX6-NEXT:    s_ff1_i32_b64 s3, s[0:1]
574; GFX6-NEXT:    s_mov_b32 m0, s3
575; GFX6-NEXT:    v_readlane_b32 s8, v0, s3
576; GFX6-NEXT:    v_writelane_b32 v1, s2, m0
577; GFX6-NEXT:    s_lshl_b64 s[6:7], 1, s3
578; GFX6-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
579; GFX6-NEXT:    v_cmp_ne_u64_e64 s[6:7], s[0:1], 0
580; GFX6-NEXT:    s_and_b64 vcc, exec, s[6:7]
581; GFX6-NEXT:    s_add_i32 s2, s2, s8
582; GFX6-NEXT:    s_cbranch_vccnz .LBB2_1
583; GFX6-NEXT:  ; %bb.2: ; %ComputeEnd
584; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
585; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
586; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
587; GFX6-NEXT:    ; implicit-def: $vgpr0
588; GFX6-NEXT:    s_and_saveexec_b64 s[0:1], vcc
589; GFX6-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
590; GFX6-NEXT:    s_cbranch_execz .LBB2_4
591; GFX6-NEXT:  ; %bb.3:
592; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
593; GFX6-NEXT:    v_mov_b32_e32 v0, s2
594; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
595; GFX6-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
596; GFX6-NEXT:  .LBB2_4:
597; GFX6-NEXT:    s_or_b64 exec, exec, s[0:1]
598; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
599; GFX6-NEXT:    s_mov_b32 s3, 0xf000
600; GFX6-NEXT:    s_mov_b32 s2, -1
601; GFX6-NEXT:    s_waitcnt vmcnt(0)
602; GFX6-NEXT:    v_readfirstlane_b32 s4, v0
603; GFX6-NEXT:    s_waitcnt expcnt(0)
604; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v1
605; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
606; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
607; GFX6-NEXT:    s_endpgm
608;
609; GFX8-LABEL: add_i32_varying_vdata:
610; GFX8:       ; %bb.0: ; %entry
611; GFX8-NEXT:    s_mov_b64 s[0:1], exec
612; GFX8-NEXT:    s_mov_b32 s2, 0
613; GFX8-NEXT:    ; implicit-def: $vgpr1
614; GFX8-NEXT:  .LBB2_1: ; %ComputeLoop
615; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
616; GFX8-NEXT:    s_ff1_i32_b64 s3, s[0:1]
617; GFX8-NEXT:    s_mov_b32 m0, s3
618; GFX8-NEXT:    v_readlane_b32 s8, v0, s3
619; GFX8-NEXT:    s_lshl_b64 s[6:7], 1, s3
620; GFX8-NEXT:    v_writelane_b32 v1, s2, m0
621; GFX8-NEXT:    s_add_i32 s2, s2, s8
622; GFX8-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
623; GFX8-NEXT:    s_cmp_lg_u64 s[0:1], 0
624; GFX8-NEXT:    s_cbranch_scc1 .LBB2_1
625; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
626; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
627; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
628; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
629; GFX8-NEXT:    ; implicit-def: $vgpr0
630; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
631; GFX8-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
632; GFX8-NEXT:    s_cbranch_execz .LBB2_4
633; GFX8-NEXT:  ; %bb.3:
634; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
635; GFX8-NEXT:    v_mov_b32_e32 v0, s2
636; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
637; GFX8-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
638; GFX8-NEXT:  .LBB2_4:
639; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
640; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
641; GFX8-NEXT:    s_waitcnt vmcnt(0)
642; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
643; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s2, v1
644; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
645; GFX8-NEXT:    v_mov_b32_e32 v0, s0
646; GFX8-NEXT:    v_mov_b32_e32 v1, s1
647; GFX8-NEXT:    flat_store_dword v[0:1], v2
648; GFX8-NEXT:    s_endpgm
649;
650; GFX9-LABEL: add_i32_varying_vdata:
651; GFX9:       ; %bb.0: ; %entry
652; GFX9-NEXT:    s_mov_b64 s[0:1], exec
653; GFX9-NEXT:    s_mov_b32 s2, 0
654; GFX9-NEXT:    ; implicit-def: $vgpr1
655; GFX9-NEXT:  .LBB2_1: ; %ComputeLoop
656; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
657; GFX9-NEXT:    s_ff1_i32_b64 s3, s[0:1]
658; GFX9-NEXT:    s_mov_b32 m0, s3
659; GFX9-NEXT:    v_readlane_b32 s8, v0, s3
660; GFX9-NEXT:    s_lshl_b64 s[6:7], 1, s3
661; GFX9-NEXT:    v_writelane_b32 v1, s2, m0
662; GFX9-NEXT:    s_add_i32 s2, s2, s8
663; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
664; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
665; GFX9-NEXT:    s_cbranch_scc1 .LBB2_1
666; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
667; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
668; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
669; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
670; GFX9-NEXT:    ; implicit-def: $vgpr0
671; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
672; GFX9-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
673; GFX9-NEXT:    s_cbranch_execz .LBB2_4
674; GFX9-NEXT:  ; %bb.3:
675; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
676; GFX9-NEXT:    v_mov_b32_e32 v0, s2
677; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
678; GFX9-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
679; GFX9-NEXT:  .LBB2_4:
680; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
681; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
682; GFX9-NEXT:    s_waitcnt vmcnt(0)
683; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
684; GFX9-NEXT:    v_mov_b32_e32 v2, 0
685; GFX9-NEXT:    v_add_u32_e32 v0, s2, v1
686; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
687; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
688; GFX9-NEXT:    s_endpgm
689;
690; GFX10W64-LABEL: add_i32_varying_vdata:
691; GFX10W64:       ; %bb.0: ; %entry
692; GFX10W64-NEXT:    s_mov_b64 s[0:1], exec
693; GFX10W64-NEXT:    s_mov_b32 s2, 0
694; GFX10W64-NEXT:    ; implicit-def: $vgpr1
695; GFX10W64-NEXT:  .LBB2_1: ; %ComputeLoop
696; GFX10W64-NEXT:    ; =>This Inner Loop Header: Depth=1
697; GFX10W64-NEXT:    s_ff1_i32_b64 s3, s[0:1]
698; GFX10W64-NEXT:    v_readlane_b32 s8, v0, s3
699; GFX10W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
700; GFX10W64-NEXT:    v_writelane_b32 v1, s2, s3
701; GFX10W64-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
702; GFX10W64-NEXT:    s_add_i32 s2, s2, s8
703; GFX10W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
704; GFX10W64-NEXT:    s_cbranch_scc1 .LBB2_1
705; GFX10W64-NEXT:  ; %bb.2: ; %ComputeEnd
706; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
707; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
708; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
709; GFX10W64-NEXT:    ; implicit-def: $vgpr0
710; GFX10W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
711; GFX10W64-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
712; GFX10W64-NEXT:    s_cbranch_execz .LBB2_4
713; GFX10W64-NEXT:  ; %bb.3:
714; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
715; GFX10W64-NEXT:    v_mov_b32_e32 v0, s2
716; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
717; GFX10W64-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
718; GFX10W64-NEXT:  .LBB2_4:
719; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
720; GFX10W64-NEXT:    s_or_b64 exec, exec, s[0:1]
721; GFX10W64-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
722; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
723; GFX10W64-NEXT:    v_readfirstlane_b32 s2, v0
724; GFX10W64-NEXT:    v_mov_b32_e32 v0, 0
725; GFX10W64-NEXT:    v_add_nc_u32_e32 v1, s2, v1
726; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
727; GFX10W64-NEXT:    global_store_dword v0, v1, s[0:1]
728; GFX10W64-NEXT:    s_endpgm
729;
730; GFX10W32-LABEL: add_i32_varying_vdata:
731; GFX10W32:       ; %bb.0: ; %entry
732; GFX10W32-NEXT:    s_mov_b32 s1, exec_lo
733; GFX10W32-NEXT:    s_mov_b32 s0, 0
734; GFX10W32-NEXT:    ; implicit-def: $vgpr1
735; GFX10W32-NEXT:  .LBB2_1: ; %ComputeLoop
736; GFX10W32-NEXT:    ; =>This Inner Loop Header: Depth=1
737; GFX10W32-NEXT:    s_ff1_i32_b32 s2, s1
738; GFX10W32-NEXT:    v_readlane_b32 s3, v0, s2
739; GFX10W32-NEXT:    s_lshl_b32 s6, 1, s2
740; GFX10W32-NEXT:    v_writelane_b32 v1, s0, s2
741; GFX10W32-NEXT:    s_andn2_b32 s1, s1, s6
742; GFX10W32-NEXT:    s_add_i32 s0, s0, s3
743; GFX10W32-NEXT:    s_cmp_lg_u32 s1, 0
744; GFX10W32-NEXT:    s_cbranch_scc1 .LBB2_1
745; GFX10W32-NEXT:  ; %bb.2: ; %ComputeEnd
746; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
747; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
748; GFX10W32-NEXT:    ; implicit-def: $vgpr0
749; GFX10W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
750; GFX10W32-NEXT:    s_xor_b32 s1, exec_lo, s1
751; GFX10W32-NEXT:    s_cbranch_execz .LBB2_4
752; GFX10W32-NEXT:  ; %bb.3:
753; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
754; GFX10W32-NEXT:    v_mov_b32_e32 v0, s0
755; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
756; GFX10W32-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
757; GFX10W32-NEXT:  .LBB2_4:
758; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
759; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
760; GFX10W32-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
761; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
762; GFX10W32-NEXT:    v_readfirstlane_b32 s2, v0
763; GFX10W32-NEXT:    v_mov_b32_e32 v0, 0
764; GFX10W32-NEXT:    v_add_nc_u32_e32 v1, s2, v1
765; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
766; GFX10W32-NEXT:    global_store_dword v0, v1, s[0:1]
767; GFX10W32-NEXT:    s_endpgm
768;
769; GFX11W64-LABEL: add_i32_varying_vdata:
770; GFX11W64:       ; %bb.0: ; %entry
771; GFX11W64-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
772; GFX11W64-NEXT:    s_mov_b64 s[0:1], exec
773; GFX11W64-NEXT:    s_mov_b32 s2, 0
774; GFX11W64-NEXT:    ; implicit-def: $vgpr0
775; GFX11W64-NEXT:  .LBB2_1: ; %ComputeLoop
776; GFX11W64-NEXT:    ; =>This Inner Loop Header: Depth=1
777; GFX11W64-NEXT:    s_ctz_i32_b64 s3, s[0:1]
778; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
779; GFX11W64-NEXT:    v_readlane_b32 s8, v1, s3
780; GFX11W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
781; GFX11W64-NEXT:    v_writelane_b32 v0, s2, s3
782; GFX11W64-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
783; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
784; GFX11W64-NEXT:    s_add_i32 s2, s2, s8
785; GFX11W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
786; GFX11W64-NEXT:    s_cbranch_scc1 .LBB2_1
787; GFX11W64-NEXT:  ; %bb.2: ; %ComputeEnd
788; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
789; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
790; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
791; GFX11W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
792; GFX11W64-NEXT:    ; implicit-def: $vgpr1
793; GFX11W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
794; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
795; GFX11W64-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
796; GFX11W64-NEXT:    s_cbranch_execz .LBB2_4
797; GFX11W64-NEXT:  ; %bb.3:
798; GFX11W64-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
799; GFX11W64-NEXT:    v_mov_b32_e32 v1, s2
800; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
801; GFX11W64-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
802; GFX11W64-NEXT:  .LBB2_4:
803; GFX11W64-NEXT:    s_or_b64 exec, exec, s[0:1]
804; GFX11W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
805; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
806; GFX11W64-NEXT:    v_readfirstlane_b32 s2, v1
807; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
808; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
809; GFX11W64-NEXT:    v_add_nc_u32_e32 v0, s2, v0
810; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
811; GFX11W64-NEXT:    global_store_b32 v1, v0, s[0:1]
812; GFX11W64-NEXT:    s_endpgm
813;
814; GFX11W32-LABEL: add_i32_varying_vdata:
815; GFX11W32:       ; %bb.0: ; %entry
816; GFX11W32-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
817; GFX11W32-NEXT:    s_mov_b32 s1, exec_lo
818; GFX11W32-NEXT:    s_mov_b32 s0, 0
819; GFX11W32-NEXT:    ; implicit-def: $vgpr0
820; GFX11W32-NEXT:  .LBB2_1: ; %ComputeLoop
821; GFX11W32-NEXT:    ; =>This Inner Loop Header: Depth=1
822; GFX11W32-NEXT:    s_ctz_i32_b32 s2, s1
823; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
824; GFX11W32-NEXT:    v_readlane_b32 s3, v1, s2
825; GFX11W32-NEXT:    s_lshl_b32 s6, 1, s2
826; GFX11W32-NEXT:    v_writelane_b32 v0, s0, s2
827; GFX11W32-NEXT:    s_and_not1_b32 s1, s1, s6
828; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
829; GFX11W32-NEXT:    s_add_i32 s0, s0, s3
830; GFX11W32-NEXT:    s_cmp_lg_u32 s1, 0
831; GFX11W32-NEXT:    s_cbranch_scc1 .LBB2_1
832; GFX11W32-NEXT:  ; %bb.2: ; %ComputeEnd
833; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
834; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
835; GFX11W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
836; GFX11W32-NEXT:    ; implicit-def: $vgpr1
837; GFX11W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
838; GFX11W32-NEXT:    s_xor_b32 s1, exec_lo, s1
839; GFX11W32-NEXT:    s_cbranch_execz .LBB2_4
840; GFX11W32-NEXT:  ; %bb.3:
841; GFX11W32-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
842; GFX11W32-NEXT:    v_mov_b32_e32 v1, s0
843; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
844; GFX11W32-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
845; GFX11W32-NEXT:  .LBB2_4:
846; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
847; GFX11W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
848; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
849; GFX11W32-NEXT:    v_readfirstlane_b32 s2, v1
850; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
851; GFX11W32-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
852; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
853; GFX11W32-NEXT:    global_store_b32 v1, v0, s[0:1]
854; GFX11W32-NEXT:    s_endpgm
855;
856; GFX12W64-LABEL: add_i32_varying_vdata:
857; GFX12W64:       ; %bb.0: ; %entry
858; GFX12W64-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
859; GFX12W64-NEXT:    s_mov_b64 s[0:1], exec
860; GFX12W64-NEXT:    s_mov_b32 s2, 0
861; GFX12W64-NEXT:    ; implicit-def: $vgpr0
862; GFX12W64-NEXT:  .LBB2_1: ; %ComputeLoop
863; GFX12W64-NEXT:    ; =>This Inner Loop Header: Depth=1
864; GFX12W64-NEXT:    s_ctz_i32_b64 s3, s[0:1]
865; GFX12W64-NEXT:    s_wait_alu 0xfffe
866; GFX12W64-NEXT:    v_readlane_b32 s8, v1, s3
867; GFX12W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
868; GFX12W64-NEXT:    v_writelane_b32 v0, s2, s3
869; GFX12W64-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
870; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
871; GFX12W64-NEXT:    s_add_co_i32 s2, s2, s8
872; GFX12W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
873; GFX12W64-NEXT:    s_cbranch_scc1 .LBB2_1
874; GFX12W64-NEXT:  ; %bb.2: ; %ComputeEnd
875; GFX12W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
876; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
877; GFX12W64-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
878; GFX12W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
879; GFX12W64-NEXT:    ; implicit-def: $vgpr1
880; GFX12W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
881; GFX12W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
882; GFX12W64-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
883; GFX12W64-NEXT:    s_cbranch_execz .LBB2_4
884; GFX12W64-NEXT:  ; %bb.3:
885; GFX12W64-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
886; GFX12W64-NEXT:    s_wait_alu 0xfffe
887; GFX12W64-NEXT:    v_mov_b32_e32 v1, s2
888; GFX12W64-NEXT:    s_wait_kmcnt 0x0
889; GFX12W64-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
890; GFX12W64-NEXT:  .LBB2_4:
891; GFX12W64-NEXT:    s_or_b64 exec, exec, s[0:1]
892; GFX12W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
893; GFX12W64-NEXT:    s_wait_loadcnt 0x0
894; GFX12W64-NEXT:    v_readfirstlane_b32 s2, v1
895; GFX12W64-NEXT:    v_mov_b32_e32 v1, 0
896; GFX12W64-NEXT:    s_wait_alu 0xfffe
897; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
898; GFX12W64-NEXT:    v_add_nc_u32_e32 v0, s2, v0
899; GFX12W64-NEXT:    s_wait_kmcnt 0x0
900; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
901; GFX12W64-NEXT:    s_endpgm
902;
903; GFX12W32-LABEL: add_i32_varying_vdata:
904; GFX12W32:       ; %bb.0: ; %entry
905; GFX12W32-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
906; GFX12W32-NEXT:    s_mov_b32 s1, exec_lo
907; GFX12W32-NEXT:    s_mov_b32 s0, 0
908; GFX12W32-NEXT:    ; implicit-def: $vgpr0
909; GFX12W32-NEXT:  .LBB2_1: ; %ComputeLoop
910; GFX12W32-NEXT:    ; =>This Inner Loop Header: Depth=1
911; GFX12W32-NEXT:    s_wait_alu 0xfffe
912; GFX12W32-NEXT:    s_ctz_i32_b32 s2, s1
913; GFX12W32-NEXT:    s_wait_alu 0xfffe
914; GFX12W32-NEXT:    v_readlane_b32 s3, v1, s2
915; GFX12W32-NEXT:    s_lshl_b32 s6, 1, s2
916; GFX12W32-NEXT:    v_writelane_b32 v0, s0, s2
917; GFX12W32-NEXT:    s_and_not1_b32 s1, s1, s6
918; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
919; GFX12W32-NEXT:    s_add_co_i32 s0, s0, s3
920; GFX12W32-NEXT:    s_wait_alu 0xfffe
921; GFX12W32-NEXT:    s_cmp_lg_u32 s1, 0
922; GFX12W32-NEXT:    s_cbranch_scc1 .LBB2_1
923; GFX12W32-NEXT:  ; %bb.2: ; %ComputeEnd
924; GFX12W32-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
925; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
926; GFX12W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
927; GFX12W32-NEXT:    ; implicit-def: $vgpr1
928; GFX12W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
929; GFX12W32-NEXT:    s_wait_alu 0xfffe
930; GFX12W32-NEXT:    s_xor_b32 s1, exec_lo, s1
931; GFX12W32-NEXT:    s_cbranch_execz .LBB2_4
932; GFX12W32-NEXT:  ; %bb.3:
933; GFX12W32-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
934; GFX12W32-NEXT:    v_mov_b32_e32 v1, s0
935; GFX12W32-NEXT:    s_wait_kmcnt 0x0
936; GFX12W32-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
937; GFX12W32-NEXT:  .LBB2_4:
938; GFX12W32-NEXT:    s_wait_alu 0xfffe
939; GFX12W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
940; GFX12W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
941; GFX12W32-NEXT:    s_wait_loadcnt 0x0
942; GFX12W32-NEXT:    v_readfirstlane_b32 s2, v1
943; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
944; GFX12W32-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
945; GFX12W32-NEXT:    s_wait_kmcnt 0x0
946; GFX12W32-NEXT:    global_store_b32 v1, v0, s[0:1]
947; GFX12W32-NEXT:    s_endpgm
948entry:
949  %lane = call i32 @llvm.amdgcn.workitem.id.x()
950  %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32 %lane, ptr addrspace(8) %inout, i32 0, i32 0, i32 0)
951  store i32 %old, ptr addrspace(1) %out
952  ret void
953}
954
; Test: struct-buffer atomic add whose data operand is the workitem id.
;
; The IR body at the bottom of this function passes the result of
; @llvm.amdgcn.workitem.id.x() as the first operand of
; @llvm.amdgcn.struct.ptr.buffer.atomic.add, %inout as the buffer resource,
; %vindex as the third operand and constant 0 for the remaining three operands.
; The i32 returned by the atomic is stored to %out.
;
; The check lines between the signature and the entry block are autogenerated
; (see the NOTE on the first line of the file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand. There is one
; group of check lines per prefix used by the RUN lines (GFX6, GFX8, GFX9, and
; the wavefrontsize64 / wavefrontsize32 variants of GFX10, GFX11 and GFX12).
; Every group expects the same overall shape:
;   * a loop labelled %ComputeLoop that takes a lane index from a copy of exec
;     (s_ff1 / s_ctz), reads that lane's value with v_readlane, writes the
;     scalar accumulator into that lane with v_writelane, adds the value that
;     was read into the accumulator and clears the lane's bit (s_andn2 /
;     s_and_not1) until the copy of exec is zero;
;   * v_mbcnt, a compare against 0 and s_and_saveexec, so that the single
;     buffer_atomic_add ... idxen is issued under the narrowed exec mask with
;     the accumulator as its data operand;
;   * after exec is restored, v_readfirstlane of the atomic's result, an add of
;     the per-lane value produced by the loop, and a store of that sum.
; NOTE(review): the function name suggests the point of the test is that the
; data operand differs per lane -- confirm against the optimizer pass.
955define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout, i32 %vindex) {
956; GFX6-LABEL: struct_add_i32_varying_vdata:
957; GFX6:       ; %bb.0: ; %entry
958; GFX6-NEXT:    s_mov_b64 s[0:1], exec
959; GFX6-NEXT:    s_mov_b32 s2, 0
960; GFX6-NEXT:    ; implicit-def: $vgpr1
961; GFX6-NEXT:  .LBB3_1: ; %ComputeLoop
962; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
963; GFX6-NEXT:    s_ff1_i32_b64 s3, s[0:1]
964; GFX6-NEXT:    s_mov_b32 m0, s3
965; GFX6-NEXT:    v_readlane_b32 s8, v0, s3
966; GFX6-NEXT:    v_writelane_b32 v1, s2, m0
967; GFX6-NEXT:    s_lshl_b64 s[6:7], 1, s3
968; GFX6-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
969; GFX6-NEXT:    v_cmp_ne_u64_e64 s[6:7], s[0:1], 0
970; GFX6-NEXT:    s_and_b64 vcc, exec, s[6:7]
971; GFX6-NEXT:    s_add_i32 s2, s2, s8
972; GFX6-NEXT:    s_cbranch_vccnz .LBB3_1
973; GFX6-NEXT:  ; %bb.2: ; %ComputeEnd
974; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
975; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
976; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
977; GFX6-NEXT:    ; implicit-def: $vgpr0
978; GFX6-NEXT:    s_and_saveexec_b64 s[0:1], vcc
979; GFX6-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
980; GFX6-NEXT:    s_cbranch_execz .LBB3_4
981; GFX6-NEXT:  ; %bb.3:
982; GFX6-NEXT:    s_load_dword s3, s[4:5], 0x11
983; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
984; GFX6-NEXT:    v_mov_b32_e32 v0, s2
985; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
986; GFX6-NEXT:    v_mov_b32_e32 v2, s3
987; GFX6-NEXT:    buffer_atomic_add v0, v2, s[8:11], 0 idxen glc
988; GFX6-NEXT:  .LBB3_4:
989; GFX6-NEXT:    s_or_b64 exec, exec, s[0:1]
990; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
991; GFX6-NEXT:    s_mov_b32 s3, 0xf000
992; GFX6-NEXT:    s_mov_b32 s2, -1
993; GFX6-NEXT:    s_waitcnt vmcnt(0)
994; GFX6-NEXT:    v_readfirstlane_b32 s4, v0
995; GFX6-NEXT:    s_waitcnt expcnt(0)
996; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v1
997; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
998; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
999; GFX6-NEXT:    s_endpgm
1000;
1001; GFX8-LABEL: struct_add_i32_varying_vdata:
1002; GFX8:       ; %bb.0: ; %entry
1003; GFX8-NEXT:    s_mov_b64 s[0:1], exec
1004; GFX8-NEXT:    s_mov_b32 s2, 0
1005; GFX8-NEXT:    ; implicit-def: $vgpr1
1006; GFX8-NEXT:  .LBB3_1: ; %ComputeLoop
1007; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
1008; GFX8-NEXT:    s_ff1_i32_b64 s3, s[0:1]
1009; GFX8-NEXT:    s_mov_b32 m0, s3
1010; GFX8-NEXT:    v_readlane_b32 s8, v0, s3
1011; GFX8-NEXT:    s_lshl_b64 s[6:7], 1, s3
1012; GFX8-NEXT:    v_writelane_b32 v1, s2, m0
1013; GFX8-NEXT:    s_add_i32 s2, s2, s8
1014; GFX8-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
1015; GFX8-NEXT:    s_cmp_lg_u64 s[0:1], 0
1016; GFX8-NEXT:    s_cbranch_scc1 .LBB3_1
1017; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
1018; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1019; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1020; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1021; GFX8-NEXT:    ; implicit-def: $vgpr0
1022; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1023; GFX8-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
1024; GFX8-NEXT:    s_cbranch_execz .LBB3_4
1025; GFX8-NEXT:  ; %bb.3:
1026; GFX8-NEXT:    s_load_dword s3, s[4:5], 0x44
1027; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
1028; GFX8-NEXT:    v_mov_b32_e32 v0, s2
1029; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1030; GFX8-NEXT:    v_mov_b32_e32 v2, s3
1031; GFX8-NEXT:    buffer_atomic_add v0, v2, s[8:11], 0 idxen glc
1032; GFX8-NEXT:  .LBB3_4:
1033; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
1034; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1035; GFX8-NEXT:    s_waitcnt vmcnt(0)
1036; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
1037; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s2, v1
1038; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1039; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1040; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1041; GFX8-NEXT:    flat_store_dword v[0:1], v2
1042; GFX8-NEXT:    s_endpgm
1043;
1044; GFX9-LABEL: struct_add_i32_varying_vdata:
1045; GFX9:       ; %bb.0: ; %entry
1046; GFX9-NEXT:    s_mov_b64 s[0:1], exec
1047; GFX9-NEXT:    s_mov_b32 s2, 0
1048; GFX9-NEXT:    ; implicit-def: $vgpr1
1049; GFX9-NEXT:  .LBB3_1: ; %ComputeLoop
1050; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
1051; GFX9-NEXT:    s_ff1_i32_b64 s3, s[0:1]
1052; GFX9-NEXT:    s_mov_b32 m0, s3
1053; GFX9-NEXT:    v_readlane_b32 s8, v0, s3
1054; GFX9-NEXT:    s_lshl_b64 s[6:7], 1, s3
1055; GFX9-NEXT:    v_writelane_b32 v1, s2, m0
1056; GFX9-NEXT:    s_add_i32 s2, s2, s8
1057; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
1058; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
1059; GFX9-NEXT:    s_cbranch_scc1 .LBB3_1
1060; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
1061; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1062; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1063; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1064; GFX9-NEXT:    ; implicit-def: $vgpr0
1065; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1066; GFX9-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
1067; GFX9-NEXT:    s_cbranch_execz .LBB3_4
1068; GFX9-NEXT:  ; %bb.3:
1069; GFX9-NEXT:    s_load_dword s3, s[4:5], 0x44
1070; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
1071; GFX9-NEXT:    v_mov_b32_e32 v0, s2
1072; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1073; GFX9-NEXT:    v_mov_b32_e32 v2, s3
1074; GFX9-NEXT:    buffer_atomic_add v0, v2, s[8:11], 0 idxen glc
1075; GFX9-NEXT:  .LBB3_4:
1076; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
1077; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1078; GFX9-NEXT:    s_waitcnt vmcnt(0)
1079; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
1080; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1081; GFX9-NEXT:    v_add_u32_e32 v0, s2, v1
1082; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1083; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
1084; GFX9-NEXT:    s_endpgm
1085;
1086; GFX10W64-LABEL: struct_add_i32_varying_vdata:
1087; GFX10W64:       ; %bb.0: ; %entry
1088; GFX10W64-NEXT:    s_mov_b64 s[0:1], exec
1089; GFX10W64-NEXT:    s_mov_b32 s2, 0
1090; GFX10W64-NEXT:    ; implicit-def: $vgpr1
1091; GFX10W64-NEXT:  .LBB3_1: ; %ComputeLoop
1092; GFX10W64-NEXT:    ; =>This Inner Loop Header: Depth=1
1093; GFX10W64-NEXT:    s_ff1_i32_b64 s3, s[0:1]
1094; GFX10W64-NEXT:    v_readlane_b32 s8, v0, s3
1095; GFX10W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
1096; GFX10W64-NEXT:    v_writelane_b32 v1, s2, s3
1097; GFX10W64-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
1098; GFX10W64-NEXT:    s_add_i32 s2, s2, s8
1099; GFX10W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
1100; GFX10W64-NEXT:    s_cbranch_scc1 .LBB3_1
1101; GFX10W64-NEXT:  ; %bb.2: ; %ComputeEnd
1102; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1103; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1104; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1105; GFX10W64-NEXT:    ; implicit-def: $vgpr0
1106; GFX10W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1107; GFX10W64-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
1108; GFX10W64-NEXT:    s_cbranch_execz .LBB3_4
1109; GFX10W64-NEXT:  ; %bb.3:
1110; GFX10W64-NEXT:    s_clause 0x1
1111; GFX10W64-NEXT:    s_load_dword s3, s[4:5], 0x44
1112; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
1113; GFX10W64-NEXT:    v_mov_b32_e32 v0, s2
1114; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1115; GFX10W64-NEXT:    v_mov_b32_e32 v2, s3
1116; GFX10W64-NEXT:    buffer_atomic_add v0, v2, s[8:11], 0 idxen glc
1117; GFX10W64-NEXT:  .LBB3_4:
1118; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
1119; GFX10W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1120; GFX10W64-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1121; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
1122; GFX10W64-NEXT:    v_readfirstlane_b32 s2, v0
1123; GFX10W64-NEXT:    v_mov_b32_e32 v0, 0
1124; GFX10W64-NEXT:    v_add_nc_u32_e32 v1, s2, v1
1125; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1126; GFX10W64-NEXT:    global_store_dword v0, v1, s[0:1]
1127; GFX10W64-NEXT:    s_endpgm
1128;
1129; GFX10W32-LABEL: struct_add_i32_varying_vdata:
1130; GFX10W32:       ; %bb.0: ; %entry
1131; GFX10W32-NEXT:    s_mov_b32 s1, exec_lo
1132; GFX10W32-NEXT:    s_mov_b32 s0, 0
1133; GFX10W32-NEXT:    ; implicit-def: $vgpr1
1134; GFX10W32-NEXT:  .LBB3_1: ; %ComputeLoop
1135; GFX10W32-NEXT:    ; =>This Inner Loop Header: Depth=1
1136; GFX10W32-NEXT:    s_ff1_i32_b32 s2, s1
1137; GFX10W32-NEXT:    v_readlane_b32 s3, v0, s2
1138; GFX10W32-NEXT:    s_lshl_b32 s6, 1, s2
1139; GFX10W32-NEXT:    v_writelane_b32 v1, s0, s2
1140; GFX10W32-NEXT:    s_andn2_b32 s1, s1, s6
1141; GFX10W32-NEXT:    s_add_i32 s0, s0, s3
1142; GFX10W32-NEXT:    s_cmp_lg_u32 s1, 0
1143; GFX10W32-NEXT:    s_cbranch_scc1 .LBB3_1
1144; GFX10W32-NEXT:  ; %bb.2: ; %ComputeEnd
1145; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1146; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1147; GFX10W32-NEXT:    ; implicit-def: $vgpr0
1148; GFX10W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
1149; GFX10W32-NEXT:    s_xor_b32 s1, exec_lo, s1
1150; GFX10W32-NEXT:    s_cbranch_execz .LBB3_4
1151; GFX10W32-NEXT:  ; %bb.3:
1152; GFX10W32-NEXT:    s_clause 0x1
1153; GFX10W32-NEXT:    s_load_dword s2, s[4:5], 0x44
1154; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
1155; GFX10W32-NEXT:    v_mov_b32_e32 v0, s0
1156; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1157; GFX10W32-NEXT:    v_mov_b32_e32 v2, s2
1158; GFX10W32-NEXT:    buffer_atomic_add v0, v2, s[8:11], 0 idxen glc
1159; GFX10W32-NEXT:  .LBB3_4:
1160; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
1161; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
1162; GFX10W32-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1163; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
1164; GFX10W32-NEXT:    v_readfirstlane_b32 s2, v0
1165; GFX10W32-NEXT:    v_mov_b32_e32 v0, 0
1166; GFX10W32-NEXT:    v_add_nc_u32_e32 v1, s2, v1
1167; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1168; GFX10W32-NEXT:    global_store_dword v0, v1, s[0:1]
1169; GFX10W32-NEXT:    s_endpgm
1170;
1171; GFX11W64-LABEL: struct_add_i32_varying_vdata:
1172; GFX11W64:       ; %bb.0: ; %entry
1173; GFX11W64-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
1174; GFX11W64-NEXT:    s_mov_b64 s[0:1], exec
1175; GFX11W64-NEXT:    s_mov_b32 s2, 0
1176; GFX11W64-NEXT:    ; implicit-def: $vgpr0
1177; GFX11W64-NEXT:  .LBB3_1: ; %ComputeLoop
1178; GFX11W64-NEXT:    ; =>This Inner Loop Header: Depth=1
1179; GFX11W64-NEXT:    s_ctz_i32_b64 s3, s[0:1]
1180; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
1181; GFX11W64-NEXT:    v_readlane_b32 s8, v1, s3
1182; GFX11W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
1183; GFX11W64-NEXT:    v_writelane_b32 v0, s2, s3
1184; GFX11W64-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
1185; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1186; GFX11W64-NEXT:    s_add_i32 s2, s2, s8
1187; GFX11W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
1188; GFX11W64-NEXT:    s_cbranch_scc1 .LBB3_1
1189; GFX11W64-NEXT:  ; %bb.2: ; %ComputeEnd
1190; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
1191; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1192; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
1193; GFX11W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
1194; GFX11W64-NEXT:    ; implicit-def: $vgpr1
1195; GFX11W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1196; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1197; GFX11W64-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
1198; GFX11W64-NEXT:    s_cbranch_execz .LBB3_4
1199; GFX11W64-NEXT:  ; %bb.3:
1200; GFX11W64-NEXT:    s_clause 0x1
1201; GFX11W64-NEXT:    s_load_b32 s3, s[4:5], 0x44
1202; GFX11W64-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
1203; GFX11W64-NEXT:    v_mov_b32_e32 v1, s2
1204; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
1205; GFX11W64-NEXT:    v_mov_b32_e32 v2, s3
1206; GFX11W64-NEXT:    buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc
1207; GFX11W64-NEXT:  .LBB3_4:
1208; GFX11W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1209; GFX11W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1210; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
1211; GFX11W64-NEXT:    v_readfirstlane_b32 s2, v1
1212; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
1213; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1214; GFX11W64-NEXT:    v_add_nc_u32_e32 v0, s2, v0
1215; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
1216; GFX11W64-NEXT:    global_store_b32 v1, v0, s[0:1]
1217; GFX11W64-NEXT:    s_endpgm
1218;
1219; GFX11W32-LABEL: struct_add_i32_varying_vdata:
1220; GFX11W32:       ; %bb.0: ; %entry
1221; GFX11W32-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
1222; GFX11W32-NEXT:    s_mov_b32 s1, exec_lo
1223; GFX11W32-NEXT:    s_mov_b32 s0, 0
1224; GFX11W32-NEXT:    ; implicit-def: $vgpr0
1225; GFX11W32-NEXT:  .LBB3_1: ; %ComputeLoop
1226; GFX11W32-NEXT:    ; =>This Inner Loop Header: Depth=1
1227; GFX11W32-NEXT:    s_ctz_i32_b32 s2, s1
1228; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
1229; GFX11W32-NEXT:    v_readlane_b32 s3, v1, s2
1230; GFX11W32-NEXT:    s_lshl_b32 s6, 1, s2
1231; GFX11W32-NEXT:    v_writelane_b32 v0, s0, s2
1232; GFX11W32-NEXT:    s_and_not1_b32 s1, s1, s6
1233; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1234; GFX11W32-NEXT:    s_add_i32 s0, s0, s3
1235; GFX11W32-NEXT:    s_cmp_lg_u32 s1, 0
1236; GFX11W32-NEXT:    s_cbranch_scc1 .LBB3_1
1237; GFX11W32-NEXT:  ; %bb.2: ; %ComputeEnd
1238; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
1239; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
1240; GFX11W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
1241; GFX11W32-NEXT:    ; implicit-def: $vgpr1
1242; GFX11W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
1243; GFX11W32-NEXT:    s_xor_b32 s1, exec_lo, s1
1244; GFX11W32-NEXT:    s_cbranch_execz .LBB3_4
1245; GFX11W32-NEXT:  ; %bb.3:
1246; GFX11W32-NEXT:    s_clause 0x1
1247; GFX11W32-NEXT:    s_load_b32 s2, s[4:5], 0x44
1248; GFX11W32-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
1249; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
1250; GFX11W32-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s2
1251; GFX11W32-NEXT:    buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc
1252; GFX11W32-NEXT:  .LBB3_4:
1253; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
1254; GFX11W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1255; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
1256; GFX11W32-NEXT:    v_readfirstlane_b32 s2, v1
1257; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1258; GFX11W32-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
1259; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
1260; GFX11W32-NEXT:    global_store_b32 v1, v0, s[0:1]
1261; GFX11W32-NEXT:    s_endpgm
1262;
1263; GFX12W64-LABEL: struct_add_i32_varying_vdata:
1264; GFX12W64:       ; %bb.0: ; %entry
1265; GFX12W64-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
1266; GFX12W64-NEXT:    s_mov_b64 s[0:1], exec
1267; GFX12W64-NEXT:    s_mov_b32 s2, 0
1268; GFX12W64-NEXT:    ; implicit-def: $vgpr0
1269; GFX12W64-NEXT:  .LBB3_1: ; %ComputeLoop
1270; GFX12W64-NEXT:    ; =>This Inner Loop Header: Depth=1
1271; GFX12W64-NEXT:    s_ctz_i32_b64 s3, s[0:1]
1272; GFX12W64-NEXT:    s_wait_alu 0xfffe
1273; GFX12W64-NEXT:    v_readlane_b32 s8, v1, s3
1274; GFX12W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
1275; GFX12W64-NEXT:    v_writelane_b32 v0, s2, s3
1276; GFX12W64-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
1277; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1278; GFX12W64-NEXT:    s_add_co_i32 s2, s2, s8
1279; GFX12W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
1280; GFX12W64-NEXT:    s_cbranch_scc1 .LBB3_1
1281; GFX12W64-NEXT:  ; %bb.2: ; %ComputeEnd
1282; GFX12W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
1283; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1284; GFX12W64-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
1285; GFX12W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
1286; GFX12W64-NEXT:    ; implicit-def: $vgpr1
1287; GFX12W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1288; GFX12W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1289; GFX12W64-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
1290; GFX12W64-NEXT:    s_cbranch_execz .LBB3_4
1291; GFX12W64-NEXT:  ; %bb.3:
1292; GFX12W64-NEXT:    s_clause 0x1
1293; GFX12W64-NEXT:    s_load_b32 s3, s[4:5], 0x44
1294; GFX12W64-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
1295; GFX12W64-NEXT:    s_wait_alu 0xfffe
1296; GFX12W64-NEXT:    v_mov_b32_e32 v1, s2
1297; GFX12W64-NEXT:    s_wait_kmcnt 0x0
1298; GFX12W64-NEXT:    v_mov_b32_e32 v2, s3
1299; GFX12W64-NEXT:    buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
1300; GFX12W64-NEXT:  .LBB3_4:
1301; GFX12W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1302; GFX12W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1303; GFX12W64-NEXT:    s_wait_loadcnt 0x0
1304; GFX12W64-NEXT:    v_readfirstlane_b32 s2, v1
1305; GFX12W64-NEXT:    v_mov_b32_e32 v1, 0
1306; GFX12W64-NEXT:    s_wait_alu 0xfffe
1307; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1308; GFX12W64-NEXT:    v_add_nc_u32_e32 v0, s2, v0
1309; GFX12W64-NEXT:    s_wait_kmcnt 0x0
1310; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
1311; GFX12W64-NEXT:    s_endpgm
1312;
1313; GFX12W32-LABEL: struct_add_i32_varying_vdata:
1314; GFX12W32:       ; %bb.0: ; %entry
1315; GFX12W32-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
1316; GFX12W32-NEXT:    s_mov_b32 s1, exec_lo
1317; GFX12W32-NEXT:    s_mov_b32 s0, 0
1318; GFX12W32-NEXT:    ; implicit-def: $vgpr0
1319; GFX12W32-NEXT:  .LBB3_1: ; %ComputeLoop
1320; GFX12W32-NEXT:    ; =>This Inner Loop Header: Depth=1
1321; GFX12W32-NEXT:    s_wait_alu 0xfffe
1322; GFX12W32-NEXT:    s_ctz_i32_b32 s2, s1
1323; GFX12W32-NEXT:    s_wait_alu 0xfffe
1324; GFX12W32-NEXT:    v_readlane_b32 s3, v1, s2
1325; GFX12W32-NEXT:    s_lshl_b32 s6, 1, s2
1326; GFX12W32-NEXT:    v_writelane_b32 v0, s0, s2
1327; GFX12W32-NEXT:    s_and_not1_b32 s1, s1, s6
1328; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1329; GFX12W32-NEXT:    s_add_co_i32 s0, s0, s3
1330; GFX12W32-NEXT:    s_wait_alu 0xfffe
1331; GFX12W32-NEXT:    s_cmp_lg_u32 s1, 0
1332; GFX12W32-NEXT:    s_cbranch_scc1 .LBB3_1
1333; GFX12W32-NEXT:  ; %bb.2: ; %ComputeEnd
1334; GFX12W32-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
1335; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1336; GFX12W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
1337; GFX12W32-NEXT:    ; implicit-def: $vgpr1
1338; GFX12W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
1339; GFX12W32-NEXT:    s_wait_alu 0xfffe
1340; GFX12W32-NEXT:    s_xor_b32 s1, exec_lo, s1
1341; GFX12W32-NEXT:    s_cbranch_execz .LBB3_4
1342; GFX12W32-NEXT:  ; %bb.3:
1343; GFX12W32-NEXT:    s_clause 0x1
1344; GFX12W32-NEXT:    s_load_b32 s2, s[4:5], 0x44
1345; GFX12W32-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
1346; GFX12W32-NEXT:    s_wait_kmcnt 0x0
1347; GFX12W32-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s2
1348; GFX12W32-NEXT:    buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
1349; GFX12W32-NEXT:  .LBB3_4:
1350; GFX12W32-NEXT:    s_wait_alu 0xfffe
1351; GFX12W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
1352; GFX12W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1353; GFX12W32-NEXT:    s_wait_loadcnt 0x0
1354; GFX12W32-NEXT:    v_readfirstlane_b32 s2, v1
1355; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1356; GFX12W32-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
1357; GFX12W32-NEXT:    s_wait_kmcnt 0x0
1358; GFX12W32-NEXT:    global_store_b32 v1, v0, s[0:1]
1359; GFX12W32-NEXT:    s_endpgm
1360entry:
; The value added by the atomic is this lane's workitem id.
1361  %lane = call i32 @llvm.amdgcn.workitem.id.x()
; The atomic's return value is kept in %old, which is why the expected code
; above uses the returning form of buffer_atomic_add (glc / TH_ATOMIC_RETURN).
1362  %old = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.add(i32 %lane, ptr addrspace(8) %inout, i32 %vindex, i32 0, i32 0, i32 0)
1363  store i32 %old, ptr addrspace(1) %out
1364  ret void
1365}
1366
1366
; Raw buffer atomic add of the constant 1, with the per-lane workitem id
; passed as the third intrinsic operand (presumably the voffset operand; the
; expected code below carries it in the VGPR address operand with `offen`).
; On every target the expected output is one buffer_atomic_add issued
; directly by each lane; no mbcnt / readfirstlane sequence is expected, i.e.
; the atomic is left un-combined across lanes in this case.
; The assertion lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing them by hand.
1367define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
1368; GFX6-LABEL: add_i32_varying_offset:
1369; GFX6:       ; %bb.0: ; %entry
1370; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xd
1371; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
1372; GFX6-NEXT:    v_mov_b32_e32 v1, 1
1373; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1374; GFX6-NEXT:    buffer_atomic_add v1, v0, s[0:3], 0 offen glc
1375; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1376; GFX6-NEXT:    s_mov_b32 s6, -1
1377; GFX6-NEXT:    s_waitcnt vmcnt(0)
1378; GFX6-NEXT:    buffer_store_dword v1, off, s[4:7], 0
1379; GFX6-NEXT:    s_endpgm
1380;
1381; GFX8-LABEL: add_i32_varying_offset:
1382; GFX8:       ; %bb.0: ; %entry
1383; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
1384; GFX8-NEXT:    v_mov_b32_e32 v2, 1
1385; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1386; GFX8-NEXT:    buffer_atomic_add v2, v0, s[0:3], 0 offen glc
1387; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1388; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1389; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1390; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1391; GFX8-NEXT:    s_waitcnt vmcnt(0)
1392; GFX8-NEXT:    flat_store_dword v[0:1], v2
1393; GFX8-NEXT:    s_endpgm
1394;
1395; GFX9-LABEL: add_i32_varying_offset:
1396; GFX9:       ; %bb.0: ; %entry
1397; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
1398; GFX9-NEXT:    v_mov_b32_e32 v1, 1
1399; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1400; GFX9-NEXT:    buffer_atomic_add v1, v0, s[0:3], 0 offen glc
1401; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1402; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1403; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1404; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1405; GFX9-NEXT:    s_endpgm
1406;
1407; GFX10-LABEL: add_i32_varying_offset:
1408; GFX10:       ; %bb.0: ; %entry
1409; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
1410; GFX10-NEXT:    v_mov_b32_e32 v1, 1
1411; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1412; GFX10-NEXT:    buffer_atomic_add v1, v0, s[0:3], 0 offen glc
1413; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
1414; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1415; GFX10-NEXT:    v_mov_b32_e32 v0, 0
1416; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1417; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
1418; GFX10-NEXT:    s_endpgm
1419;
1420; GFX11W64-LABEL: add_i32_varying_offset:
1421; GFX11W64:       ; %bb.0: ; %entry
1422; GFX11W64-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
1423; GFX11W64-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1424; GFX11W64-NEXT:    v_mov_b32_e32 v1, 1
1425; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
1426; GFX11W64-NEXT:    buffer_atomic_add_u32 v1, v0, s[0:3], 0 offen glc
1427; GFX11W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1428; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
1429; GFX11W64-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1430; GFX11W64-NEXT:    global_store_b32 v0, v1, s[0:1]
1431; GFX11W64-NEXT:    s_endpgm
1432;
1433; GFX11W32-LABEL: add_i32_varying_offset:
1434; GFX11W32:       ; %bb.0: ; %entry
1435; GFX11W32-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
1436; GFX11W32-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
1437; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
1438; GFX11W32-NEXT:    buffer_atomic_add_u32 v1, v0, s[0:3], 0 offen glc
1439; GFX11W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1440; GFX11W32-NEXT:    v_mov_b32_e32 v0, 0
1441; GFX11W32-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1442; GFX11W32-NEXT:    global_store_b32 v0, v1, s[0:1]
1443; GFX11W32-NEXT:    s_endpgm
1444;
1445; GFX12W64-LABEL: add_i32_varying_offset:
1446; GFX12W64:       ; %bb.0: ; %entry
1447; GFX12W64-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
1448; GFX12W64-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1449; GFX12W64-NEXT:    v_mov_b32_e32 v1, 1
1450; GFX12W64-NEXT:    s_wait_kmcnt 0x0
1451; GFX12W64-NEXT:    buffer_atomic_add_u32 v1, v0, s[0:3], null offen th:TH_ATOMIC_RETURN
1452; GFX12W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1453; GFX12W64-NEXT:    v_mov_b32_e32 v0, 0
1454; GFX12W64-NEXT:    s_wait_loadcnt 0x0
1455; GFX12W64-NEXT:    s_wait_kmcnt 0x0
1456; GFX12W64-NEXT:    global_store_b32 v0, v1, s[0:1]
1457; GFX12W64-NEXT:    s_endpgm
1458;
1459; GFX12W32-LABEL: add_i32_varying_offset:
1460; GFX12W32:       ; %bb.0: ; %entry
1461; GFX12W32-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
1462; GFX12W32-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
1463; GFX12W32-NEXT:    s_wait_kmcnt 0x0
1464; GFX12W32-NEXT:    buffer_atomic_add_u32 v1, v0, s[0:3], null offen th:TH_ATOMIC_RETURN
1465; GFX12W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1466; GFX12W32-NEXT:    v_mov_b32_e32 v0, 0
1467; GFX12W32-NEXT:    s_wait_loadcnt 0x0
1468; GFX12W32-NEXT:    s_wait_kmcnt 0x0
1469; GFX12W32-NEXT:    global_store_b32 v0, v1, s[0:1]
1470; GFX12W32-NEXT:    s_endpgm
1471entry:
1472  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; The data operand is the constant 1; the lane-dependent value is the third
  ; operand, which differs per lane.
1473  %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32 1, ptr addrspace(8) %inout, i32 %lane, i32 0, i32 0)
1474  store i32 %old, ptr addrspace(1) %out
1475  ret void
1476}
1477
; Raw buffer atomic sub of the constant 5 with all offset operands zero.
; Shape of the expected code on every target, as pinned by the assertions:
;   * v_mbcnt over a copy of the exec mask yields a per-lane count in v0;
;     only the lane(s) where that count is 0 enter the atomic block.
;   * Inside the block a single buffer_atomic_sub is issued whose operand is
;     popcount(exec copy) * 5 (s_bcnt1 followed by s_mul_i32 by 5).
;   * After exec is restored, every lane computes
;     readfirstlane(returned value) - 5 * (its mbcnt count) and stores the
;     result through %out.
; The assertion lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing them by hand.
1478define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
1479; GFX6-LABEL: sub_i32_constant:
1480; GFX6:       ; %bb.0: ; %entry
1481; GFX6-NEXT:    s_mov_b64 s[2:3], exec
1482; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1483; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
1484; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1485; GFX6-NEXT:    ; implicit-def: $vgpr1
1486; GFX6-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1487; GFX6-NEXT:    s_cbranch_execz .LBB5_2
1488; GFX6-NEXT:  ; %bb.1:
1489; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
1490; GFX6-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1491; GFX6-NEXT:    s_mul_i32 s2, s2, 5
1492; GFX6-NEXT:    v_mov_b32_e32 v1, s2
1493; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1494; GFX6-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1495; GFX6-NEXT:  .LBB5_2:
1496; GFX6-NEXT:    s_or_b64 exec, exec, s[0:1]
1497; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1498; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1499; GFX6-NEXT:    s_mov_b32 s2, -1
1500; GFX6-NEXT:    s_waitcnt vmcnt(0)
1501; GFX6-NEXT:    v_readfirstlane_b32 s4, v1
1502; GFX6-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1503; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
1504; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1505; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1506; GFX6-NEXT:    s_endpgm
1507;
1508; GFX8-LABEL: sub_i32_constant:
1509; GFX8:       ; %bb.0: ; %entry
1510; GFX8-NEXT:    s_mov_b64 s[2:3], exec
1511; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1512; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1513; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1514; GFX8-NEXT:    ; implicit-def: $vgpr1
1515; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1516; GFX8-NEXT:    s_cbranch_execz .LBB5_2
1517; GFX8-NEXT:  ; %bb.1:
1518; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
1519; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1520; GFX8-NEXT:    s_mul_i32 s2, s2, 5
1521; GFX8-NEXT:    v_mov_b32_e32 v1, s2
1522; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1523; GFX8-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1524; GFX8-NEXT:  .LBB5_2:
1525; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
1526; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1527; GFX8-NEXT:    s_waitcnt vmcnt(0)
1528; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
1529; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1530; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s2, v0
1531; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1532; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1533; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1534; GFX8-NEXT:    flat_store_dword v[0:1], v2
1535; GFX8-NEXT:    s_endpgm
1536;
1537; GFX9-LABEL: sub_i32_constant:
1538; GFX9:       ; %bb.0: ; %entry
1539; GFX9-NEXT:    s_mov_b64 s[2:3], exec
1540; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1541; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1542; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1543; GFX9-NEXT:    ; implicit-def: $vgpr1
1544; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1545; GFX9-NEXT:    s_cbranch_execz .LBB5_2
1546; GFX9-NEXT:  ; %bb.1:
1547; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
1548; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1549; GFX9-NEXT:    s_mul_i32 s2, s2, 5
1550; GFX9-NEXT:    v_mov_b32_e32 v1, s2
1551; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1552; GFX9-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1553; GFX9-NEXT:  .LBB5_2:
1554; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
1555; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1556; GFX9-NEXT:    s_waitcnt vmcnt(0)
1557; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
1558; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1559; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1560; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
1561; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1562; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
1563; GFX9-NEXT:    s_endpgm
1564;
1565; GFX10W64-LABEL: sub_i32_constant:
1566; GFX10W64:       ; %bb.0: ; %entry
1567; GFX10W64-NEXT:    s_mov_b64 s[2:3], exec
1568; GFX10W64-NEXT:    ; implicit-def: $vgpr1
1569; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1570; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1571; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1572; GFX10W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1573; GFX10W64-NEXT:    s_cbranch_execz .LBB5_2
1574; GFX10W64-NEXT:  ; %bb.1:
1575; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
1576; GFX10W64-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1577; GFX10W64-NEXT:    s_mul_i32 s2, s2, 5
1578; GFX10W64-NEXT:    v_mov_b32_e32 v1, s2
1579; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1580; GFX10W64-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1581; GFX10W64-NEXT:  .LBB5_2:
1582; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
1583; GFX10W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1584; GFX10W64-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1585; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
1586; GFX10W64-NEXT:    v_readfirstlane_b32 s2, v1
1587; GFX10W64-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1588; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
1589; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1590; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1591; GFX10W64-NEXT:    global_store_dword v1, v0, s[0:1]
1592; GFX10W64-NEXT:    s_endpgm
1593;
1594; GFX10W32-LABEL: sub_i32_constant:
1595; GFX10W32:       ; %bb.0: ; %entry
1596; GFX10W32-NEXT:    s_mov_b32 s1, exec_lo
1597; GFX10W32-NEXT:    ; implicit-def: $vgpr1
1598; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s1, 0
1599; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1600; GFX10W32-NEXT:    s_and_saveexec_b32 s0, vcc_lo
1601; GFX10W32-NEXT:    s_cbranch_execz .LBB5_2
1602; GFX10W32-NEXT:  ; %bb.1:
1603; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
1604; GFX10W32-NEXT:    s_bcnt1_i32_b32 s1, s1
1605; GFX10W32-NEXT:    s_mul_i32 s1, s1, 5
1606; GFX10W32-NEXT:    v_mov_b32_e32 v1, s1
1607; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1608; GFX10W32-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1609; GFX10W32-NEXT:  .LBB5_2:
1610; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
1611; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1612; GFX10W32-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1613; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
1614; GFX10W32-NEXT:    v_readfirstlane_b32 s2, v1
1615; GFX10W32-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1616; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
1617; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1618; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1619; GFX10W32-NEXT:    global_store_dword v1, v0, s[0:1]
1620; GFX10W32-NEXT:    s_endpgm
1621;
1622; GFX11W64-LABEL: sub_i32_constant:
1623; GFX11W64:       ; %bb.0: ; %entry
1624; GFX11W64-NEXT:    s_mov_b64 s[2:3], exec
1625; GFX11W64-NEXT:    s_mov_b64 s[0:1], exec
1626; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1627; GFX11W64-NEXT:    ; implicit-def: $vgpr1
1628; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1629; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1630; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
1631; GFX11W64-NEXT:    s_cbranch_execz .LBB5_2
1632; GFX11W64-NEXT:  ; %bb.1:
1633; GFX11W64-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
1634; GFX11W64-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1635; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1636; GFX11W64-NEXT:    s_mul_i32 s2, s2, 5
1637; GFX11W64-NEXT:    v_mov_b32_e32 v1, s2
1638; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
1639; GFX11W64-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
1640; GFX11W64-NEXT:  .LBB5_2:
1641; GFX11W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1642; GFX11W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1643; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
1644; GFX11W64-NEXT:    v_readfirstlane_b32 s2, v1
1645; GFX11W64-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1646; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
1647; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1648; GFX11W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1649; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
1650; GFX11W64-NEXT:    global_store_b32 v1, v0, s[0:1]
1651; GFX11W64-NEXT:    s_endpgm
1652;
1653; GFX11W32-LABEL: sub_i32_constant:
1654; GFX11W32:       ; %bb.0: ; %entry
1655; GFX11W32-NEXT:    s_mov_b32 s1, exec_lo
1656; GFX11W32-NEXT:    s_mov_b32 s0, exec_lo
1657; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s1, 0
1658; GFX11W32-NEXT:    ; implicit-def: $vgpr1
1659; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1660; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
1661; GFX11W32-NEXT:    s_cbranch_execz .LBB5_2
1662; GFX11W32-NEXT:  ; %bb.1:
1663; GFX11W32-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
1664; GFX11W32-NEXT:    s_bcnt1_i32_b32 s1, s1
1665; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1666; GFX11W32-NEXT:    s_mul_i32 s1, s1, 5
1667; GFX11W32-NEXT:    v_mov_b32_e32 v1, s1
1668; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
1669; GFX11W32-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
1670; GFX11W32-NEXT:  .LBB5_2:
1671; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1672; GFX11W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1673; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
1674; GFX11W32-NEXT:    v_readfirstlane_b32 s2, v1
1675; GFX11W32-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1676; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
1677; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1678; GFX11W32-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1679; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
1680; GFX11W32-NEXT:    global_store_b32 v1, v0, s[0:1]
1681; GFX11W32-NEXT:    s_endpgm
1682;
1683; GFX12W64-LABEL: sub_i32_constant:
1684; GFX12W64:       ; %bb.0: ; %entry
1685; GFX12W64-NEXT:    s_mov_b64 s[2:3], exec
1686; GFX12W64-NEXT:    s_mov_b64 s[0:1], exec
1687; GFX12W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1688; GFX12W64-NEXT:    ; implicit-def: $vgpr1
1689; GFX12W64-NEXT:    s_wait_alu 0xfffe
1690; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1691; GFX12W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1692; GFX12W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
1693; GFX12W64-NEXT:    s_cbranch_execz .LBB5_2
1694; GFX12W64-NEXT:  ; %bb.1:
1695; GFX12W64-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
1696; GFX12W64-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1697; GFX12W64-NEXT:    s_wait_alu 0xfffe
1698; GFX12W64-NEXT:    s_mul_i32 s2, s2, 5
1699; GFX12W64-NEXT:    s_wait_alu 0xfffe
1700; GFX12W64-NEXT:    v_mov_b32_e32 v1, s2
1701; GFX12W64-NEXT:    s_wait_kmcnt 0x0
1702; GFX12W64-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
1703; GFX12W64-NEXT:  .LBB5_2:
1704; GFX12W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1705; GFX12W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1706; GFX12W64-NEXT:    s_wait_loadcnt 0x0
1707; GFX12W64-NEXT:    v_readfirstlane_b32 s2, v1
1708; GFX12W64-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1709; GFX12W64-NEXT:    v_mov_b32_e32 v1, 0
1710; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1711; GFX12W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1712; GFX12W64-NEXT:    s_wait_kmcnt 0x0
1713; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
1714; GFX12W64-NEXT:    s_endpgm
1715;
1716; GFX12W32-LABEL: sub_i32_constant:
1717; GFX12W32:       ; %bb.0: ; %entry
1718; GFX12W32-NEXT:    s_mov_b32 s1, exec_lo
1719; GFX12W32-NEXT:    s_mov_b32 s0, exec_lo
1720; GFX12W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s1, 0
1721; GFX12W32-NEXT:    ; implicit-def: $vgpr1
1722; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1723; GFX12W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
1724; GFX12W32-NEXT:    s_cbranch_execz .LBB5_2
1725; GFX12W32-NEXT:  ; %bb.1:
1726; GFX12W32-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
1727; GFX12W32-NEXT:    s_wait_alu 0xfffe
1728; GFX12W32-NEXT:    s_bcnt1_i32_b32 s1, s1
1729; GFX12W32-NEXT:    s_wait_alu 0xfffe
1730; GFX12W32-NEXT:    s_mul_i32 s1, s1, 5
1731; GFX12W32-NEXT:    s_wait_alu 0xfffe
1732; GFX12W32-NEXT:    v_mov_b32_e32 v1, s1
1733; GFX12W32-NEXT:    s_wait_kmcnt 0x0
1734; GFX12W32-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
1735; GFX12W32-NEXT:  .LBB5_2:
1736; GFX12W32-NEXT:    s_wait_alu 0xfffe
1737; GFX12W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1738; GFX12W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1739; GFX12W32-NEXT:    s_wait_loadcnt 0x0
1740; GFX12W32-NEXT:    v_readfirstlane_b32 s2, v1
1741; GFX12W32-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1742; GFX12W32-NEXT:    v_mov_b32_e32 v1, 0
1743; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1744; GFX12W32-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1745; GFX12W32-NEXT:    s_wait_kmcnt 0x0
1746; GFX12W32-NEXT:    global_store_b32 v1, v0, s[0:1]
1747; GFX12W32-NEXT:    s_endpgm
1748entry:
  ; Every operand of the atomic is a compile-time constant, so all lanes
  ; request the same subtraction.
1749  %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.sub(i32 5, ptr addrspace(8) %inout, i32 0, i32 0, i32 0)
1750  store i32 %old, ptr addrspace(1) %out
1751  ret void
1752}
1753
; Raw buffer atomic sub whose data operand is the kernel argument %subitive
; (the same value in every lane); all offset operands are zero.
; Shape of the expected code on every target, as pinned by the assertions:
;   * %subitive is loaded into an SGPR and v_mbcnt over a copy of the exec
;     mask yields a per-lane count in v0; only the lane(s) where that count
;     is 0 enter the atomic block.
;   * Inside the block a single buffer_atomic_sub is issued whose operand is
;     %subitive * popcount(exec copy) (s_bcnt1 followed by s_mul_i32).
;   * After exec is restored, every lane computes
;     readfirstlane(returned value) - %subitive * (its mbcnt count) using
;     v_mul_lo_u32 and stores the result through %out.
; The assertion lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing them by hand.
1754define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(8) %inout, i32 %subitive) {
1755; GFX6-LABEL: sub_i32_uniform:
1756; GFX6:       ; %bb.0: ; %entry
1757; GFX6-NEXT:    s_mov_b64 s[2:3], exec
1758; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x11
1759; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1760; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
1761; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1762; GFX6-NEXT:    ; implicit-def: $vgpr1
1763; GFX6-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1764; GFX6-NEXT:    s_cbranch_execz .LBB6_2
1765; GFX6-NEXT:  ; %bb.1:
1766; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
1767; GFX6-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1768; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1769; GFX6-NEXT:    s_mul_i32 s2, s6, s2
1770; GFX6-NEXT:    v_mov_b32_e32 v1, s2
1771; GFX6-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1772; GFX6-NEXT:  .LBB6_2:
1773; GFX6-NEXT:    s_or_b64 exec, exec, s[0:1]
1774; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1775; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1776; GFX6-NEXT:    s_mov_b32 s2, -1
1777; GFX6-NEXT:    s_waitcnt vmcnt(0)
1778; GFX6-NEXT:    v_readfirstlane_b32 s4, v1
1779; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1780; GFX6-NEXT:    v_mul_lo_u32 v0, s6, v0
1781; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
1782; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1783; GFX6-NEXT:    s_endpgm
1784;
1785; GFX8-LABEL: sub_i32_uniform:
1786; GFX8:       ; %bb.0: ; %entry
1787; GFX8-NEXT:    s_load_dword s6, s[4:5], 0x44
1788; GFX8-NEXT:    s_mov_b64 s[2:3], exec
1789; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1790; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1791; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1792; GFX8-NEXT:    ; implicit-def: $vgpr1
1793; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1794; GFX8-NEXT:    s_cbranch_execz .LBB6_2
1795; GFX8-NEXT:  ; %bb.1:
1796; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
1797; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1798; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1799; GFX8-NEXT:    s_mul_i32 s2, s6, s2
1800; GFX8-NEXT:    v_mov_b32_e32 v1, s2
1801; GFX8-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1802; GFX8-NEXT:  .LBB6_2:
1803; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
1804; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1805; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1806; GFX8-NEXT:    v_mul_lo_u32 v0, s6, v0
1807; GFX8-NEXT:    s_waitcnt vmcnt(0)
1808; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
1809; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s2, v0
1810; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1811; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1812; GFX8-NEXT:    flat_store_dword v[0:1], v2
1813; GFX8-NEXT:    s_endpgm
1814;
1815; GFX9-LABEL: sub_i32_uniform:
1816; GFX9:       ; %bb.0: ; %entry
1817; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x44
1818; GFX9-NEXT:    s_mov_b64 s[2:3], exec
1819; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1820; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1821; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1822; GFX9-NEXT:    ; implicit-def: $vgpr1
1823; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1824; GFX9-NEXT:    s_cbranch_execz .LBB6_2
1825; GFX9-NEXT:  ; %bb.1:
1826; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
1827; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1828; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1829; GFX9-NEXT:    s_mul_i32 s2, s6, s2
1830; GFX9-NEXT:    v_mov_b32_e32 v1, s2
1831; GFX9-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1832; GFX9-NEXT:  .LBB6_2:
1833; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
1834; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1835; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1836; GFX9-NEXT:    v_mul_lo_u32 v0, s6, v0
1837; GFX9-NEXT:    s_waitcnt vmcnt(0)
1838; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
1839; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1840; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
1841; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
1842; GFX9-NEXT:    s_endpgm
1843;
1844; GFX10W64-LABEL: sub_i32_uniform:
1845; GFX10W64:       ; %bb.0: ; %entry
1846; GFX10W64-NEXT:    s_load_dword s6, s[4:5], 0x44
1847; GFX10W64-NEXT:    s_mov_b64 s[2:3], exec
1848; GFX10W64-NEXT:    ; implicit-def: $vgpr1
1849; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1850; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1851; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1852; GFX10W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1853; GFX10W64-NEXT:    s_cbranch_execz .LBB6_2
1854; GFX10W64-NEXT:  ; %bb.1:
1855; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
1856; GFX10W64-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1857; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1858; GFX10W64-NEXT:    s_mul_i32 s2, s6, s2
1859; GFX10W64-NEXT:    v_mov_b32_e32 v1, s2
1860; GFX10W64-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1861; GFX10W64-NEXT:  .LBB6_2:
1862; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
1863; GFX10W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1864; GFX10W64-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1865; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1866; GFX10W64-NEXT:    v_mul_lo_u32 v0, s6, v0
1867; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
1868; GFX10W64-NEXT:    v_readfirstlane_b32 s2, v1
1869; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
1870; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1871; GFX10W64-NEXT:    global_store_dword v1, v0, s[0:1]
1872; GFX10W64-NEXT:    s_endpgm
1873;
1874; GFX10W32-LABEL: sub_i32_uniform:
1875; GFX10W32:       ; %bb.0: ; %entry
1876; GFX10W32-NEXT:    s_load_dword s0, s[4:5], 0x44
1877; GFX10W32-NEXT:    s_mov_b32 s2, exec_lo
1878; GFX10W32-NEXT:    ; implicit-def: $vgpr1
1879; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1880; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1881; GFX10W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
1882; GFX10W32-NEXT:    s_cbranch_execz .LBB6_2
1883; GFX10W32-NEXT:  ; %bb.1:
1884; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
1885; GFX10W32-NEXT:    s_bcnt1_i32_b32 s2, s2
1886; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1887; GFX10W32-NEXT:    s_mul_i32 s2, s0, s2
1888; GFX10W32-NEXT:    v_mov_b32_e32 v1, s2
1889; GFX10W32-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1890; GFX10W32-NEXT:  .LBB6_2:
1891; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
1892; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
1893; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
1894; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1895; GFX10W32-NEXT:    v_mul_lo_u32 v0, s0, v0
1896; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
1897; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
1898; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
1899; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1900; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
1901; GFX10W32-NEXT:    s_endpgm
1902;
1903; GFX11W64-LABEL: sub_i32_uniform:
1904; GFX11W64:       ; %bb.0: ; %entry
1905; GFX11W64-NEXT:    s_load_b32 s6, s[4:5], 0x44
1906; GFX11W64-NEXT:    s_mov_b64 s[2:3], exec
1907; GFX11W64-NEXT:    s_mov_b64 s[0:1], exec
1908; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1909; GFX11W64-NEXT:    ; implicit-def: $vgpr1
1910; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1911; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1912; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
1913; GFX11W64-NEXT:    s_cbranch_execz .LBB6_2
1914; GFX11W64-NEXT:  ; %bb.1:
1915; GFX11W64-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
1916; GFX11W64-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1917; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
1918; GFX11W64-NEXT:    s_mul_i32 s2, s6, s2
1919; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1920; GFX11W64-NEXT:    v_mov_b32_e32 v1, s2
1921; GFX11W64-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
1922; GFX11W64-NEXT:  .LBB6_2:
1923; GFX11W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1924; GFX11W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1925; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
1926; GFX11W64-NEXT:    v_mul_lo_u32 v0, s6, v0
1927; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
1928; GFX11W64-NEXT:    v_readfirstlane_b32 s2, v1
1929; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
1930; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1931; GFX11W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1932; GFX11W64-NEXT:    global_store_b32 v1, v0, s[0:1]
1933; GFX11W64-NEXT:    s_endpgm
1934;
1935; GFX11W32-LABEL: sub_i32_uniform:
1936; GFX11W32:       ; %bb.0: ; %entry
1937; GFX11W32-NEXT:    s_load_b32 s0, s[4:5], 0x44
1938; GFX11W32-NEXT:    s_mov_b32 s2, exec_lo
1939; GFX11W32-NEXT:    s_mov_b32 s1, exec_lo
1940; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1941; GFX11W32-NEXT:    ; implicit-def: $vgpr1
1942; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1943; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
1944; GFX11W32-NEXT:    s_cbranch_execz .LBB6_2
1945; GFX11W32-NEXT:  ; %bb.1:
1946; GFX11W32-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
1947; GFX11W32-NEXT:    s_bcnt1_i32_b32 s2, s2
1948; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
1949; GFX11W32-NEXT:    s_mul_i32 s2, s0, s2
1950; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1951; GFX11W32-NEXT:    v_mov_b32_e32 v1, s2
1952; GFX11W32-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
1953; GFX11W32-NEXT:  .LBB6_2:
1954; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
1955; GFX11W32-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
1956; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
1957; GFX11W32-NEXT:    v_mul_lo_u32 v0, s0, v0
1958; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
1959; GFX11W32-NEXT:    v_readfirstlane_b32 s0, v1
1960; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
1961; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1962; GFX11W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1963; GFX11W32-NEXT:    global_store_b32 v1, v0, s[2:3]
1964; GFX11W32-NEXT:    s_endpgm
1965;
1966; GFX12W64-LABEL: sub_i32_uniform:
1967; GFX12W64:       ; %bb.0: ; %entry
1968; GFX12W64-NEXT:    s_load_b32 s6, s[4:5], 0x44
1969; GFX12W64-NEXT:    s_mov_b64 s[2:3], exec
1970; GFX12W64-NEXT:    s_mov_b64 s[0:1], exec
1971; GFX12W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1972; GFX12W64-NEXT:    ; implicit-def: $vgpr1
1973; GFX12W64-NEXT:    s_wait_alu 0xfffe
1974; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1975; GFX12W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1976; GFX12W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
1977; GFX12W64-NEXT:    s_cbranch_execz .LBB6_2
1978; GFX12W64-NEXT:  ; %bb.1:
1979; GFX12W64-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
1980; GFX12W64-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1981; GFX12W64-NEXT:    s_wait_kmcnt 0x0
1982; GFX12W64-NEXT:    s_wait_alu 0xfffe
1983; GFX12W64-NEXT:    s_mul_i32 s2, s6, s2
1984; GFX12W64-NEXT:    s_wait_alu 0xfffe
1985; GFX12W64-NEXT:    v_mov_b32_e32 v1, s2
1986; GFX12W64-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
1987; GFX12W64-NEXT:  .LBB6_2:
1988; GFX12W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1989; GFX12W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1990; GFX12W64-NEXT:    s_wait_kmcnt 0x0
1991; GFX12W64-NEXT:    v_mul_lo_u32 v0, s6, v0
1992; GFX12W64-NEXT:    s_wait_loadcnt 0x0
1993; GFX12W64-NEXT:    v_readfirstlane_b32 s2, v1
1994; GFX12W64-NEXT:    v_mov_b32_e32 v1, 0
1995; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1996; GFX12W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1997; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
1998; GFX12W64-NEXT:    s_endpgm
1999;
2000; GFX12W32-LABEL: sub_i32_uniform:
2001; GFX12W32:       ; %bb.0: ; %entry
2002; GFX12W32-NEXT:    s_load_b32 s0, s[4:5], 0x44
2003; GFX12W32-NEXT:    s_mov_b32 s2, exec_lo
2004; GFX12W32-NEXT:    s_mov_b32 s1, exec_lo
2005; GFX12W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
2006; GFX12W32-NEXT:    ; implicit-def: $vgpr1
2007; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2008; GFX12W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
2009; GFX12W32-NEXT:    s_cbranch_execz .LBB6_2
2010; GFX12W32-NEXT:  ; %bb.1:
2011; GFX12W32-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
2012; GFX12W32-NEXT:    s_wait_alu 0xfffe
2013; GFX12W32-NEXT:    s_bcnt1_i32_b32 s2, s2
2014; GFX12W32-NEXT:    s_wait_kmcnt 0x0
2015; GFX12W32-NEXT:    s_wait_alu 0xfffe
2016; GFX12W32-NEXT:    s_mul_i32 s2, s0, s2
2017; GFX12W32-NEXT:    s_wait_alu 0xfffe
2018; GFX12W32-NEXT:    v_mov_b32_e32 v1, s2
2019; GFX12W32-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
2020; GFX12W32-NEXT:  .LBB6_2:
2021; GFX12W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
2022; GFX12W32-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
2023; GFX12W32-NEXT:    s_wait_kmcnt 0x0
2024; GFX12W32-NEXT:    v_mul_lo_u32 v0, s0, v0
2025; GFX12W32-NEXT:    s_wait_loadcnt 0x0
2026; GFX12W32-NEXT:    v_readfirstlane_b32 s0, v1
2027; GFX12W32-NEXT:    v_mov_b32_e32 v1, 0
2028; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2029; GFX12W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
2030; GFX12W32-NEXT:    global_store_b32 v1, v0, s[2:3]
2031; GFX12W32-NEXT:    s_endpgm
2032entry:
  ; The data operand comes from a kernel argument, so it is only known at run
  ; time but is identical across lanes.
2033  %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.sub(i32 %subitive, ptr addrspace(8) %inout, i32 0, i32 0, i32 0)
2034  store i32 %old, ptr addrspace(1) %out
2035  ret void
2036}
2037
; Test kernel: the value subtracted by the raw ptr buffer atomic sub is the
; per-workitem id (%lane), while the remaining i32 operands are constant zero.
; The value returned by the atomic is stored to %out.
; The expected assembly below accumulates the per-lane values in a scalar
; "ComputeLoop" (v_readlane / v_writelane / scalar add), issues a single
; buffer atomic sub under an exec mask derived from v_mbcnt + s_and_saveexec,
; and then rebuilds each lane's result with v_readfirstlane and a vector
; subtract of the value written by v_writelane.
2038define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
2039; GFX6-LABEL: sub_i32_varying_vdata:
2040; GFX6:       ; %bb.0: ; %entry
2041; GFX6-NEXT:    s_mov_b64 s[0:1], exec
2042; GFX6-NEXT:    s_mov_b32 s2, 0
2043; GFX6-NEXT:    ; implicit-def: $vgpr1
2044; GFX6-NEXT:  .LBB7_1: ; %ComputeLoop
2045; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
2046; GFX6-NEXT:    s_ff1_i32_b64 s3, s[0:1]
2047; GFX6-NEXT:    s_mov_b32 m0, s3
2048; GFX6-NEXT:    v_readlane_b32 s8, v0, s3
2049; GFX6-NEXT:    v_writelane_b32 v1, s2, m0
2050; GFX6-NEXT:    s_lshl_b64 s[6:7], 1, s3
2051; GFX6-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
2052; GFX6-NEXT:    v_cmp_ne_u64_e64 s[6:7], s[0:1], 0
2053; GFX6-NEXT:    s_and_b64 vcc, exec, s[6:7]
2054; GFX6-NEXT:    s_add_i32 s2, s2, s8
2055; GFX6-NEXT:    s_cbranch_vccnz .LBB7_1
2056; GFX6-NEXT:  ; %bb.2: ; %ComputeEnd
2057; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
2058; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
2059; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2060; GFX6-NEXT:    ; implicit-def: $vgpr0
2061; GFX6-NEXT:    s_and_saveexec_b64 s[0:1], vcc
2062; GFX6-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
2063; GFX6-NEXT:    s_cbranch_execz .LBB7_4
2064; GFX6-NEXT:  ; %bb.3:
2065; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
2066; GFX6-NEXT:    v_mov_b32_e32 v0, s2
2067; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2068; GFX6-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
2069; GFX6-NEXT:  .LBB7_4:
2070; GFX6-NEXT:    s_or_b64 exec, exec, s[0:1]
2071; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
2072; GFX6-NEXT:    s_mov_b32 s3, 0xf000
2073; GFX6-NEXT:    s_mov_b32 s2, -1
2074; GFX6-NEXT:    s_waitcnt vmcnt(0)
2075; GFX6-NEXT:    v_readfirstlane_b32 s4, v0
2076; GFX6-NEXT:    s_waitcnt expcnt(0)
2077; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v1
2078; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2079; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2080; GFX6-NEXT:    s_endpgm
2081;
2082; GFX8-LABEL: sub_i32_varying_vdata:
2083; GFX8:       ; %bb.0: ; %entry
2084; GFX8-NEXT:    s_mov_b64 s[0:1], exec
2085; GFX8-NEXT:    s_mov_b32 s2, 0
2086; GFX8-NEXT:    ; implicit-def: $vgpr1
2087; GFX8-NEXT:  .LBB7_1: ; %ComputeLoop
2088; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
2089; GFX8-NEXT:    s_ff1_i32_b64 s3, s[0:1]
2090; GFX8-NEXT:    s_mov_b32 m0, s3
2091; GFX8-NEXT:    v_readlane_b32 s8, v0, s3
2092; GFX8-NEXT:    s_lshl_b64 s[6:7], 1, s3
2093; GFX8-NEXT:    v_writelane_b32 v1, s2, m0
2094; GFX8-NEXT:    s_add_i32 s2, s2, s8
2095; GFX8-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
2096; GFX8-NEXT:    s_cmp_lg_u64 s[0:1], 0
2097; GFX8-NEXT:    s_cbranch_scc1 .LBB7_1
2098; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
2099; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2100; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2101; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2102; GFX8-NEXT:    ; implicit-def: $vgpr0
2103; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
2104; GFX8-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
2105; GFX8-NEXT:    s_cbranch_execz .LBB7_4
2106; GFX8-NEXT:  ; %bb.3:
2107; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
2108; GFX8-NEXT:    v_mov_b32_e32 v0, s2
2109; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2110; GFX8-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
2111; GFX8-NEXT:  .LBB7_4:
2112; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
2113; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2114; GFX8-NEXT:    s_waitcnt vmcnt(0)
2115; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
2116; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s2, v1
2117; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2118; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2119; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2120; GFX8-NEXT:    flat_store_dword v[0:1], v2
2121; GFX8-NEXT:    s_endpgm
2122;
2123; GFX9-LABEL: sub_i32_varying_vdata:
2124; GFX9:       ; %bb.0: ; %entry
2125; GFX9-NEXT:    s_mov_b64 s[0:1], exec
2126; GFX9-NEXT:    s_mov_b32 s2, 0
2127; GFX9-NEXT:    ; implicit-def: $vgpr1
2128; GFX9-NEXT:  .LBB7_1: ; %ComputeLoop
2129; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
2130; GFX9-NEXT:    s_ff1_i32_b64 s3, s[0:1]
2131; GFX9-NEXT:    s_mov_b32 m0, s3
2132; GFX9-NEXT:    v_readlane_b32 s8, v0, s3
2133; GFX9-NEXT:    s_lshl_b64 s[6:7], 1, s3
2134; GFX9-NEXT:    v_writelane_b32 v1, s2, m0
2135; GFX9-NEXT:    s_add_i32 s2, s2, s8
2136; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
2137; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
2138; GFX9-NEXT:    s_cbranch_scc1 .LBB7_1
2139; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
2140; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2141; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2142; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2143; GFX9-NEXT:    ; implicit-def: $vgpr0
2144; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
2145; GFX9-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
2146; GFX9-NEXT:    s_cbranch_execz .LBB7_4
2147; GFX9-NEXT:  ; %bb.3:
2148; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
2149; GFX9-NEXT:    v_mov_b32_e32 v0, s2
2150; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2151; GFX9-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
2152; GFX9-NEXT:  .LBB7_4:
2153; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
2154; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2155; GFX9-NEXT:    s_waitcnt vmcnt(0)
2156; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
2157; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2158; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v1
2159; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2160; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
2161; GFX9-NEXT:    s_endpgm
2162;
2163; GFX10W64-LABEL: sub_i32_varying_vdata:
2164; GFX10W64:       ; %bb.0: ; %entry
2165; GFX10W64-NEXT:    s_mov_b64 s[0:1], exec
2166; GFX10W64-NEXT:    s_mov_b32 s2, 0
2167; GFX10W64-NEXT:    ; implicit-def: $vgpr1
2168; GFX10W64-NEXT:  .LBB7_1: ; %ComputeLoop
2169; GFX10W64-NEXT:    ; =>This Inner Loop Header: Depth=1
2170; GFX10W64-NEXT:    s_ff1_i32_b64 s3, s[0:1]
2171; GFX10W64-NEXT:    v_readlane_b32 s8, v0, s3
2172; GFX10W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
2173; GFX10W64-NEXT:    v_writelane_b32 v1, s2, s3
2174; GFX10W64-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
2175; GFX10W64-NEXT:    s_add_i32 s2, s2, s8
2176; GFX10W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
2177; GFX10W64-NEXT:    s_cbranch_scc1 .LBB7_1
2178; GFX10W64-NEXT:  ; %bb.2: ; %ComputeEnd
2179; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2180; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2181; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2182; GFX10W64-NEXT:    ; implicit-def: $vgpr0
2183; GFX10W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
2184; GFX10W64-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
2185; GFX10W64-NEXT:    s_cbranch_execz .LBB7_4
2186; GFX10W64-NEXT:  ; %bb.3:
2187; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
2188; GFX10W64-NEXT:    v_mov_b32_e32 v0, s2
2189; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
2190; GFX10W64-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
2191; GFX10W64-NEXT:  .LBB7_4:
2192; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
2193; GFX10W64-NEXT:    s_or_b64 exec, exec, s[0:1]
2194; GFX10W64-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2195; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
2196; GFX10W64-NEXT:    v_readfirstlane_b32 s2, v0
2197; GFX10W64-NEXT:    v_mov_b32_e32 v0, 0
2198; GFX10W64-NEXT:    v_sub_nc_u32_e32 v1, s2, v1
2199; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
2200; GFX10W64-NEXT:    global_store_dword v0, v1, s[0:1]
2201; GFX10W64-NEXT:    s_endpgm
2202;
2203; GFX10W32-LABEL: sub_i32_varying_vdata:
2204; GFX10W32:       ; %bb.0: ; %entry
2205; GFX10W32-NEXT:    s_mov_b32 s1, exec_lo
2206; GFX10W32-NEXT:    s_mov_b32 s0, 0
2207; GFX10W32-NEXT:    ; implicit-def: $vgpr1
2208; GFX10W32-NEXT:  .LBB7_1: ; %ComputeLoop
2209; GFX10W32-NEXT:    ; =>This Inner Loop Header: Depth=1
2210; GFX10W32-NEXT:    s_ff1_i32_b32 s2, s1
2211; GFX10W32-NEXT:    v_readlane_b32 s3, v0, s2
2212; GFX10W32-NEXT:    s_lshl_b32 s6, 1, s2
2213; GFX10W32-NEXT:    v_writelane_b32 v1, s0, s2
2214; GFX10W32-NEXT:    s_andn2_b32 s1, s1, s6
2215; GFX10W32-NEXT:    s_add_i32 s0, s0, s3
2216; GFX10W32-NEXT:    s_cmp_lg_u32 s1, 0
2217; GFX10W32-NEXT:    s_cbranch_scc1 .LBB7_1
2218; GFX10W32-NEXT:  ; %bb.2: ; %ComputeEnd
2219; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2220; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2221; GFX10W32-NEXT:    ; implicit-def: $vgpr0
2222; GFX10W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
2223; GFX10W32-NEXT:    s_xor_b32 s1, exec_lo, s1
2224; GFX10W32-NEXT:    s_cbranch_execz .LBB7_4
2225; GFX10W32-NEXT:  ; %bb.3:
2226; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
2227; GFX10W32-NEXT:    v_mov_b32_e32 v0, s0
2228; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
2229; GFX10W32-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
2230; GFX10W32-NEXT:  .LBB7_4:
2231; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
2232; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
2233; GFX10W32-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2234; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
2235; GFX10W32-NEXT:    v_readfirstlane_b32 s2, v0
2236; GFX10W32-NEXT:    v_mov_b32_e32 v0, 0
2237; GFX10W32-NEXT:    v_sub_nc_u32_e32 v1, s2, v1
2238; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
2239; GFX10W32-NEXT:    global_store_dword v0, v1, s[0:1]
2240; GFX10W32-NEXT:    s_endpgm
2241;
2242; GFX11W64-LABEL: sub_i32_varying_vdata:
2243; GFX11W64:       ; %bb.0: ; %entry
2244; GFX11W64-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
2245; GFX11W64-NEXT:    s_mov_b64 s[0:1], exec
2246; GFX11W64-NEXT:    s_mov_b32 s2, 0
2247; GFX11W64-NEXT:    ; implicit-def: $vgpr0
2248; GFX11W64-NEXT:  .LBB7_1: ; %ComputeLoop
2249; GFX11W64-NEXT:    ; =>This Inner Loop Header: Depth=1
2250; GFX11W64-NEXT:    s_ctz_i32_b64 s3, s[0:1]
2251; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
2252; GFX11W64-NEXT:    v_readlane_b32 s8, v1, s3
2253; GFX11W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
2254; GFX11W64-NEXT:    v_writelane_b32 v0, s2, s3
2255; GFX11W64-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
2256; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2257; GFX11W64-NEXT:    s_add_i32 s2, s2, s8
2258; GFX11W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
2259; GFX11W64-NEXT:    s_cbranch_scc1 .LBB7_1
2260; GFX11W64-NEXT:  ; %bb.2: ; %ComputeEnd
2261; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
2262; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2263; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
2264; GFX11W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
2265; GFX11W64-NEXT:    ; implicit-def: $vgpr1
2266; GFX11W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
2267; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2268; GFX11W64-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
2269; GFX11W64-NEXT:    s_cbranch_execz .LBB7_4
2270; GFX11W64-NEXT:  ; %bb.3:
2271; GFX11W64-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
2272; GFX11W64-NEXT:    v_mov_b32_e32 v1, s2
2273; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
2274; GFX11W64-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
2275; GFX11W64-NEXT:  .LBB7_4:
2276; GFX11W64-NEXT:    s_or_b64 exec, exec, s[0:1]
2277; GFX11W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2278; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
2279; GFX11W64-NEXT:    v_readfirstlane_b32 s2, v1
2280; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
2281; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2282; GFX11W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
2283; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
2284; GFX11W64-NEXT:    global_store_b32 v1, v0, s[0:1]
2285; GFX11W64-NEXT:    s_endpgm
2286;
2287; GFX11W32-LABEL: sub_i32_varying_vdata:
2288; GFX11W32:       ; %bb.0: ; %entry
2289; GFX11W32-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
2290; GFX11W32-NEXT:    s_mov_b32 s1, exec_lo
2291; GFX11W32-NEXT:    s_mov_b32 s0, 0
2292; GFX11W32-NEXT:    ; implicit-def: $vgpr0
2293; GFX11W32-NEXT:  .LBB7_1: ; %ComputeLoop
2294; GFX11W32-NEXT:    ; =>This Inner Loop Header: Depth=1
2295; GFX11W32-NEXT:    s_ctz_i32_b32 s2, s1
2296; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
2297; GFX11W32-NEXT:    v_readlane_b32 s3, v1, s2
2298; GFX11W32-NEXT:    s_lshl_b32 s6, 1, s2
2299; GFX11W32-NEXT:    v_writelane_b32 v0, s0, s2
2300; GFX11W32-NEXT:    s_and_not1_b32 s1, s1, s6
2301; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2302; GFX11W32-NEXT:    s_add_i32 s0, s0, s3
2303; GFX11W32-NEXT:    s_cmp_lg_u32 s1, 0
2304; GFX11W32-NEXT:    s_cbranch_scc1 .LBB7_1
2305; GFX11W32-NEXT:  ; %bb.2: ; %ComputeEnd
2306; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
2307; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
2308; GFX11W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
2309; GFX11W32-NEXT:    ; implicit-def: $vgpr1
2310; GFX11W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
2311; GFX11W32-NEXT:    s_xor_b32 s1, exec_lo, s1
2312; GFX11W32-NEXT:    s_cbranch_execz .LBB7_4
2313; GFX11W32-NEXT:  ; %bb.3:
2314; GFX11W32-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
2315; GFX11W32-NEXT:    v_mov_b32_e32 v1, s0
2316; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
2317; GFX11W32-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
2318; GFX11W32-NEXT:  .LBB7_4:
2319; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
2320; GFX11W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2321; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
2322; GFX11W32-NEXT:    v_readfirstlane_b32 s2, v1
2323; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
2324; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2325; GFX11W32-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
2326; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
2327; GFX11W32-NEXT:    global_store_b32 v1, v0, s[0:1]
2328; GFX11W32-NEXT:    s_endpgm
2329;
2330; GFX12W64-LABEL: sub_i32_varying_vdata:
2331; GFX12W64:       ; %bb.0: ; %entry
2332; GFX12W64-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
2333; GFX12W64-NEXT:    s_mov_b64 s[0:1], exec
2334; GFX12W64-NEXT:    s_mov_b32 s2, 0
2335; GFX12W64-NEXT:    ; implicit-def: $vgpr0
2336; GFX12W64-NEXT:  .LBB7_1: ; %ComputeLoop
2337; GFX12W64-NEXT:    ; =>This Inner Loop Header: Depth=1
2338; GFX12W64-NEXT:    s_ctz_i32_b64 s3, s[0:1]
2339; GFX12W64-NEXT:    s_wait_alu 0xfffe
2340; GFX12W64-NEXT:    v_readlane_b32 s8, v1, s3
2341; GFX12W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
2342; GFX12W64-NEXT:    v_writelane_b32 v0, s2, s3
2343; GFX12W64-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
2344; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2345; GFX12W64-NEXT:    s_add_co_i32 s2, s2, s8
2346; GFX12W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
2347; GFX12W64-NEXT:    s_cbranch_scc1 .LBB7_1
2348; GFX12W64-NEXT:  ; %bb.2: ; %ComputeEnd
2349; GFX12W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
2350; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2351; GFX12W64-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
2352; GFX12W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
2353; GFX12W64-NEXT:    ; implicit-def: $vgpr1
2354; GFX12W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
2355; GFX12W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2356; GFX12W64-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
2357; GFX12W64-NEXT:    s_cbranch_execz .LBB7_4
2358; GFX12W64-NEXT:  ; %bb.3:
2359; GFX12W64-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
2360; GFX12W64-NEXT:    s_wait_alu 0xfffe
2361; GFX12W64-NEXT:    v_mov_b32_e32 v1, s2
2362; GFX12W64-NEXT:    s_wait_kmcnt 0x0
2363; GFX12W64-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
2364; GFX12W64-NEXT:  .LBB7_4:
2365; GFX12W64-NEXT:    s_or_b64 exec, exec, s[0:1]
2366; GFX12W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2367; GFX12W64-NEXT:    s_wait_loadcnt 0x0
2368; GFX12W64-NEXT:    v_readfirstlane_b32 s2, v1
2369; GFX12W64-NEXT:    v_mov_b32_e32 v1, 0
2370; GFX12W64-NEXT:    s_wait_alu 0xfffe
2371; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2372; GFX12W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
2373; GFX12W64-NEXT:    s_wait_kmcnt 0x0
2374; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
2375; GFX12W64-NEXT:    s_endpgm
2376;
2377; GFX12W32-LABEL: sub_i32_varying_vdata:
2378; GFX12W32:       ; %bb.0: ; %entry
2379; GFX12W32-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
2380; GFX12W32-NEXT:    s_mov_b32 s1, exec_lo
2381; GFX12W32-NEXT:    s_mov_b32 s0, 0
2382; GFX12W32-NEXT:    ; implicit-def: $vgpr0
2383; GFX12W32-NEXT:  .LBB7_1: ; %ComputeLoop
2384; GFX12W32-NEXT:    ; =>This Inner Loop Header: Depth=1
2385; GFX12W32-NEXT:    s_wait_alu 0xfffe
2386; GFX12W32-NEXT:    s_ctz_i32_b32 s2, s1
2387; GFX12W32-NEXT:    s_wait_alu 0xfffe
2388; GFX12W32-NEXT:    v_readlane_b32 s3, v1, s2
2389; GFX12W32-NEXT:    s_lshl_b32 s6, 1, s2
2390; GFX12W32-NEXT:    v_writelane_b32 v0, s0, s2
2391; GFX12W32-NEXT:    s_and_not1_b32 s1, s1, s6
2392; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2393; GFX12W32-NEXT:    s_add_co_i32 s0, s0, s3
2394; GFX12W32-NEXT:    s_wait_alu 0xfffe
2395; GFX12W32-NEXT:    s_cmp_lg_u32 s1, 0
2396; GFX12W32-NEXT:    s_cbranch_scc1 .LBB7_1
2397; GFX12W32-NEXT:  ; %bb.2: ; %ComputeEnd
2398; GFX12W32-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
2399; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2400; GFX12W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
2401; GFX12W32-NEXT:    ; implicit-def: $vgpr1
2402; GFX12W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
2403; GFX12W32-NEXT:    s_wait_alu 0xfffe
2404; GFX12W32-NEXT:    s_xor_b32 s1, exec_lo, s1
2405; GFX12W32-NEXT:    s_cbranch_execz .LBB7_4
2406; GFX12W32-NEXT:  ; %bb.3:
2407; GFX12W32-NEXT:    s_load_b128 s[8:11], s[4:5], 0x34
2408; GFX12W32-NEXT:    v_mov_b32_e32 v1, s0
2409; GFX12W32-NEXT:    s_wait_kmcnt 0x0
2410; GFX12W32-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
2411; GFX12W32-NEXT:  .LBB7_4:
2412; GFX12W32-NEXT:    s_wait_alu 0xfffe
2413; GFX12W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
2414; GFX12W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2415; GFX12W32-NEXT:    s_wait_loadcnt 0x0
2416; GFX12W32-NEXT:    v_readfirstlane_b32 s2, v1
2417; GFX12W32-NEXT:    v_mov_b32_e32 v1, 0
2418; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2419; GFX12W32-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
2420; GFX12W32-NEXT:    s_wait_kmcnt 0x0
2421; GFX12W32-NEXT:    global_store_b32 v1, v0, s[0:1]
2422; GFX12W32-NEXT:    s_endpgm
2423entry:
2424  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2425  %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.sub(i32 %lane, ptr addrspace(8) %inout, i32 0, i32 0, i32 0)
2426  store i32 %old, ptr addrspace(1) %out
2427  ret void
2428}
2429
; Test kernel: the raw ptr buffer atomic sub uses a constant data operand (1)
; and passes the per-workitem id (%lane) as its third operand (the offset, per
; this test's name); the value returned by the atomic is stored to %out.
; The expected assembly below contains no ComputeLoop: every target issues the
; buffer atomic sub directly with `offen`, taking the offset from v0, and then
; stores the returned register unchanged.
2430define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
2431; GFX6-LABEL: sub_i32_varying_offset:
2432; GFX6:       ; %bb.0: ; %entry
2433; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xd
2434; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
2435; GFX6-NEXT:    v_mov_b32_e32 v1, 1
2436; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2437; GFX6-NEXT:    buffer_atomic_sub v1, v0, s[0:3], 0 offen glc
2438; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2439; GFX6-NEXT:    s_mov_b32 s6, -1
2440; GFX6-NEXT:    s_waitcnt vmcnt(0)
2441; GFX6-NEXT:    buffer_store_dword v1, off, s[4:7], 0
2442; GFX6-NEXT:    s_endpgm
2443;
2444; GFX8-LABEL: sub_i32_varying_offset:
2445; GFX8:       ; %bb.0: ; %entry
2446; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
2447; GFX8-NEXT:    v_mov_b32_e32 v2, 1
2448; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2449; GFX8-NEXT:    buffer_atomic_sub v2, v0, s[0:3], 0 offen glc
2450; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2451; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2452; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2453; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2454; GFX8-NEXT:    s_waitcnt vmcnt(0)
2455; GFX8-NEXT:    flat_store_dword v[0:1], v2
2456; GFX8-NEXT:    s_endpgm
2457;
2458; GFX9-LABEL: sub_i32_varying_offset:
2459; GFX9:       ; %bb.0: ; %entry
2460; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
2461; GFX9-NEXT:    v_mov_b32_e32 v1, 1
2462; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2463; GFX9-NEXT:    buffer_atomic_sub v1, v0, s[0:3], 0 offen glc
2464; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2465; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2466; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2467; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2468; GFX9-NEXT:    s_endpgm
2469;
2470; GFX10-LABEL: sub_i32_varying_offset:
2471; GFX10:       ; %bb.0: ; %entry
2472; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
2473; GFX10-NEXT:    v_mov_b32_e32 v1, 1
2474; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2475; GFX10-NEXT:    buffer_atomic_sub v1, v0, s[0:3], 0 offen glc
2476; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
2477; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2478; GFX10-NEXT:    v_mov_b32_e32 v0, 0
2479; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2480; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
2481; GFX10-NEXT:    s_endpgm
2482;
2483; GFX11W64-LABEL: sub_i32_varying_offset:
2484; GFX11W64:       ; %bb.0: ; %entry
2485; GFX11W64-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
2486; GFX11W64-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2487; GFX11W64-NEXT:    v_mov_b32_e32 v1, 1
2488; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
2489; GFX11W64-NEXT:    buffer_atomic_sub_u32 v1, v0, s[0:3], 0 offen glc
2490; GFX11W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2491; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
2492; GFX11W64-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2493; GFX11W64-NEXT:    global_store_b32 v0, v1, s[0:1]
2494; GFX11W64-NEXT:    s_endpgm
2495;
2496; GFX11W32-LABEL: sub_i32_varying_offset:
2497; GFX11W32:       ; %bb.0: ; %entry
2498; GFX11W32-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
2499; GFX11W32-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
2500; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
2501; GFX11W32-NEXT:    buffer_atomic_sub_u32 v1, v0, s[0:3], 0 offen glc
2502; GFX11W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2503; GFX11W32-NEXT:    v_mov_b32_e32 v0, 0
2504; GFX11W32-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2505; GFX11W32-NEXT:    global_store_b32 v0, v1, s[0:1]
2506; GFX11W32-NEXT:    s_endpgm
2507;
2508; GFX12W64-LABEL: sub_i32_varying_offset:
2509; GFX12W64:       ; %bb.0: ; %entry
2510; GFX12W64-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
2511; GFX12W64-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2512; GFX12W64-NEXT:    v_mov_b32_e32 v1, 1
2513; GFX12W64-NEXT:    s_wait_kmcnt 0x0
2514; GFX12W64-NEXT:    buffer_atomic_sub_u32 v1, v0, s[0:3], null offen th:TH_ATOMIC_RETURN
2515; GFX12W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2516; GFX12W64-NEXT:    v_mov_b32_e32 v0, 0
2517; GFX12W64-NEXT:    s_wait_loadcnt 0x0
2518; GFX12W64-NEXT:    s_wait_kmcnt 0x0
2519; GFX12W64-NEXT:    global_store_b32 v0, v1, s[0:1]
2520; GFX12W64-NEXT:    s_endpgm
2521;
2522; GFX12W32-LABEL: sub_i32_varying_offset:
2523; GFX12W32:       ; %bb.0: ; %entry
2524; GFX12W32-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
2525; GFX12W32-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
2526; GFX12W32-NEXT:    s_wait_kmcnt 0x0
2527; GFX12W32-NEXT:    buffer_atomic_sub_u32 v1, v0, s[0:3], null offen th:TH_ATOMIC_RETURN
2528; GFX12W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2529; GFX12W32-NEXT:    v_mov_b32_e32 v0, 0
2530; GFX12W32-NEXT:    s_wait_loadcnt 0x0
2531; GFX12W32-NEXT:    s_wait_kmcnt 0x0
2532; GFX12W32-NEXT:    global_store_b32 v0, v1, s[0:1]
2533; GFX12W32-NEXT:    s_endpgm
2534entry:
2535  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2536  %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.sub(i32 1, ptr addrspace(8) %inout, i32 %lane, i32 0, i32 0)
2537  store i32 %old, ptr addrspace(1) %out
2538  ret void
2539}
2540;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
2541; GFX11: {{.*}}
2542; GFX12: {{.*}}
2543