xref: /llvm-project/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_ITERATIVE %s
3; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_ITERATIVE %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_ITERATIVE %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_ITERATIVE %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_ITERATIVE %s
7; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_ITERATIVE %s
8; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_ITERATIVE %s
9; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264_ITERATIVE %s
10; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232_ITERATIVE %s
11; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_DPP %s
12; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_DPP %s
13; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_DPP %s
14; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_DPP %s
15; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_DPP %s
16; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_DPP %s
17; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_DPP %s
18; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264_DPP %s
19; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232_DPP %s
20
21declare i32 @llvm.amdgcn.workitem.id.x()
22
23; Show what the atomic optimization pass will do for global pointers.
24
; add_i32_constant: atomically adds the constant 5 to the i32 at %inout
; (agent syncscope, acq_rel ordering) and stores the value returned by the
; atomic to %out.
;
; The expected code below is autogenerated; regenerate it with
; utils/update_llc_test_checks.py instead of editing it by hand. Per the
; checked output, only the lane whose mbcnt result is 0 issues the atomic, with
; an operand of popcount(exec) * 5, and every lane then rebuilds its own result
; as readfirstlane(atomic result) + mbcnt * 5.
; Only the per-target prefixes shared by the Iterative and DPP invocations are
; used for this function.
25define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace(1) %inout) {
26; GFX7LESS-LABEL: add_i32_constant:
27; GFX7LESS:       ; %bb.0: ; %entry
28; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
29; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
30; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
31; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
32; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
33; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
34; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
35; GFX7LESS-NEXT:    s_cbranch_execz .LBB0_2
36; GFX7LESS-NEXT:  ; %bb.1:
37; GFX7LESS-NEXT:    s_mov_b32 s11, 0xf000
38; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
39; GFX7LESS-NEXT:    s_mul_i32 s6, s6, 5
40; GFX7LESS-NEXT:    s_mov_b32 s10, -1
41; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
42; GFX7LESS-NEXT:    s_mov_b32 s8, s2
43; GFX7LESS-NEXT:    s_mov_b32 s9, s3
44; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
45; GFX7LESS-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
46; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
47; GFX7LESS-NEXT:    buffer_wbinvl1
48; GFX7LESS-NEXT:  .LBB0_2:
49; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
50; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
51; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
52; GFX7LESS-NEXT:    s_mov_b32 s2, -1
53; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
54; GFX7LESS-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
55; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
56; GFX7LESS-NEXT:    s_endpgm
57;
58; GFX8-LABEL: add_i32_constant:
59; GFX8:       ; %bb.0: ; %entry
60; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
61; GFX8-NEXT:    s_mov_b64 s[6:7], exec
62; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
63; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
64; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
65; GFX8-NEXT:    ; implicit-def: $vgpr1
66; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
67; GFX8-NEXT:    s_cbranch_execz .LBB0_2
68; GFX8-NEXT:  ; %bb.1:
69; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
70; GFX8-NEXT:    s_mov_b32 s8, s2
71; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
72; GFX8-NEXT:    s_mul_i32 s2, s2, 5
73; GFX8-NEXT:    s_mov_b32 s11, 0xf000
74; GFX8-NEXT:    s_mov_b32 s10, -1
75; GFX8-NEXT:    s_mov_b32 s9, s3
76; GFX8-NEXT:    v_mov_b32_e32 v1, s2
77; GFX8-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
78; GFX8-NEXT:    s_waitcnt vmcnt(0)
79; GFX8-NEXT:    buffer_wbinvl1_vol
80; GFX8-NEXT:  .LBB0_2:
81; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
82; GFX8-NEXT:    v_readfirstlane_b32 s4, v1
83; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
84; GFX8-NEXT:    s_mov_b32 s3, 0xf000
85; GFX8-NEXT:    s_mov_b32 s2, -1
86; GFX8-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
87; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
88; GFX8-NEXT:    s_endpgm
89;
90; GFX9-LABEL: add_i32_constant:
91; GFX9:       ; %bb.0: ; %entry
92; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
93; GFX9-NEXT:    s_mov_b64 s[6:7], exec
94; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
95; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
96; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
97; GFX9-NEXT:    ; implicit-def: $vgpr1
98; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
99; GFX9-NEXT:    s_cbranch_execz .LBB0_2
100; GFX9-NEXT:  ; %bb.1:
101; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
102; GFX9-NEXT:    s_mov_b32 s8, s2
103; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
104; GFX9-NEXT:    s_mul_i32 s2, s2, 5
105; GFX9-NEXT:    s_mov_b32 s11, 0xf000
106; GFX9-NEXT:    s_mov_b32 s10, -1
107; GFX9-NEXT:    s_mov_b32 s9, s3
108; GFX9-NEXT:    v_mov_b32_e32 v1, s2
109; GFX9-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
110; GFX9-NEXT:    s_waitcnt vmcnt(0)
111; GFX9-NEXT:    buffer_wbinvl1_vol
112; GFX9-NEXT:  .LBB0_2:
113; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
114; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
115; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
116; GFX9-NEXT:    s_mov_b32 s3, 0xf000
117; GFX9-NEXT:    s_mov_b32 s2, -1
118; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
119; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
120; GFX9-NEXT:    s_endpgm
121;
122; GFX1064-LABEL: add_i32_constant:
123; GFX1064:       ; %bb.0: ; %entry
124; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
125; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
126; GFX1064-NEXT:    ; implicit-def: $vgpr1
127; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
128; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
129; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
130; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
131; GFX1064-NEXT:    s_cbranch_execz .LBB0_2
132; GFX1064-NEXT:  ; %bb.1:
133; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
134; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
135; GFX1064-NEXT:    s_mul_i32 s6, s6, 5
136; GFX1064-NEXT:    s_mov_b32 s10, -1
137; GFX1064-NEXT:    v_mov_b32_e32 v1, s6
138; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
139; GFX1064-NEXT:    s_mov_b32 s8, s2
140; GFX1064-NEXT:    s_mov_b32 s9, s3
141; GFX1064-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
142; GFX1064-NEXT:    s_waitcnt vmcnt(0)
143; GFX1064-NEXT:    buffer_gl1_inv
144; GFX1064-NEXT:    buffer_gl0_inv
145; GFX1064-NEXT:  .LBB0_2:
146; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
147; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
148; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
149; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
150; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
151; GFX1064-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
152; GFX1064-NEXT:    s_mov_b32 s2, -1
153; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
154; GFX1064-NEXT:    s_endpgm
155;
156; GFX1032-LABEL: add_i32_constant:
157; GFX1032:       ; %bb.0: ; %entry
158; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
159; GFX1032-NEXT:    s_mov_b32 s6, exec_lo
160; GFX1032-NEXT:    ; implicit-def: $vgpr1
161; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
162; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
163; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
164; GFX1032-NEXT:    s_cbranch_execz .LBB0_2
165; GFX1032-NEXT:  ; %bb.1:
166; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s6
167; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
168; GFX1032-NEXT:    s_mul_i32 s5, s5, 5
169; GFX1032-NEXT:    s_mov_b32 s10, -1
170; GFX1032-NEXT:    v_mov_b32_e32 v1, s5
171; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
172; GFX1032-NEXT:    s_mov_b32 s8, s2
173; GFX1032-NEXT:    s_mov_b32 s9, s3
174; GFX1032-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
175; GFX1032-NEXT:    s_waitcnt vmcnt(0)
176; GFX1032-NEXT:    buffer_gl1_inv
177; GFX1032-NEXT:    buffer_gl0_inv
178; GFX1032-NEXT:  .LBB0_2:
179; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
180; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
181; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
182; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
183; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
184; GFX1032-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
185; GFX1032-NEXT:    s_mov_b32 s2, -1
186; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
187; GFX1032-NEXT:    s_endpgm
188;
189; GFX1164-LABEL: add_i32_constant:
190; GFX1164:       ; %bb.0: ; %entry
191; GFX1164-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
192; GFX1164-NEXT:    s_mov_b64 s[6:7], exec
193; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
194; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
195; GFX1164-NEXT:    ; implicit-def: $vgpr1
196; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
197; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
198; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
199; GFX1164-NEXT:    s_cbranch_execz .LBB0_2
200; GFX1164-NEXT:  ; %bb.1:
201; GFX1164-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
202; GFX1164-NEXT:    s_mov_b32 s11, 0x31016000
203; GFX1164-NEXT:    s_mul_i32 s6, s6, 5
204; GFX1164-NEXT:    s_mov_b32 s10, -1
205; GFX1164-NEXT:    v_mov_b32_e32 v1, s6
206; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
207; GFX1164-NEXT:    s_mov_b32 s8, s2
208; GFX1164-NEXT:    s_mov_b32 s9, s3
209; GFX1164-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
210; GFX1164-NEXT:    s_waitcnt vmcnt(0)
211; GFX1164-NEXT:    buffer_gl1_inv
212; GFX1164-NEXT:    buffer_gl0_inv
213; GFX1164-NEXT:  .LBB0_2:
214; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
215; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
216; GFX1164-NEXT:    v_readfirstlane_b32 s2, v1
217; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
218; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
219; GFX1164-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
220; GFX1164-NEXT:    s_mov_b32 s2, -1
221; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
222; GFX1164-NEXT:    s_endpgm
223;
224; GFX1132-LABEL: add_i32_constant:
225; GFX1132:       ; %bb.0: ; %entry
226; GFX1132-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
227; GFX1132-NEXT:    s_mov_b32 s6, exec_lo
228; GFX1132-NEXT:    s_mov_b32 s4, exec_lo
229; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
230; GFX1132-NEXT:    ; implicit-def: $vgpr1
231; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
232; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
233; GFX1132-NEXT:    s_cbranch_execz .LBB0_2
234; GFX1132-NEXT:  ; %bb.1:
235; GFX1132-NEXT:    s_bcnt1_i32_b32 s5, s6
236; GFX1132-NEXT:    s_mov_b32 s11, 0x31016000
237; GFX1132-NEXT:    s_mul_i32 s5, s5, 5
238; GFX1132-NEXT:    s_mov_b32 s10, -1
239; GFX1132-NEXT:    v_mov_b32_e32 v1, s5
240; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
241; GFX1132-NEXT:    s_mov_b32 s8, s2
242; GFX1132-NEXT:    s_mov_b32 s9, s3
243; GFX1132-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
244; GFX1132-NEXT:    s_waitcnt vmcnt(0)
245; GFX1132-NEXT:    buffer_gl1_inv
246; GFX1132-NEXT:    buffer_gl0_inv
247; GFX1132-NEXT:  .LBB0_2:
248; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s4
249; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
250; GFX1132-NEXT:    v_readfirstlane_b32 s2, v1
251; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
252; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
253; GFX1132-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
254; GFX1132-NEXT:    s_mov_b32 s2, -1
255; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
256; GFX1132-NEXT:    s_endpgm
257;
258; GFX1264-LABEL: add_i32_constant:
259; GFX1264:       ; %bb.0: ; %entry
260; GFX1264-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
261; GFX1264-NEXT:    s_mov_b64 s[6:7], exec
262; GFX1264-NEXT:    s_mov_b64 s[4:5], exec
263; GFX1264-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
264; GFX1264-NEXT:    ; implicit-def: $vgpr1
265; GFX1264-NEXT:    s_wait_alu 0xfffe
266; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
267; GFX1264-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
268; GFX1264-NEXT:    v_cmpx_eq_u32_e32 0, v0
269; GFX1264-NEXT:    s_cbranch_execz .LBB0_2
270; GFX1264-NEXT:  ; %bb.1:
271; GFX1264-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
272; GFX1264-NEXT:    s_mov_b32 s11, 0x31016000
273; GFX1264-NEXT:    s_wait_alu 0xfffe
274; GFX1264-NEXT:    s_mul_i32 s6, s6, 5
275; GFX1264-NEXT:    s_mov_b32 s10, -1
276; GFX1264-NEXT:    s_wait_alu 0xfffe
277; GFX1264-NEXT:    v_mov_b32_e32 v1, s6
278; GFX1264-NEXT:    s_wait_kmcnt 0x0
279; GFX1264-NEXT:    s_mov_b32 s8, s2
280; GFX1264-NEXT:    s_mov_b32 s9, s3
281; GFX1264-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
282; GFX1264-NEXT:    s_wait_loadcnt 0x0
283; GFX1264-NEXT:    global_inv scope:SCOPE_DEV
284; GFX1264-NEXT:  .LBB0_2:
285; GFX1264-NEXT:    s_or_b64 exec, exec, s[4:5]
286; GFX1264-NEXT:    s_wait_kmcnt 0x0
287; GFX1264-NEXT:    v_readfirstlane_b32 s2, v1
288; GFX1264-NEXT:    s_mov_b32 s3, 0x31016000
289; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1)
290; GFX1264-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
291; GFX1264-NEXT:    s_mov_b32 s2, -1
292; GFX1264-NEXT:    buffer_store_b32 v0, off, s[0:3], null
293; GFX1264-NEXT:    s_endpgm
294;
295; GFX1232-LABEL: add_i32_constant:
296; GFX1232:       ; %bb.0: ; %entry
297; GFX1232-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
298; GFX1232-NEXT:    s_mov_b32 s6, exec_lo
299; GFX1232-NEXT:    s_mov_b32 s4, exec_lo
300; GFX1232-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
301; GFX1232-NEXT:    ; implicit-def: $vgpr1
302; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1)
303; GFX1232-NEXT:    v_cmpx_eq_u32_e32 0, v0
304; GFX1232-NEXT:    s_cbranch_execz .LBB0_2
305; GFX1232-NEXT:  ; %bb.1:
306; GFX1232-NEXT:    s_bcnt1_i32_b32 s5, s6
307; GFX1232-NEXT:    s_mov_b32 s11, 0x31016000
308; GFX1232-NEXT:    s_mul_i32 s5, s5, 5
309; GFX1232-NEXT:    s_mov_b32 s10, -1
310; GFX1232-NEXT:    v_mov_b32_e32 v1, s5
311; GFX1232-NEXT:    s_wait_kmcnt 0x0
312; GFX1232-NEXT:    s_mov_b32 s8, s2
313; GFX1232-NEXT:    s_mov_b32 s9, s3
314; GFX1232-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
315; GFX1232-NEXT:    s_wait_loadcnt 0x0
316; GFX1232-NEXT:    global_inv scope:SCOPE_DEV
317; GFX1232-NEXT:  .LBB0_2:
318; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s4
319; GFX1232-NEXT:    s_wait_kmcnt 0x0
320; GFX1232-NEXT:    v_readfirstlane_b32 s2, v1
321; GFX1232-NEXT:    s_mov_b32 s3, 0x31016000
322; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1)
323; GFX1232-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
324; GFX1232-NEXT:    s_mov_b32 s2, -1
325; GFX1232-NEXT:    buffer_store_b32 v0, off, s[0:3], null
326; GFX1232-NEXT:    s_endpgm
327entry:
  ; atomicrmw yields the value that was in memory before the add.
328  %old = atomicrmw add ptr addrspace(1) %inout, i32 5 syncscope("agent") acq_rel
  ; Storing the returned value keeps the atomic's result in use, which is why
  ; the checked code uses the returning (glc / TH_ATOMIC_RETURN) atomic form.
329  store i32 %old, ptr addrspace(1) %out
330  ret void
331}
332
; add_i32_uniform: atomically adds the kernel argument %additive to the i32 at
; %inout (agent syncscope, acq_rel ordering) and stores the value returned by
; the atomic to %out.
;
; The expected code below is autogenerated; regenerate it with
; utils/update_llc_test_checks.py instead of editing it by hand. Per the
; checked output, %additive is fetched with a scalar load, only the lane whose
; mbcnt result is 0 issues the atomic, with an operand of
; %additive * popcount(exec), and every lane then rebuilds its own result as
; readfirstlane(atomic result) + %additive * mbcnt (a multiply plus add on the
; older targets, a single 64-bit multiply-add on gfx10 and newer).
; Only the per-target prefixes shared by the Iterative and DPP invocations are
; used for this function.
333define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(1) %inout, i32 %additive) {
334; GFX7LESS-LABEL: add_i32_uniform:
335; GFX7LESS:       ; %bb.0: ; %entry
336; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
337; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
338; GFX7LESS-NEXT:    s_load_dword s8, s[4:5], 0xd
339; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
340; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
341; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
342; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
343; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
344; GFX7LESS-NEXT:    s_cbranch_execz .LBB1_2
345; GFX7LESS-NEXT:  ; %bb.1:
346; GFX7LESS-NEXT:    s_mov_b32 s15, 0xf000
347; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
348; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
349; GFX7LESS-NEXT:    s_mul_i32 s6, s8, s6
350; GFX7LESS-NEXT:    s_mov_b32 s14, -1
351; GFX7LESS-NEXT:    s_mov_b32 s12, s2
352; GFX7LESS-NEXT:    s_mov_b32 s13, s3
353; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
354; GFX7LESS-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
355; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
356; GFX7LESS-NEXT:    buffer_wbinvl1
357; GFX7LESS-NEXT:  .LBB1_2:
358; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
359; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
360; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
361; GFX7LESS-NEXT:    s_mov_b32 s2, -1
362; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
363; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s8, v0
364; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
365; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
366; GFX7LESS-NEXT:    s_endpgm
367;
368; GFX8-LABEL: add_i32_uniform:
369; GFX8:       ; %bb.0: ; %entry
370; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
371; GFX8-NEXT:    s_load_dword s8, s[4:5], 0x34
372; GFX8-NEXT:    s_mov_b64 s[6:7], exec
373; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
374; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
375; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
376; GFX8-NEXT:    ; implicit-def: $vgpr1
377; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
378; GFX8-NEXT:    s_cbranch_execz .LBB1_2
379; GFX8-NEXT:  ; %bb.1:
380; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
381; GFX8-NEXT:    s_mov_b32 s12, s2
382; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
383; GFX8-NEXT:    s_mul_i32 s2, s8, s2
384; GFX8-NEXT:    s_mov_b32 s15, 0xf000
385; GFX8-NEXT:    s_mov_b32 s14, -1
386; GFX8-NEXT:    s_mov_b32 s13, s3
387; GFX8-NEXT:    v_mov_b32_e32 v1, s2
388; GFX8-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
389; GFX8-NEXT:    s_waitcnt vmcnt(0)
390; GFX8-NEXT:    buffer_wbinvl1_vol
391; GFX8-NEXT:  .LBB1_2:
392; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
393; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
394; GFX8-NEXT:    v_mul_lo_u32 v0, s8, v0
395; GFX8-NEXT:    v_readfirstlane_b32 s4, v1
396; GFX8-NEXT:    s_mov_b32 s3, 0xf000
397; GFX8-NEXT:    s_mov_b32 s2, -1
398; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
399; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
400; GFX8-NEXT:    s_endpgm
401;
402; GFX9-LABEL: add_i32_uniform:
403; GFX9:       ; %bb.0: ; %entry
404; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
405; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
406; GFX9-NEXT:    s_mov_b64 s[6:7], exec
407; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
408; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
409; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
410; GFX9-NEXT:    ; implicit-def: $vgpr1
411; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
412; GFX9-NEXT:    s_cbranch_execz .LBB1_2
413; GFX9-NEXT:  ; %bb.1:
414; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
415; GFX9-NEXT:    s_mov_b32 s12, s2
416; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
417; GFX9-NEXT:    s_mul_i32 s2, s8, s2
418; GFX9-NEXT:    s_mov_b32 s15, 0xf000
419; GFX9-NEXT:    s_mov_b32 s14, -1
420; GFX9-NEXT:    s_mov_b32 s13, s3
421; GFX9-NEXT:    v_mov_b32_e32 v1, s2
422; GFX9-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
423; GFX9-NEXT:    s_waitcnt vmcnt(0)
424; GFX9-NEXT:    buffer_wbinvl1_vol
425; GFX9-NEXT:  .LBB1_2:
426; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
427; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
428; GFX9-NEXT:    v_mul_lo_u32 v0, s8, v0
429; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
430; GFX9-NEXT:    s_mov_b32 s3, 0xf000
431; GFX9-NEXT:    s_mov_b32 s2, -1
432; GFX9-NEXT:    v_add_u32_e32 v0, s4, v0
433; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
434; GFX9-NEXT:    s_endpgm
435;
436; GFX1064-LABEL: add_i32_uniform:
437; GFX1064:       ; %bb.0: ; %entry
438; GFX1064-NEXT:    s_clause 0x1
439; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
440; GFX1064-NEXT:    s_load_dword s8, s[4:5], 0x34
441; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
442; GFX1064-NEXT:    ; implicit-def: $vgpr1
443; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
444; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
445; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
446; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
447; GFX1064-NEXT:    s_cbranch_execz .LBB1_2
448; GFX1064-NEXT:  ; %bb.1:
449; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
450; GFX1064-NEXT:    s_mov_b32 s15, 0x31016000
451; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
452; GFX1064-NEXT:    s_mul_i32 s6, s8, s6
453; GFX1064-NEXT:    s_mov_b32 s14, -1
454; GFX1064-NEXT:    v_mov_b32_e32 v1, s6
455; GFX1064-NEXT:    s_mov_b32 s12, s2
456; GFX1064-NEXT:    s_mov_b32 s13, s3
457; GFX1064-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
458; GFX1064-NEXT:    s_waitcnt vmcnt(0)
459; GFX1064-NEXT:    buffer_gl1_inv
460; GFX1064-NEXT:    buffer_gl0_inv
461; GFX1064-NEXT:  .LBB1_2:
462; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
463; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
464; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
465; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
466; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], s8, v0, s[2:3]
467; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
468; GFX1064-NEXT:    s_mov_b32 s2, -1
469; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
470; GFX1064-NEXT:    s_endpgm
471;
472; GFX1032-LABEL: add_i32_uniform:
473; GFX1032:       ; %bb.0: ; %entry
474; GFX1032-NEXT:    s_clause 0x1
475; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
476; GFX1032-NEXT:    s_load_dword s6, s[4:5], 0x34
477; GFX1032-NEXT:    s_mov_b32 s7, exec_lo
478; GFX1032-NEXT:    ; implicit-def: $vgpr1
479; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s7, 0
480; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
481; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
482; GFX1032-NEXT:    s_cbranch_execz .LBB1_2
483; GFX1032-NEXT:  ; %bb.1:
484; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s7
485; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
486; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
487; GFX1032-NEXT:    s_mul_i32 s5, s6, s5
488; GFX1032-NEXT:    s_mov_b32 s10, -1
489; GFX1032-NEXT:    v_mov_b32_e32 v1, s5
490; GFX1032-NEXT:    s_mov_b32 s8, s2
491; GFX1032-NEXT:    s_mov_b32 s9, s3
492; GFX1032-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
493; GFX1032-NEXT:    s_waitcnt vmcnt(0)
494; GFX1032-NEXT:    buffer_gl1_inv
495; GFX1032-NEXT:    buffer_gl0_inv
496; GFX1032-NEXT:  .LBB1_2:
497; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
498; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
499; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
500; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
501; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s2, s6, v0, s[2:3]
502; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
503; GFX1032-NEXT:    s_mov_b32 s2, -1
504; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
505; GFX1032-NEXT:    s_endpgm
506;
507; GFX1164-LABEL: add_i32_uniform:
508; GFX1164:       ; %bb.0: ; %entry
509; GFX1164-NEXT:    s_clause 0x1
510; GFX1164-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
511; GFX1164-NEXT:    s_load_b32 s8, s[4:5], 0x34
512; GFX1164-NEXT:    s_mov_b64 s[6:7], exec
513; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
514; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
515; GFX1164-NEXT:    ; implicit-def: $vgpr1
516; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
517; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
518; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
519; GFX1164-NEXT:    s_cbranch_execz .LBB1_2
520; GFX1164-NEXT:  ; %bb.1:
521; GFX1164-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
522; GFX1164-NEXT:    s_mov_b32 s15, 0x31016000
523; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
524; GFX1164-NEXT:    s_mul_i32 s6, s8, s6
525; GFX1164-NEXT:    s_mov_b32 s14, -1
526; GFX1164-NEXT:    v_mov_b32_e32 v1, s6
527; GFX1164-NEXT:    s_mov_b32 s12, s2
528; GFX1164-NEXT:    s_mov_b32 s13, s3
529; GFX1164-NEXT:    buffer_atomic_add_u32 v1, off, s[12:15], 0 glc
530; GFX1164-NEXT:    s_waitcnt vmcnt(0)
531; GFX1164-NEXT:    buffer_gl1_inv
532; GFX1164-NEXT:    buffer_gl0_inv
533; GFX1164-NEXT:  .LBB1_2:
534; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
535; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
536; GFX1164-NEXT:    v_readfirstlane_b32 s2, v1
537; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
538; GFX1164-NEXT:    v_mad_u64_u32 v[1:2], null, s8, v0, s[2:3]
539; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
540; GFX1164-NEXT:    s_mov_b32 s2, -1
541; GFX1164-NEXT:    buffer_store_b32 v1, off, s[0:3], 0
542; GFX1164-NEXT:    s_endpgm
543;
544; GFX1132-LABEL: add_i32_uniform:
545; GFX1132:       ; %bb.0: ; %entry
546; GFX1132-NEXT:    s_clause 0x1
547; GFX1132-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
548; GFX1132-NEXT:    s_load_b32 s4, s[4:5], 0x34
549; GFX1132-NEXT:    s_mov_b32 s6, exec_lo
550; GFX1132-NEXT:    s_mov_b32 s5, exec_lo
551; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
552; GFX1132-NEXT:    ; implicit-def: $vgpr1
553; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
554; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
555; GFX1132-NEXT:    s_cbranch_execz .LBB1_2
556; GFX1132-NEXT:  ; %bb.1:
557; GFX1132-NEXT:    s_bcnt1_i32_b32 s6, s6
558; GFX1132-NEXT:    s_mov_b32 s11, 0x31016000
559; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
560; GFX1132-NEXT:    s_mul_i32 s6, s4, s6
561; GFX1132-NEXT:    s_mov_b32 s10, -1
562; GFX1132-NEXT:    v_mov_b32_e32 v1, s6
563; GFX1132-NEXT:    s_mov_b32 s8, s2
564; GFX1132-NEXT:    s_mov_b32 s9, s3
565; GFX1132-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
566; GFX1132-NEXT:    s_waitcnt vmcnt(0)
567; GFX1132-NEXT:    buffer_gl1_inv
568; GFX1132-NEXT:    buffer_gl0_inv
569; GFX1132-NEXT:  .LBB1_2:
570; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s5
571; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
572; GFX1132-NEXT:    v_readfirstlane_b32 s2, v1
573; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
574; GFX1132-NEXT:    v_mad_u64_u32 v[1:2], null, s4, v0, s[2:3]
575; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
576; GFX1132-NEXT:    s_mov_b32 s2, -1
577; GFX1132-NEXT:    buffer_store_b32 v1, off, s[0:3], 0
578; GFX1132-NEXT:    s_endpgm
579;
580; GFX1264-LABEL: add_i32_uniform:
581; GFX1264:       ; %bb.0: ; %entry
582; GFX1264-NEXT:    s_clause 0x1
583; GFX1264-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
584; GFX1264-NEXT:    s_load_b32 s8, s[4:5], 0x34
585; GFX1264-NEXT:    s_mov_b64 s[6:7], exec
586; GFX1264-NEXT:    s_mov_b64 s[4:5], exec
587; GFX1264-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
588; GFX1264-NEXT:    ; implicit-def: $vgpr1
589; GFX1264-NEXT:    s_wait_alu 0xfffe
590; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
591; GFX1264-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
592; GFX1264-NEXT:    v_cmpx_eq_u32_e32 0, v0
593; GFX1264-NEXT:    s_cbranch_execz .LBB1_2
594; GFX1264-NEXT:  ; %bb.1:
595; GFX1264-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
596; GFX1264-NEXT:    s_mov_b32 s15, 0x31016000
597; GFX1264-NEXT:    s_wait_kmcnt 0x0
598; GFX1264-NEXT:    s_wait_alu 0xfffe
599; GFX1264-NEXT:    s_mul_i32 s6, s8, s6
600; GFX1264-NEXT:    s_mov_b32 s14, -1
601; GFX1264-NEXT:    s_wait_alu 0xfffe
602; GFX1264-NEXT:    v_mov_b32_e32 v1, s6
603; GFX1264-NEXT:    s_mov_b32 s12, s2
604; GFX1264-NEXT:    s_mov_b32 s13, s3
605; GFX1264-NEXT:    buffer_atomic_add_u32 v1, off, s[12:15], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
606; GFX1264-NEXT:    s_wait_loadcnt 0x0
607; GFX1264-NEXT:    global_inv scope:SCOPE_DEV
608; GFX1264-NEXT:  .LBB1_2:
609; GFX1264-NEXT:    s_or_b64 exec, exec, s[4:5]
610; GFX1264-NEXT:    s_wait_kmcnt 0x0
611; GFX1264-NEXT:    v_readfirstlane_b32 s2, v1
612; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1)
613; GFX1264-NEXT:    v_mad_co_u64_u32 v[0:1], null, s8, v0, s[2:3]
614; GFX1264-NEXT:    s_mov_b32 s3, 0x31016000
615; GFX1264-NEXT:    s_mov_b32 s2, -1
616; GFX1264-NEXT:    buffer_store_b32 v0, off, s[0:3], null
617; GFX1264-NEXT:    s_endpgm
618;
619; GFX1232-LABEL: add_i32_uniform:
620; GFX1232:       ; %bb.0: ; %entry
621; GFX1232-NEXT:    s_clause 0x1
622; GFX1232-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
623; GFX1232-NEXT:    s_load_b32 s4, s[4:5], 0x34
624; GFX1232-NEXT:    s_mov_b32 s6, exec_lo
625; GFX1232-NEXT:    s_mov_b32 s5, exec_lo
626; GFX1232-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
627; GFX1232-NEXT:    ; implicit-def: $vgpr1
628; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1)
629; GFX1232-NEXT:    v_cmpx_eq_u32_e32 0, v0
630; GFX1232-NEXT:    s_cbranch_execz .LBB1_2
631; GFX1232-NEXT:  ; %bb.1:
632; GFX1232-NEXT:    s_wait_alu 0xfffe
633; GFX1232-NEXT:    s_bcnt1_i32_b32 s6, s6
634; GFX1232-NEXT:    s_mov_b32 s11, 0x31016000
635; GFX1232-NEXT:    s_wait_kmcnt 0x0
636; GFX1232-NEXT:    s_wait_alu 0xfffe
637; GFX1232-NEXT:    s_mul_i32 s6, s4, s6
638; GFX1232-NEXT:    s_mov_b32 s10, -1
639; GFX1232-NEXT:    s_wait_alu 0xfffe
640; GFX1232-NEXT:    v_mov_b32_e32 v1, s6
641; GFX1232-NEXT:    s_mov_b32 s8, s2
642; GFX1232-NEXT:    s_mov_b32 s9, s3
643; GFX1232-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
644; GFX1232-NEXT:    s_wait_loadcnt 0x0
645; GFX1232-NEXT:    global_inv scope:SCOPE_DEV
646; GFX1232-NEXT:  .LBB1_2:
647; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s5
648; GFX1232-NEXT:    s_wait_kmcnt 0x0
649; GFX1232-NEXT:    v_readfirstlane_b32 s2, v1
650; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1)
651; GFX1232-NEXT:    v_mad_co_u64_u32 v[0:1], null, s4, v0, s[2:3]
652; GFX1232-NEXT:    s_mov_b32 s3, 0x31016000
653; GFX1232-NEXT:    s_mov_b32 s2, -1
654; GFX1232-NEXT:    buffer_store_b32 v0, off, s[0:3], null
655; GFX1232-NEXT:    s_endpgm
656entry:
  ; atomicrmw yields the value that was in memory before the add; the addend
  ; is the kernel argument rather than an immediate.
657  %old = atomicrmw add ptr addrspace(1) %inout, i32 %additive syncscope("agent") acq_rel
  ; Storing the returned value keeps the atomic's result in use, which is why
  ; the checked code uses the returning (glc / TH_ATOMIC_RETURN) atomic form.
658  store i32 %old, ptr addrspace(1) %out
659  ret void
660}
661
662define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(1) %inout) {
663; GFX7LESS_ITERATIVE-LABEL: add_i32_varying:
664; GFX7LESS_ITERATIVE:       ; %bb.0: ; %entry
665; GFX7LESS_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
666; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s6, 0
667; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
668; GFX7LESS_ITERATIVE-NEXT:  .LBB2_1: ; %ComputeLoop
669; GFX7LESS_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
670; GFX7LESS_ITERATIVE-NEXT:    s_ff1_i32_b64 s2, s[0:1]
671; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, s2
672; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s7, v0, s2
673; GFX7LESS_ITERATIVE-NEXT:    v_writelane_b32 v1, s6, m0
674; GFX7LESS_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
675; GFX7LESS_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
676; GFX7LESS_ITERATIVE-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
677; GFX7LESS_ITERATIVE-NEXT:    s_and_b64 vcc, exec, s[2:3]
678; GFX7LESS_ITERATIVE-NEXT:    s_add_i32 s6, s6, s7
679; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_vccnz .LBB2_1
680; GFX7LESS_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
681; GFX7LESS_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
682; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
683; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
684; GFX7LESS_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
685; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
686; GFX7LESS_ITERATIVE-NEXT:    s_and_saveexec_b64 s[4:5], vcc
687; GFX7LESS_ITERATIVE-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
688; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_execz .LBB2_4
689; GFX7LESS_ITERATIVE-NEXT:  ; %bb.3:
690; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s11, 0xf000
691; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s10, -1
692; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
693; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s8, s2
694; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s9, s3
695; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s6
696; GFX7LESS_ITERATIVE-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
697; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
698; GFX7LESS_ITERATIVE-NEXT:    buffer_wbinvl1
699; GFX7LESS_ITERATIVE-NEXT:  .LBB2_4:
700; GFX7LESS_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[4:5]
701; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
702; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
703; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s2, -1
704; GFX7LESS_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v0
705; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt expcnt(0)
706; GFX7LESS_ITERATIVE-NEXT:    v_add_i32_e32 v0, vcc, s4, v1
707; GFX7LESS_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
708; GFX7LESS_ITERATIVE-NEXT:    s_endpgm
709;
710; GFX8_ITERATIVE-LABEL: add_i32_varying:
711; GFX8_ITERATIVE:       ; %bb.0: ; %entry
712; GFX8_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
713; GFX8_ITERATIVE-NEXT:    s_mov_b32 s6, 0
714; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
715; GFX8_ITERATIVE-NEXT:  .LBB2_1: ; %ComputeLoop
716; GFX8_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
717; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s2, s[0:1]
718; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s2
719; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s7, v0, s2
720; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
721; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v1, s6, m0
722; GFX8_ITERATIVE-NEXT:    s_add_i32 s6, s6, s7
723; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
724; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
725; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB2_1
726; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
727; GFX8_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
728; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
729; GFX8_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
730; GFX8_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
731; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
732; GFX8_ITERATIVE-NEXT:    s_and_saveexec_b64 s[4:5], vcc
733; GFX8_ITERATIVE-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
734; GFX8_ITERATIVE-NEXT:    s_cbranch_execz .LBB2_4
735; GFX8_ITERATIVE-NEXT:  ; %bb.3:
736; GFX8_ITERATIVE-NEXT:    s_mov_b32 s11, 0xf000
737; GFX8_ITERATIVE-NEXT:    s_mov_b32 s10, -1
738; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
739; GFX8_ITERATIVE-NEXT:    s_mov_b32 s8, s2
740; GFX8_ITERATIVE-NEXT:    s_mov_b32 s9, s3
741; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s6
742; GFX8_ITERATIVE-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
743; GFX8_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
744; GFX8_ITERATIVE-NEXT:    buffer_wbinvl1_vol
745; GFX8_ITERATIVE-NEXT:  .LBB2_4:
746; GFX8_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[4:5]
747; GFX8_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v0
748; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
749; GFX8_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
750; GFX8_ITERATIVE-NEXT:    s_mov_b32 s2, -1
751; GFX8_ITERATIVE-NEXT:    v_add_u32_e32 v0, vcc, s4, v1
752; GFX8_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
753; GFX8_ITERATIVE-NEXT:    s_endpgm
754;
755; GFX9_ITERATIVE-LABEL: add_i32_varying:
756; GFX9_ITERATIVE:       ; %bb.0: ; %entry
757; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
758; GFX9_ITERATIVE-NEXT:    s_mov_b32 s6, 0
759; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
760; GFX9_ITERATIVE-NEXT:  .LBB2_1: ; %ComputeLoop
761; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
762; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s2, s[0:1]
763; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s2
764; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s7, v0, s2
765; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
766; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s6, m0
767; GFX9_ITERATIVE-NEXT:    s_add_i32 s6, s6, s7
768; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
769; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
770; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB2_1
771; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
772; GFX9_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
773; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
774; GFX9_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
775; GFX9_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
776; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
777; GFX9_ITERATIVE-NEXT:    s_and_saveexec_b64 s[4:5], vcc
778; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
779; GFX9_ITERATIVE-NEXT:    s_cbranch_execz .LBB2_4
780; GFX9_ITERATIVE-NEXT:  ; %bb.3:
781; GFX9_ITERATIVE-NEXT:    s_mov_b32 s11, 0xf000
782; GFX9_ITERATIVE-NEXT:    s_mov_b32 s10, -1
783; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
784; GFX9_ITERATIVE-NEXT:    s_mov_b32 s8, s2
785; GFX9_ITERATIVE-NEXT:    s_mov_b32 s9, s3
786; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s6
787; GFX9_ITERATIVE-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
788; GFX9_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
789; GFX9_ITERATIVE-NEXT:    buffer_wbinvl1_vol
790; GFX9_ITERATIVE-NEXT:  .LBB2_4:
791; GFX9_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[4:5]
792; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v0
793; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
794; GFX9_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
795; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, -1
796; GFX9_ITERATIVE-NEXT:    v_add_u32_e32 v0, s4, v1
797; GFX9_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
798; GFX9_ITERATIVE-NEXT:    s_endpgm
799;
800; GFX1064_ITERATIVE-LABEL: add_i32_varying:
801; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
802; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
803; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s6, 0
804; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
805; GFX1064_ITERATIVE-NEXT:  .LBB2_1: ; %ComputeLoop
806; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
807; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s7, s[0:1]
808; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s7
809; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s7
810; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s6, s7
811; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
812; GFX1064_ITERATIVE-NEXT:    s_add_i32 s6, s6, s8
813; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
814; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB2_1
815; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
816; GFX1064_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
817; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
818; GFX1064_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
819; GFX1064_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
820; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
821; GFX1064_ITERATIVE-NEXT:    s_and_saveexec_b64 s[4:5], vcc
822; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
823; GFX1064_ITERATIVE-NEXT:    s_cbranch_execz .LBB2_4
824; GFX1064_ITERATIVE-NEXT:  ; %bb.3:
825; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s6
826; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
827; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s10, -1
828; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
829; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s8, s2
830; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s9, s3
831; GFX1064_ITERATIVE-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
832; GFX1064_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
833; GFX1064_ITERATIVE-NEXT:    buffer_gl1_inv
834; GFX1064_ITERATIVE-NEXT:    buffer_gl0_inv
835; GFX1064_ITERATIVE-NEXT:  .LBB2_4:
836; GFX1064_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
837; GFX1064_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[4:5]
838; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
839; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v0
840; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
841; GFX1064_ITERATIVE-NEXT:    v_add_nc_u32_e32 v0, s2, v1
842; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, -1
843; GFX1064_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
844; GFX1064_ITERATIVE-NEXT:    s_endpgm
845;
846; GFX1032_ITERATIVE-LABEL: add_i32_varying:
847; GFX1032_ITERATIVE:       ; %bb.0: ; %entry
848; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s0, exec_lo
849; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s6, 0
850; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
851; GFX1032_ITERATIVE-NEXT:  .LBB2_1: ; %ComputeLoop
852; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
853; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s1, s0
854; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s2, v0, s1
855; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s1
856; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s6, s1
857; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s0, s0, s3
858; GFX1032_ITERATIVE-NEXT:    s_add_i32 s6, s6, s2
859; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
860; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB2_1
861; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
862; GFX1032_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
863; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
864; GFX1032_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
865; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
866; GFX1032_ITERATIVE-NEXT:    s_and_saveexec_b32 s4, vcc_lo
867; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s4, exec_lo, s4
868; GFX1032_ITERATIVE-NEXT:    s_cbranch_execz .LBB2_4
869; GFX1032_ITERATIVE-NEXT:  ; %bb.3:
870; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s6
871; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
872; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s10, -1
873; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
874; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s8, s2
875; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s9, s3
876; GFX1032_ITERATIVE-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
877; GFX1032_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
878; GFX1032_ITERATIVE-NEXT:    buffer_gl1_inv
879; GFX1032_ITERATIVE-NEXT:    buffer_gl0_inv
880; GFX1032_ITERATIVE-NEXT:  .LBB2_4:
881; GFX1032_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
882; GFX1032_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s4
883; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
884; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v0
885; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
886; GFX1032_ITERATIVE-NEXT:    v_add_nc_u32_e32 v0, s2, v1
887; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, -1
888; GFX1032_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
889; GFX1032_ITERATIVE-NEXT:    s_endpgm
890;
891; GFX1164_ITERATIVE-LABEL: add_i32_varying:
892; GFX1164_ITERATIVE:       ; %bb.0: ; %entry
893; GFX1164_ITERATIVE-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
894; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
895; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s6, 0
896; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
897; GFX1164_ITERATIVE-NEXT:  .LBB2_1: ; %ComputeLoop
898; GFX1164_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
899; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s7, s[0:1]
900; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
901; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s8, v1, s7
902; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s7
903; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s7
904; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
905; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2)
906; GFX1164_ITERATIVE-NEXT:    s_add_i32 s6, s6, s8
907; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
908; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB2_1
909; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
910; GFX1164_ITERATIVE-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
911; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
912; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
913; GFX1164_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
914; GFX1164_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
915; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
916; GFX1164_ITERATIVE-NEXT:    s_and_saveexec_b64 s[4:5], vcc
917; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
918; GFX1164_ITERATIVE-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
919; GFX1164_ITERATIVE-NEXT:    s_cbranch_execz .LBB2_4
920; GFX1164_ITERATIVE-NEXT:  ; %bb.3:
921; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v1, s6
922; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
923; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s10, -1
924; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
925; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s8, s2
926; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s9, s3
927; GFX1164_ITERATIVE-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
928; GFX1164_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
929; GFX1164_ITERATIVE-NEXT:    buffer_gl1_inv
930; GFX1164_ITERATIVE-NEXT:    buffer_gl0_inv
931; GFX1164_ITERATIVE-NEXT:  .LBB2_4:
932; GFX1164_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[4:5]
933; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
934; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v1
935; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
936; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
937; GFX1164_ITERATIVE-NEXT:    v_add_nc_u32_e32 v0, s2, v0
938; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
939; GFX1164_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
940; GFX1164_ITERATIVE-NEXT:    s_endpgm
941;
942; GFX1132_ITERATIVE-LABEL: add_i32_varying:
943; GFX1132_ITERATIVE:       ; %bb.0: ; %entry
944; GFX1132_ITERATIVE-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
945; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s0, exec_lo
946; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s6, 0
947; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
948; GFX1132_ITERATIVE-NEXT:  .LBB2_1: ; %ComputeLoop
949; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
950; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s1, s0
951; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
952; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s2, v1, s1
953; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s1
954; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s1
955; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s0, s0, s3
956; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2)
957; GFX1132_ITERATIVE-NEXT:    s_add_i32 s6, s6, s2
958; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
959; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB2_1
960; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
961; GFX1132_ITERATIVE-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
962; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
963; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
964; GFX1132_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
965; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
966; GFX1132_ITERATIVE-NEXT:    s_and_saveexec_b32 s4, vcc_lo
967; GFX1132_ITERATIVE-NEXT:    s_xor_b32 s4, exec_lo, s4
968; GFX1132_ITERATIVE-NEXT:    s_cbranch_execz .LBB2_4
969; GFX1132_ITERATIVE-NEXT:  ; %bb.3:
970; GFX1132_ITERATIVE-NEXT:    v_mov_b32_e32 v1, s6
971; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
972; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s10, -1
973; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
974; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s8, s2
975; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s9, s3
976; GFX1132_ITERATIVE-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
977; GFX1132_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
978; GFX1132_ITERATIVE-NEXT:    buffer_gl1_inv
979; GFX1132_ITERATIVE-NEXT:    buffer_gl0_inv
980; GFX1132_ITERATIVE-NEXT:  .LBB2_4:
981; GFX1132_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s4
982; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
983; GFX1132_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v1
984; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
985; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
986; GFX1132_ITERATIVE-NEXT:    v_add_nc_u32_e32 v0, s2, v0
987; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
988; GFX1132_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
989; GFX1132_ITERATIVE-NEXT:    s_endpgm
990;
991; GFX1264_ITERATIVE-LABEL: add_i32_varying:
992; GFX1264_ITERATIVE:       ; %bb.0: ; %entry
993; GFX1264_ITERATIVE-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
994; GFX1264_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
995; GFX1264_ITERATIVE-NEXT:    s_mov_b32 s6, 0
996; GFX1264_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
997; GFX1264_ITERATIVE-NEXT:  .LBB2_1: ; %ComputeLoop
998; GFX1264_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
999; GFX1264_ITERATIVE-NEXT:    s_ctz_i32_b64 s7, s[0:1]
1000; GFX1264_ITERATIVE-NEXT:    s_wait_alu 0xfffe
1001; GFX1264_ITERATIVE-NEXT:    v_readlane_b32 s8, v1, s7
1002; GFX1264_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s7
1003; GFX1264_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s7
1004; GFX1264_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
1005; GFX1264_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1006; GFX1264_ITERATIVE-NEXT:    s_add_co_i32 s6, s6, s8
1007; GFX1264_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
1008; GFX1264_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB2_1
1009; GFX1264_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
1010; GFX1264_ITERATIVE-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1011; GFX1264_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
1012; GFX1264_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1013; GFX1264_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
1014; GFX1264_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
1015; GFX1264_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
1016; GFX1264_ITERATIVE-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1017; GFX1264_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1018; GFX1264_ITERATIVE-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
1019; GFX1264_ITERATIVE-NEXT:    s_cbranch_execz .LBB2_4
1020; GFX1264_ITERATIVE-NEXT:  ; %bb.3:
1021; GFX1264_ITERATIVE-NEXT:    s_wait_alu 0xfffe
1022; GFX1264_ITERATIVE-NEXT:    v_mov_b32_e32 v1, s6
1023; GFX1264_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
1024; GFX1264_ITERATIVE-NEXT:    s_mov_b32 s10, -1
1025; GFX1264_ITERATIVE-NEXT:    s_wait_kmcnt 0x0
1026; GFX1264_ITERATIVE-NEXT:    s_mov_b32 s8, s2
1027; GFX1264_ITERATIVE-NEXT:    s_mov_b32 s9, s3
1028; GFX1264_ITERATIVE-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1029; GFX1264_ITERATIVE-NEXT:    s_wait_loadcnt 0x0
1030; GFX1264_ITERATIVE-NEXT:    global_inv scope:SCOPE_DEV
1031; GFX1264_ITERATIVE-NEXT:  .LBB2_4:
1032; GFX1264_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[4:5]
1033; GFX1264_ITERATIVE-NEXT:    s_wait_kmcnt 0x0
1034; GFX1264_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v1
1035; GFX1264_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
1036; GFX1264_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1037; GFX1264_ITERATIVE-NEXT:    v_add_nc_u32_e32 v0, s2, v0
1038; GFX1264_ITERATIVE-NEXT:    s_mov_b32 s2, -1
1039; GFX1264_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], null
1040; GFX1264_ITERATIVE-NEXT:    s_endpgm
1041;
1042; GFX1232_ITERATIVE-LABEL: add_i32_varying:
1043; GFX1232_ITERATIVE:       ; %bb.0: ; %entry
1044; GFX1232_ITERATIVE-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
1045; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s0, exec_lo
1046; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s6, 0
1047; GFX1232_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
1048; GFX1232_ITERATIVE-NEXT:  .LBB2_1: ; %ComputeLoop
1049; GFX1232_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
1050; GFX1232_ITERATIVE-NEXT:    s_wait_alu 0xfffe
1051; GFX1232_ITERATIVE-NEXT:    s_ctz_i32_b32 s1, s0
1052; GFX1232_ITERATIVE-NEXT:    s_wait_alu 0xfffe
1053; GFX1232_ITERATIVE-NEXT:    v_readlane_b32 s2, v1, s1
1054; GFX1232_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s1
1055; GFX1232_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s1
1056; GFX1232_ITERATIVE-NEXT:    s_and_not1_b32 s0, s0, s3
1057; GFX1232_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1058; GFX1232_ITERATIVE-NEXT:    s_add_co_i32 s6, s6, s2
1059; GFX1232_ITERATIVE-NEXT:    s_wait_alu 0xfffe
1060; GFX1232_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
1061; GFX1232_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB2_1
1062; GFX1232_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
1063; GFX1232_ITERATIVE-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1064; GFX1232_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
1065; GFX1232_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
1066; GFX1232_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
1067; GFX1232_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
1068; GFX1232_ITERATIVE-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1069; GFX1232_ITERATIVE-NEXT:    s_xor_b32 s4, exec_lo, s4
1070; GFX1232_ITERATIVE-NEXT:    s_cbranch_execz .LBB2_4
1071; GFX1232_ITERATIVE-NEXT:  ; %bb.3:
1072; GFX1232_ITERATIVE-NEXT:    v_mov_b32_e32 v1, s6
1073; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
1074; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s10, -1
1075; GFX1232_ITERATIVE-NEXT:    s_wait_kmcnt 0x0
1076; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s8, s2
1077; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s9, s3
1078; GFX1232_ITERATIVE-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1079; GFX1232_ITERATIVE-NEXT:    s_wait_loadcnt 0x0
1080; GFX1232_ITERATIVE-NEXT:    global_inv scope:SCOPE_DEV
1081; GFX1232_ITERATIVE-NEXT:  .LBB2_4:
1082; GFX1232_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1083; GFX1232_ITERATIVE-NEXT:    s_wait_kmcnt 0x0
1084; GFX1232_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v1
1085; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
1086; GFX1232_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1087; GFX1232_ITERATIVE-NEXT:    v_add_nc_u32_e32 v0, s2, v0
1088; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s2, -1
1089; GFX1232_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], null
1090; GFX1232_ITERATIVE-NEXT:    s_endpgm
1091;
1092; GFX7LESS_DPP-LABEL: add_i32_varying:
1093; GFX7LESS_DPP:       ; %bb.0: ; %entry
1094; GFX7LESS_DPP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1095; GFX7LESS_DPP-NEXT:    s_mov_b32 s7, 0xf000
1096; GFX7LESS_DPP-NEXT:    s_mov_b32 s6, -1
1097; GFX7LESS_DPP-NEXT:    s_mov_b32 s10, s6
1098; GFX7LESS_DPP-NEXT:    s_mov_b32 s11, s7
1099; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
1100; GFX7LESS_DPP-NEXT:    s_mov_b32 s8, s2
1101; GFX7LESS_DPP-NEXT:    s_mov_b32 s9, s3
1102; GFX7LESS_DPP-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
1103; GFX7LESS_DPP-NEXT:    s_waitcnt vmcnt(0)
1104; GFX7LESS_DPP-NEXT:    buffer_wbinvl1
1105; GFX7LESS_DPP-NEXT:    s_mov_b32 s4, s0
1106; GFX7LESS_DPP-NEXT:    s_mov_b32 s5, s1
1107; GFX7LESS_DPP-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1108; GFX7LESS_DPP-NEXT:    s_endpgm
1109;
1110; GFX8_DPP-LABEL: add_i32_varying:
1111; GFX8_DPP:       ; %bb.0: ; %entry
1112; GFX8_DPP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1113; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
1114; GFX8_DPP-NEXT:    v_mov_b32_e32 v1, 0
1115; GFX8_DPP-NEXT:    s_mov_b64 exec, s[4:5]
1116; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
1117; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
1118; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
1119; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[4:5]
1120; GFX8_DPP-NEXT:    s_nop 1
1121; GFX8_DPP-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1122; GFX8_DPP-NEXT:    s_nop 1
1123; GFX8_DPP-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1124; GFX8_DPP-NEXT:    s_nop 1
1125; GFX8_DPP-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1126; GFX8_DPP-NEXT:    s_nop 1
1127; GFX8_DPP-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1128; GFX8_DPP-NEXT:    s_nop 1
1129; GFX8_DPP-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
1130; GFX8_DPP-NEXT:    s_nop 1
1131; GFX8_DPP-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1132; GFX8_DPP-NEXT:    v_readlane_b32 s6, v2, 63
1133; GFX8_DPP-NEXT:    s_nop 0
1134; GFX8_DPP-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
1135; GFX8_DPP-NEXT:    s_mov_b64 exec, s[4:5]
1136; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
1137; GFX8_DPP-NEXT:    ; implicit-def: $vgpr0
1138; GFX8_DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1139; GFX8_DPP-NEXT:    s_cbranch_execz .LBB2_2
1140; GFX8_DPP-NEXT:  ; %bb.1:
1141; GFX8_DPP-NEXT:    s_mov_b32 s11, 0xf000
1142; GFX8_DPP-NEXT:    s_mov_b32 s10, -1
1143; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
1144; GFX8_DPP-NEXT:    s_mov_b32 s8, s2
1145; GFX8_DPP-NEXT:    s_mov_b32 s9, s3
1146; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, s6
1147; GFX8_DPP-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
1148; GFX8_DPP-NEXT:    s_waitcnt vmcnt(0)
1149; GFX8_DPP-NEXT:    buffer_wbinvl1_vol
1150; GFX8_DPP-NEXT:  .LBB2_2:
1151; GFX8_DPP-NEXT:    s_or_b64 exec, exec, s[4:5]
1152; GFX8_DPP-NEXT:    v_readfirstlane_b32 s4, v0
1153; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, v1
1154; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
1155; GFX8_DPP-NEXT:    s_mov_b32 s3, 0xf000
1156; GFX8_DPP-NEXT:    s_mov_b32 s2, -1
1157; GFX8_DPP-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
1158; GFX8_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1159; GFX8_DPP-NEXT:    s_endpgm
1160;
1161; GFX9_DPP-LABEL: add_i32_varying:
1162; GFX9_DPP:       ; %bb.0: ; %entry
1163; GFX9_DPP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1164; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
1165; GFX9_DPP-NEXT:    v_mov_b32_e32 v1, 0
1166; GFX9_DPP-NEXT:    s_mov_b64 exec, s[4:5]
1167; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
1168; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
1169; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
1170; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[4:5]
1171; GFX9_DPP-NEXT:    s_nop 1
1172; GFX9_DPP-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1173; GFX9_DPP-NEXT:    s_nop 1
1174; GFX9_DPP-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1175; GFX9_DPP-NEXT:    s_nop 1
1176; GFX9_DPP-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1177; GFX9_DPP-NEXT:    s_nop 1
1178; GFX9_DPP-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1179; GFX9_DPP-NEXT:    s_nop 1
1180; GFX9_DPP-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
1181; GFX9_DPP-NEXT:    s_nop 1
1182; GFX9_DPP-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1183; GFX9_DPP-NEXT:    v_readlane_b32 s6, v2, 63
1184; GFX9_DPP-NEXT:    s_nop 0
1185; GFX9_DPP-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
1186; GFX9_DPP-NEXT:    s_mov_b64 exec, s[4:5]
1187; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
1188; GFX9_DPP-NEXT:    ; implicit-def: $vgpr0
1189; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1190; GFX9_DPP-NEXT:    s_cbranch_execz .LBB2_2
1191; GFX9_DPP-NEXT:  ; %bb.1:
1192; GFX9_DPP-NEXT:    s_mov_b32 s11, 0xf000
1193; GFX9_DPP-NEXT:    s_mov_b32 s10, -1
1194; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
1195; GFX9_DPP-NEXT:    s_mov_b32 s8, s2
1196; GFX9_DPP-NEXT:    s_mov_b32 s9, s3
1197; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s6
1198; GFX9_DPP-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
1199; GFX9_DPP-NEXT:    s_waitcnt vmcnt(0)
1200; GFX9_DPP-NEXT:    buffer_wbinvl1_vol
1201; GFX9_DPP-NEXT:  .LBB2_2:
1202; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[4:5]
1203; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v0
1204; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, v1
1205; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
1206; GFX9_DPP-NEXT:    s_mov_b32 s3, 0xf000
1207; GFX9_DPP-NEXT:    s_mov_b32 s2, -1
1208; GFX9_DPP-NEXT:    v_add_u32_e32 v0, s4, v0
1209; GFX9_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1210; GFX9_DPP-NEXT:    s_endpgm
1211;
1212; GFX1064_DPP-LABEL: add_i32_varying:
1213; GFX1064_DPP:       ; %bb.0: ; %entry
1214; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
1215; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
1216; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, 0
1217; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1218; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1219; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1220; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1221; GFX1064_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
1222; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1223; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 31
1224; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, s2
1225; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
1226; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v1, 15
1227; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1228; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
1229; GFX1064_DPP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1230; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
1231; GFX1064_DPP-NEXT:    v_readlane_b32 s7, v1, 31
1232; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s6, 16
1233; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[4:5]
1234; GFX1064_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1235; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
1236; GFX1064_DPP-NEXT:    v_readlane_b32 s8, v1, 47
1237; GFX1064_DPP-NEXT:    v_readlane_b32 s9, v1, 63
1238; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s7, 32
1239; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[4:5]
1240; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1241; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
1242; GFX1064_DPP-NEXT:    s_mov_b32 s4, s9
1243; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s8, 48
1244; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[6:7]
1245; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1246; GFX1064_DPP-NEXT:    s_mov_b32 s6, -1
1247; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr0
1248; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[8:9], vcc
1249; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB2_2
1250; GFX1064_DPP-NEXT:  ; %bb.1:
1251; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, s4
1252; GFX1064_DPP-NEXT:    s_mov_b32 s7, 0x31016000
1253; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
1254; GFX1064_DPP-NEXT:    s_mov_b32 s4, s2
1255; GFX1064_DPP-NEXT:    s_mov_b32 s5, s3
1256; GFX1064_DPP-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
1257; GFX1064_DPP-NEXT:    s_waitcnt vmcnt(0)
1258; GFX1064_DPP-NEXT:    buffer_gl1_inv
1259; GFX1064_DPP-NEXT:    buffer_gl0_inv
1260; GFX1064_DPP-NEXT:  .LBB2_2:
1261; GFX1064_DPP-NEXT:    s_waitcnt_depctr 0xffe3
1262; GFX1064_DPP-NEXT:    s_or_b64 exec, exec, s[8:9]
1263; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
1264; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s2, v0
1265; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, v3
1266; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
1267; GFX1064_DPP-NEXT:    v_add_nc_u32_e32 v0, s2, v0
1268; GFX1064_DPP-NEXT:    s_mov_b32 s2, s6
1269; GFX1064_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1270; GFX1064_DPP-NEXT:    s_endpgm
1271;
1272; GFX1032_DPP-LABEL: add_i32_varying:
1273; GFX1032_DPP:       ; %bb.0: ; %entry
1274; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
1275; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
1276; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1277; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1278; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1279; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1280; GFX1032_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
1281; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
1282; GFX1032_DPP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1283; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s4, -1
1284; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1285; GFX1032_DPP-NEXT:    v_mov_b32_e32 v3, 0
1286; GFX1032_DPP-NEXT:    v_readlane_b32 s6, v1, 31
1287; GFX1032_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1288; GFX1032_DPP-NEXT:    v_readlane_b32 s5, v1, 15
1289; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s4
1290; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1291; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s4, -1
1292; GFX1032_DPP-NEXT:    v_writelane_b32 v3, s5, 16
1293; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s4
1294; GFX1032_DPP-NEXT:    s_mov_b32 s4, s6
1295; GFX1032_DPP-NEXT:    s_mov_b32 s6, -1
1296; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1297; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr0
1298; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s8, vcc_lo
1299; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB2_2
1300; GFX1032_DPP-NEXT:  ; %bb.1:
1301; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, s4
1302; GFX1032_DPP-NEXT:    s_mov_b32 s7, 0x31016000
1303; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
1304; GFX1032_DPP-NEXT:    s_mov_b32 s4, s2
1305; GFX1032_DPP-NEXT:    s_mov_b32 s5, s3
1306; GFX1032_DPP-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
1307; GFX1032_DPP-NEXT:    s_waitcnt vmcnt(0)
1308; GFX1032_DPP-NEXT:    buffer_gl1_inv
1309; GFX1032_DPP-NEXT:    buffer_gl0_inv
1310; GFX1032_DPP-NEXT:  .LBB2_2:
1311; GFX1032_DPP-NEXT:    s_waitcnt_depctr 0xffe3
1312; GFX1032_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s8
1313; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
1314; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s2, v0
1315; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, v3
1316; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
1317; GFX1032_DPP-NEXT:    v_add_nc_u32_e32 v0, s2, v0
1318; GFX1032_DPP-NEXT:    s_mov_b32 s2, s6
1319; GFX1032_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1320; GFX1032_DPP-NEXT:    s_endpgm
1321;
1322; GFX1164_DPP-LABEL: add_i32_varying:
1323; GFX1164_DPP:       ; %bb.0: ; %entry
1324; GFX1164_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1325; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
1326; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
1327; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
1328; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, 0
1329; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
1330; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1331; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1332; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1333; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1334; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1335; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1336; GFX1164_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
1337; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1338; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1339; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v1, 31
1340; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, s2
1341; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1342; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
1343; GFX1164_DPP-NEXT:    v_readlane_b32 s6, v1, 15
1344; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1345; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
1346; GFX1164_DPP-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1347; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
1348; GFX1164_DPP-NEXT:    v_readlane_b32 s7, v1, 31
1349; GFX1164_DPP-NEXT:    v_writelane_b32 v3, s6, 16
1350; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[4:5]
1351; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1352; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1353; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
1354; GFX1164_DPP-NEXT:    v_readlane_b32 s8, v1, 47
1355; GFX1164_DPP-NEXT:    v_readlane_b32 s9, v1, 63
1356; GFX1164_DPP-NEXT:    v_writelane_b32 v3, s7, 32
1357; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[4:5]
1358; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1359; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1360; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
1361; GFX1164_DPP-NEXT:    s_mov_b32 s4, s9
1362; GFX1164_DPP-NEXT:    v_writelane_b32 v3, s8, 48
1363; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[6:7]
1364; GFX1164_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1365; GFX1164_DPP-NEXT:    s_mov_b32 s6, -1
1366; GFX1164_DPP-NEXT:    ; implicit-def: $vgpr0
1367; GFX1164_DPP-NEXT:    s_and_saveexec_b64 s[8:9], vcc
1368; GFX1164_DPP-NEXT:    s_cbranch_execz .LBB2_2
1369; GFX1164_DPP-NEXT:  ; %bb.1:
1370; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, s4
1371; GFX1164_DPP-NEXT:    s_mov_b32 s7, 0x31016000
1372; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
1373; GFX1164_DPP-NEXT:    s_mov_b32 s4, s2
1374; GFX1164_DPP-NEXT:    s_mov_b32 s5, s3
1375; GFX1164_DPP-NEXT:    buffer_atomic_add_u32 v0, off, s[4:7], 0 glc
1376; GFX1164_DPP-NEXT:    s_waitcnt vmcnt(0)
1377; GFX1164_DPP-NEXT:    buffer_gl1_inv
1378; GFX1164_DPP-NEXT:    buffer_gl0_inv
1379; GFX1164_DPP-NEXT:  .LBB2_2:
1380; GFX1164_DPP-NEXT:    s_or_b64 exec, exec, s[8:9]
1381; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
1382; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s2, v0
1383; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, v3
1384; GFX1164_DPP-NEXT:    s_mov_b32 s3, 0x31016000
1385; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1386; GFX1164_DPP-NEXT:    v_add_nc_u32_e32 v0, s2, v0
1387; GFX1164_DPP-NEXT:    s_mov_b32 s2, s6
1388; GFX1164_DPP-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
1389; GFX1164_DPP-NEXT:    s_endpgm
1390;
1391; GFX1132_DPP-LABEL: add_i32_varying:
1392; GFX1132_DPP:       ; %bb.0: ; %entry
1393; GFX1132_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1394; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
1395; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
1396; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
1397; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1398; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1399; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1400; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1401; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1402; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1403; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
1404; GFX1132_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
1405; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
1406; GFX1132_DPP-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1407; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s4, -1
1408; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1409; GFX1132_DPP-NEXT:    v_mov_b32_e32 v3, 0
1410; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1411; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v1, 31
1412; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1413; GFX1132_DPP-NEXT:    v_readlane_b32 s5, v1, 15
1414; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s4
1415; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1416; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1417; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s4, -1
1418; GFX1132_DPP-NEXT:    v_writelane_b32 v3, s5, 16
1419; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s4
1420; GFX1132_DPP-NEXT:    s_mov_b32 s4, s6
1421; GFX1132_DPP-NEXT:    s_mov_b32 s6, -1
1422; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1423; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr0
1424; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s8, vcc_lo
1425; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB2_2
1426; GFX1132_DPP-NEXT:  ; %bb.1:
1427; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, s4
1428; GFX1132_DPP-NEXT:    s_mov_b32 s7, 0x31016000
1429; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
1430; GFX1132_DPP-NEXT:    s_mov_b32 s4, s2
1431; GFX1132_DPP-NEXT:    s_mov_b32 s5, s3
1432; GFX1132_DPP-NEXT:    buffer_atomic_add_u32 v0, off, s[4:7], 0 glc
1433; GFX1132_DPP-NEXT:    s_waitcnt vmcnt(0)
1434; GFX1132_DPP-NEXT:    buffer_gl1_inv
1435; GFX1132_DPP-NEXT:    buffer_gl0_inv
1436; GFX1132_DPP-NEXT:  .LBB2_2:
1437; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s8
1438; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
1439; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s2, v0
1440; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, v3
1441; GFX1132_DPP-NEXT:    s_mov_b32 s3, 0x31016000
1442; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1443; GFX1132_DPP-NEXT:    v_add_nc_u32_e32 v0, s2, v0
1444; GFX1132_DPP-NEXT:    s_mov_b32 s2, s6
1445; GFX1132_DPP-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
1446; GFX1132_DPP-NEXT:    s_endpgm
1447;
1448; GFX1264_DPP-LABEL: add_i32_varying:
1449; GFX1264_DPP:       ; %bb.0: ; %entry
1450; GFX1264_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1451; GFX1264_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
1452; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
1453; GFX1264_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
1454; GFX1264_DPP-NEXT:    v_mov_b32_e32 v3, 0
1455; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
1456; GFX1264_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1457; GFX1264_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1458; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1459; GFX1264_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1460; GFX1264_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1461; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1462; GFX1264_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
1463; GFX1264_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1464; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1465; GFX1264_DPP-NEXT:    v_readlane_b32 s2, v1, 31
1466; GFX1264_DPP-NEXT:    v_mov_b32_e32 v2, s2
1467; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1468; GFX1264_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
1469; GFX1264_DPP-NEXT:    v_readlane_b32 s6, v1, 15
1470; GFX1264_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1471; GFX1264_DPP-NEXT:    s_mov_b64 exec, s[0:1]
1472; GFX1264_DPP-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1473; GFX1264_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
1474; GFX1264_DPP-NEXT:    v_readlane_b32 s7, v1, 31
1475; GFX1264_DPP-NEXT:    v_writelane_b32 v3, s6, 16
1476; GFX1264_DPP-NEXT:    s_mov_b64 exec, s[4:5]
1477; GFX1264_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1478; GFX1264_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1479; GFX1264_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
1480; GFX1264_DPP-NEXT:    v_readlane_b32 s8, v1, 47
1481; GFX1264_DPP-NEXT:    v_readlane_b32 s9, v1, 63
1482; GFX1264_DPP-NEXT:    v_writelane_b32 v3, s7, 32
1483; GFX1264_DPP-NEXT:    s_mov_b64 exec, s[4:5]
1484; GFX1264_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1485; GFX1264_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1486; GFX1264_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
1487; GFX1264_DPP-NEXT:    s_mov_b32 s4, s9
1488; GFX1264_DPP-NEXT:    v_writelane_b32 v3, s8, 48
1489; GFX1264_DPP-NEXT:    s_wait_alu 0xfffe
1490; GFX1264_DPP-NEXT:    s_mov_b64 exec, s[6:7]
1491; GFX1264_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1492; GFX1264_DPP-NEXT:    s_mov_b32 s6, -1
1493; GFX1264_DPP-NEXT:    ; implicit-def: $vgpr0
1494; GFX1264_DPP-NEXT:    s_and_saveexec_b64 s[8:9], vcc
1495; GFX1264_DPP-NEXT:    s_cbranch_execz .LBB2_2
1496; GFX1264_DPP-NEXT:  ; %bb.1:
1497; GFX1264_DPP-NEXT:    v_mov_b32_e32 v0, s4
1498; GFX1264_DPP-NEXT:    s_mov_b32 s7, 0x31016000
1499; GFX1264_DPP-NEXT:    s_wait_kmcnt 0x0
1500; GFX1264_DPP-NEXT:    s_mov_b32 s4, s2
1501; GFX1264_DPP-NEXT:    s_mov_b32 s5, s3
1502; GFX1264_DPP-NEXT:    buffer_atomic_add_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1503; GFX1264_DPP-NEXT:    s_wait_loadcnt 0x0
1504; GFX1264_DPP-NEXT:    global_inv scope:SCOPE_DEV
1505; GFX1264_DPP-NEXT:  .LBB2_2:
1506; GFX1264_DPP-NEXT:    s_wait_alu 0xfffe
1507; GFX1264_DPP-NEXT:    s_or_b64 exec, exec, s[8:9]
1508; GFX1264_DPP-NEXT:    s_wait_kmcnt 0x0
1509; GFX1264_DPP-NEXT:    v_readfirstlane_b32 s2, v0
1510; GFX1264_DPP-NEXT:    v_mov_b32_e32 v0, v3
1511; GFX1264_DPP-NEXT:    s_mov_b32 s3, 0x31016000
1512; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1513; GFX1264_DPP-NEXT:    v_add_nc_u32_e32 v0, s2, v0
1514; GFX1264_DPP-NEXT:    s_mov_b32 s2, s6
1515; GFX1264_DPP-NEXT:    buffer_store_b32 v0, off, s[0:3], null
1516; GFX1264_DPP-NEXT:    s_endpgm
1517;
1518; GFX1232_DPP-LABEL: add_i32_varying:
1519; GFX1232_DPP:       ; %bb.0: ; %entry
1520; GFX1232_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1521; GFX1232_DPP-NEXT:    s_or_saveexec_b32 s0, -1
1522; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
1523; GFX1232_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
1524; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1525; GFX1232_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1526; GFX1232_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1527; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1528; GFX1232_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1529; GFX1232_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1530; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
1531; GFX1232_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
1532; GFX1232_DPP-NEXT:    s_mov_b32 exec_lo, s0
1533; GFX1232_DPP-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1534; GFX1232_DPP-NEXT:    s_or_saveexec_b32 s4, -1
1535; GFX1232_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1536; GFX1232_DPP-NEXT:    v_mov_b32_e32 v3, 0
1537; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1538; GFX1232_DPP-NEXT:    v_readlane_b32 s6, v1, 31
1539; GFX1232_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1540; GFX1232_DPP-NEXT:    v_readlane_b32 s5, v1, 15
1541; GFX1232_DPP-NEXT:    s_mov_b32 exec_lo, s4
1542; GFX1232_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1543; GFX1232_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1544; GFX1232_DPP-NEXT:    s_or_saveexec_b32 s4, -1
1545; GFX1232_DPP-NEXT:    v_writelane_b32 v3, s5, 16
1546; GFX1232_DPP-NEXT:    s_wait_alu 0xfffe
1547; GFX1232_DPP-NEXT:    s_mov_b32 exec_lo, s4
1548; GFX1232_DPP-NEXT:    s_mov_b32 s4, s6
1549; GFX1232_DPP-NEXT:    s_mov_b32 s6, -1
1550; GFX1232_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1551; GFX1232_DPP-NEXT:    ; implicit-def: $vgpr0
1552; GFX1232_DPP-NEXT:    s_and_saveexec_b32 s8, vcc_lo
1553; GFX1232_DPP-NEXT:    s_cbranch_execz .LBB2_2
1554; GFX1232_DPP-NEXT:  ; %bb.1:
1555; GFX1232_DPP-NEXT:    s_wait_alu 0xfffe
1556; GFX1232_DPP-NEXT:    v_mov_b32_e32 v0, s4
1557; GFX1232_DPP-NEXT:    s_mov_b32 s7, 0x31016000
1558; GFX1232_DPP-NEXT:    s_wait_kmcnt 0x0
1559; GFX1232_DPP-NEXT:    s_mov_b32 s4, s2
1560; GFX1232_DPP-NEXT:    s_mov_b32 s5, s3
1561; GFX1232_DPP-NEXT:    buffer_atomic_add_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1562; GFX1232_DPP-NEXT:    s_wait_loadcnt 0x0
1563; GFX1232_DPP-NEXT:    global_inv scope:SCOPE_DEV
1564; GFX1232_DPP-NEXT:  .LBB2_2:
1565; GFX1232_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s8
1566; GFX1232_DPP-NEXT:    s_wait_kmcnt 0x0
1567; GFX1232_DPP-NEXT:    v_readfirstlane_b32 s2, v0
1568; GFX1232_DPP-NEXT:    v_mov_b32_e32 v0, v3
1569; GFX1232_DPP-NEXT:    s_mov_b32 s3, 0x31016000
1570; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1571; GFX1232_DPP-NEXT:    v_add_nc_u32_e32 v0, s2, v0
1572; GFX1232_DPP-NEXT:    s_mov_b32 s2, s6
1573; GFX1232_DPP-NEXT:    buffer_store_b32 v0, off, s[0:3], null
1574; GFX1232_DPP-NEXT:    s_endpgm
1575entry:
1576  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1577  %old = atomicrmw add ptr addrspace(1) %inout, i32 %lane  syncscope("agent") acq_rel
1578  store i32 %old, ptr addrspace(1) %out
1579  ret void
1580}
1581
1582define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace(1) %inout) {
1583; GFX7LESS-LABEL: add_i64_constant:
1584; GFX7LESS:       ; %bb.0: ; %entry
1585; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
1586; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1587; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
1588; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s7, v0
1589; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1590; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
1591; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1592; GFX7LESS-NEXT:    s_cbranch_execz .LBB3_2
1593; GFX7LESS-NEXT:  ; %bb.1:
1594; GFX7LESS-NEXT:    s_mov_b32 s11, 0xf000
1595; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1596; GFX7LESS-NEXT:    s_mul_i32 s6, s6, 5
1597; GFX7LESS-NEXT:    s_mov_b32 s10, -1
1598; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1599; GFX7LESS-NEXT:    s_mov_b32 s8, s2
1600; GFX7LESS-NEXT:    s_mov_b32 s9, s3
1601; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
1602; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1603; GFX7LESS-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
1604; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
1605; GFX7LESS-NEXT:    buffer_wbinvl1
1606; GFX7LESS-NEXT:  .LBB3_2:
1607; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
1608; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1609; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1610; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1611; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
1612; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v0
1613; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
1614; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
1615; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
1616; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
1617; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s5, v0
1618; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
1619; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1620; GFX7LESS-NEXT:    s_endpgm
1621;
1622; GFX8-LABEL: add_i64_constant:
1623; GFX8:       ; %bb.0: ; %entry
1624; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1625; GFX8-NEXT:    s_mov_b64 s[6:7], exec
1626; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1627; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
1628; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1629; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
1630; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1631; GFX8-NEXT:    s_cbranch_execz .LBB3_2
1632; GFX8-NEXT:  ; %bb.1:
1633; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1634; GFX8-NEXT:    s_mov_b32 s8, s2
1635; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
1636; GFX8-NEXT:    s_mul_i32 s2, s2, 5
1637; GFX8-NEXT:    s_mov_b32 s11, 0xf000
1638; GFX8-NEXT:    s_mov_b32 s10, -1
1639; GFX8-NEXT:    s_mov_b32 s9, s3
1640; GFX8-NEXT:    v_mov_b32_e32 v0, s2
1641; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1642; GFX8-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
1643; GFX8-NEXT:    s_waitcnt vmcnt(0)
1644; GFX8-NEXT:    buffer_wbinvl1_vol
1645; GFX8-NEXT:  .LBB3_2:
1646; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1647; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1648; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
1649; GFX8-NEXT:    v_readfirstlane_b32 s3, v0
1650; GFX8-NEXT:    v_mov_b32_e32 v0, s3
1651; GFX8-NEXT:    v_mov_b32_e32 v1, s2
1652; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
1653; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1654; GFX8-NEXT:    s_mov_b32 s2, -1
1655; GFX8-NEXT:    s_nop 2
1656; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1657; GFX8-NEXT:    s_endpgm
1658;
1659; GFX9-LABEL: add_i64_constant:
1660; GFX9:       ; %bb.0: ; %entry
1661; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1662; GFX9-NEXT:    s_mov_b64 s[6:7], exec
1663; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1664; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
1665; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1666; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
1667; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1668; GFX9-NEXT:    s_cbranch_execz .LBB3_2
1669; GFX9-NEXT:  ; %bb.1:
1670; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1671; GFX9-NEXT:    s_mov_b32 s8, s2
1672; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
1673; GFX9-NEXT:    s_mul_i32 s2, s2, 5
1674; GFX9-NEXT:    s_mov_b32 s11, 0xf000
1675; GFX9-NEXT:    s_mov_b32 s10, -1
1676; GFX9-NEXT:    s_mov_b32 s9, s3
1677; GFX9-NEXT:    v_mov_b32_e32 v0, s2
1678; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1679; GFX9-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
1680; GFX9-NEXT:    s_waitcnt vmcnt(0)
1681; GFX9-NEXT:    buffer_wbinvl1_vol
1682; GFX9-NEXT:  .LBB3_2:
1683; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1684; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1685; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
1686; GFX9-NEXT:    v_readfirstlane_b32 s3, v0
1687; GFX9-NEXT:    v_mov_b32_e32 v0, s3
1688; GFX9-NEXT:    v_mov_b32_e32 v1, s2
1689; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
1690; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1691; GFX9-NEXT:    s_mov_b32 s2, -1
1692; GFX9-NEXT:    s_nop 2
1693; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1694; GFX9-NEXT:    s_endpgm
1695;
1696; GFX1064-LABEL: add_i64_constant:
1697; GFX1064:       ; %bb.0: ; %entry
1698; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1699; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
1700; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1701; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
1702; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
1703; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1704; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1705; GFX1064-NEXT:    s_cbranch_execz .LBB3_2
1706; GFX1064-NEXT:  ; %bb.1:
1707; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1708; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
1709; GFX1064-NEXT:    s_mul_i32 s6, s6, 5
1710; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
1711; GFX1064-NEXT:    v_mov_b32_e32 v0, s6
1712; GFX1064-NEXT:    s_mov_b32 s10, -1
1713; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1714; GFX1064-NEXT:    s_mov_b32 s8, s2
1715; GFX1064-NEXT:    s_mov_b32 s9, s3
1716; GFX1064-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
1717; GFX1064-NEXT:    s_waitcnt vmcnt(0)
1718; GFX1064-NEXT:    buffer_gl1_inv
1719; GFX1064-NEXT:    buffer_gl0_inv
1720; GFX1064-NEXT:  .LBB3_2:
1721; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1722; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1723; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1724; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
1725; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
1726; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3]
1727; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1728; GFX1064-NEXT:    s_mov_b32 s2, -1
1729; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1730; GFX1064-NEXT:    s_endpgm
1731;
1732; GFX1032-LABEL: add_i64_constant:
1733; GFX1032:       ; %bb.0: ; %entry
1734; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1735; GFX1032-NEXT:    s_mov_b32 s6, exec_lo
1736; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
1737; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s6, 0
1738; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
1739; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1740; GFX1032-NEXT:    s_cbranch_execz .LBB3_2
1741; GFX1032-NEXT:  ; %bb.1:
1742; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s6
1743; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1744; GFX1032-NEXT:    s_mul_i32 s5, s5, 5
1745; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
1746; GFX1032-NEXT:    v_mov_b32_e32 v0, s5
1747; GFX1032-NEXT:    s_mov_b32 s10, -1
1748; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1749; GFX1032-NEXT:    s_mov_b32 s8, s2
1750; GFX1032-NEXT:    s_mov_b32 s9, s3
1751; GFX1032-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
1752; GFX1032-NEXT:    s_waitcnt vmcnt(0)
1753; GFX1032-NEXT:    buffer_gl1_inv
1754; GFX1032-NEXT:    buffer_gl0_inv
1755; GFX1032-NEXT:  .LBB3_2:
1756; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1757; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1758; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1759; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
1760; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
1761; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3]
1762; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1763; GFX1032-NEXT:    s_mov_b32 s2, -1
1764; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1765; GFX1032-NEXT:    s_endpgm
1766;
1767; GFX1164-LABEL: add_i64_constant:
1768; GFX1164:       ; %bb.0: ; %entry
1769; GFX1164-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1770; GFX1164-NEXT:    s_mov_b64 s[6:7], exec
1771; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
1772; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1773; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1774; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
1775; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
1776; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
1777; GFX1164-NEXT:    s_cbranch_execz .LBB3_2
1778; GFX1164-NEXT:  ; %bb.1:
1779; GFX1164-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1780; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
1781; GFX1164-NEXT:    s_mul_i32 s6, s6, 5
1782; GFX1164-NEXT:    s_mov_b32 s11, 0x31016000
1783; GFX1164-NEXT:    v_mov_b32_e32 v0, s6
1784; GFX1164-NEXT:    s_mov_b32 s10, -1
1785; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1786; GFX1164-NEXT:    s_mov_b32 s8, s2
1787; GFX1164-NEXT:    s_mov_b32 s9, s3
1788; GFX1164-NEXT:    buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc
1789; GFX1164-NEXT:    s_waitcnt vmcnt(0)
1790; GFX1164-NEXT:    buffer_gl1_inv
1791; GFX1164-NEXT:    buffer_gl0_inv
1792; GFX1164-NEXT:  .LBB3_2:
1793; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
1794; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1795; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
1796; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
1797; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1798; GFX1164-NEXT:    v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
1799; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
1800; GFX1164-NEXT:    s_mov_b32 s2, -1
1801; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1802; GFX1164-NEXT:    s_endpgm
1803;
1804; GFX1132-LABEL: add_i64_constant:
1805; GFX1132:       ; %bb.0: ; %entry
1806; GFX1132-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1807; GFX1132-NEXT:    s_mov_b32 s6, exec_lo
1808; GFX1132-NEXT:    s_mov_b32 s4, exec_lo
1809; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, s6, 0
1810; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
1811; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1812; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
1813; GFX1132-NEXT:    s_cbranch_execz .LBB3_2
1814; GFX1132-NEXT:  ; %bb.1:
1815; GFX1132-NEXT:    s_bcnt1_i32_b32 s5, s6
1816; GFX1132-NEXT:    s_mov_b32 s11, 0x31016000
1817; GFX1132-NEXT:    s_mul_i32 s5, s5, 5
1818; GFX1132-NEXT:    s_mov_b32 s10, -1
1819; GFX1132-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, 0
1820; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1821; GFX1132-NEXT:    s_mov_b32 s8, s2
1822; GFX1132-NEXT:    s_mov_b32 s9, s3
1823; GFX1132-NEXT:    buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc
1824; GFX1132-NEXT:    s_waitcnt vmcnt(0)
1825; GFX1132-NEXT:    buffer_gl1_inv
1826; GFX1132-NEXT:    buffer_gl0_inv
1827; GFX1132-NEXT:  .LBB3_2:
1828; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1829; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1830; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
1831; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
1832; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1833; GFX1132-NEXT:    v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
1834; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
1835; GFX1132-NEXT:    s_mov_b32 s2, -1
1836; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1837; GFX1132-NEXT:    s_endpgm
1838;
1839; GFX1264-LABEL: add_i64_constant:
1840; GFX1264:       ; %bb.0: ; %entry
1841; GFX1264-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1842; GFX1264-NEXT:    s_mov_b64 s[6:7], exec
1843; GFX1264-NEXT:    s_mov_b32 s9, 0
1844; GFX1264-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1845; GFX1264-NEXT:    s_mov_b64 s[4:5], exec
1846; GFX1264-NEXT:    s_wait_alu 0xfffe
1847; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1848; GFX1264-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
1849; GFX1264-NEXT:    ; implicit-def: $vgpr0_vgpr1
1850; GFX1264-NEXT:    v_cmpx_eq_u32_e32 0, v2
1851; GFX1264-NEXT:    s_cbranch_execz .LBB3_2
1852; GFX1264-NEXT:  ; %bb.1:
1853; GFX1264-NEXT:    s_bcnt1_i32_b64 s8, s[6:7]
1854; GFX1264-NEXT:    s_mov_b32 s11, 0x31016000
1855; GFX1264-NEXT:    s_mul_u64 s[6:7], s[8:9], 5
1856; GFX1264-NEXT:    s_mov_b32 s10, -1
1857; GFX1264-NEXT:    s_wait_alu 0xfffe
1858; GFX1264-NEXT:    v_mov_b32_e32 v0, s6
1859; GFX1264-NEXT:    v_mov_b32_e32 v1, s7
1860; GFX1264-NEXT:    s_wait_kmcnt 0x0
1861; GFX1264-NEXT:    s_mov_b32 s8, s2
1862; GFX1264-NEXT:    s_mov_b32 s9, s3
1863; GFX1264-NEXT:    buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1864; GFX1264-NEXT:    s_wait_loadcnt 0x0
1865; GFX1264-NEXT:    global_inv scope:SCOPE_DEV
1866; GFX1264-NEXT:  .LBB3_2:
1867; GFX1264-NEXT:    s_or_b64 exec, exec, s[4:5]
1868; GFX1264-NEXT:    s_wait_kmcnt 0x0
1869; GFX1264-NEXT:    v_readfirstlane_b32 s3, v1
1870; GFX1264-NEXT:    v_readfirstlane_b32 s2, v0
1871; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1872; GFX1264-NEXT:    v_mad_co_u64_u32 v[0:1], null, v2, 5, s[2:3]
1873; GFX1264-NEXT:    s_mov_b32 s3, 0x31016000
1874; GFX1264-NEXT:    s_mov_b32 s2, -1
1875; GFX1264-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
1876; GFX1264-NEXT:    s_endpgm
1877;
1878; GFX1232-LABEL: add_i64_constant:
1879; GFX1232:       ; %bb.0: ; %entry
1880; GFX1232-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1881; GFX1232-NEXT:    s_mov_b32 s7, exec_lo
1882; GFX1232-NEXT:    s_mov_b32 s5, 0
1883; GFX1232-NEXT:    v_mbcnt_lo_u32_b32 v2, s7, 0
1884; GFX1232-NEXT:    s_mov_b32 s6, exec_lo
1885; GFX1232-NEXT:    ; implicit-def: $vgpr0_vgpr1
1886; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1887; GFX1232-NEXT:    v_cmpx_eq_u32_e32 0, v2
1888; GFX1232-NEXT:    s_cbranch_execz .LBB3_2
1889; GFX1232-NEXT:  ; %bb.1:
1890; GFX1232-NEXT:    s_wait_alu 0xfffe
1891; GFX1232-NEXT:    s_bcnt1_i32_b32 s4, s7
1892; GFX1232-NEXT:    s_mov_b32 s11, 0x31016000
1893; GFX1232-NEXT:    s_mul_u64 s[4:5], s[4:5], 5
1894; GFX1232-NEXT:    s_mov_b32 s10, -1
1895; GFX1232-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
1896; GFX1232-NEXT:    s_wait_kmcnt 0x0
1897; GFX1232-NEXT:    s_mov_b32 s8, s2
1898; GFX1232-NEXT:    s_mov_b32 s9, s3
1899; GFX1232-NEXT:    buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1900; GFX1232-NEXT:    s_wait_loadcnt 0x0
1901; GFX1232-NEXT:    global_inv scope:SCOPE_DEV
1902; GFX1232-NEXT:  .LBB3_2:
1903; GFX1232-NEXT:    s_wait_alu 0xfffe
1904; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s6
1905; GFX1232-NEXT:    s_wait_kmcnt 0x0
1906; GFX1232-NEXT:    v_readfirstlane_b32 s3, v1
1907; GFX1232-NEXT:    v_readfirstlane_b32 s2, v0
1908; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1909; GFX1232-NEXT:    v_mad_co_u64_u32 v[0:1], null, v2, 5, s[2:3]
1910; GFX1232-NEXT:    s_mov_b32 s3, 0x31016000
1911; GFX1232-NEXT:    s_mov_b32 s2, -1
1912; GFX1232-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
1913; GFX1232-NEXT:    s_endpgm
1914entry:
1915  %old = atomicrmw add ptr addrspace(1) %inout, i64 5  syncscope("agent") acq_rel
1916  store i64 %old, ptr addrspace(1) %out
1917  ret void
1918}
1919
; Test: 64-bit atomic add whose operand is a kernel argument (%additive).
; The kernel performs `atomicrmw add` of %additive on %inout with
; syncscope("agent") and acq_rel ordering, then stores the value returned by
; the atomic to %out.
;
; What the autogenerated checks below show for every target:
;   * the exec mask is copied to SGPRs at entry and mbcnt is computed from
;     that copy; only the lane whose mbcnt result is 0 executes the buffer
;     atomic;
;   * the atomic's operand is the 64-bit value loaded from the kernarg
;     segment right after the two pointers (i.e. %additive) multiplied by
;     the bit count (s_bcnt1) of the saved exec mask;
;   * after exec is restored, v_readfirstlane reads the atomic's result and
;     each lane adds (that kernarg value * its mbcnt result) to it before
;     the final 64-bit buffer store.
; Only the check prefixes without an _ITERATIVE suffix are used for this
; function. The check lines are tool-generated (see the NOTE at the top of
; the file); regenerate them rather than editing by hand.
1920define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(1) %inout, i64 %additive) {
1921; GFX7LESS-LABEL: add_i64_uniform:
1922; GFX7LESS:       ; %bb.0: ; %entry
1923; GFX7LESS-NEXT:    s_mov_b64 s[8:9], exec
1924; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1925; GFX7LESS-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
1926; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s8, 0
1927; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s9, v0
1928; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1929; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
1930; GFX7LESS-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1931; GFX7LESS-NEXT:    s_cbranch_execz .LBB4_2
1932; GFX7LESS-NEXT:  ; %bb.1:
1933; GFX7LESS-NEXT:    s_mov_b32 s15, 0xf000
1934; GFX7LESS-NEXT:    s_mov_b32 s14, -1
1935; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1936; GFX7LESS-NEXT:    s_mov_b32 s12, s2
1937; GFX7LESS-NEXT:    s_mov_b32 s13, s3
1938; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[8:9]
1939; GFX7LESS-NEXT:    s_mul_i32 s3, s5, s2
1940; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s2
1941; GFX7LESS-NEXT:    v_mul_hi_u32 v0, s4, v0
1942; GFX7LESS-NEXT:    s_mul_i32 s2, s4, s2
1943; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, s3, v0
1944; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s2
1945; GFX7LESS-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc
1946; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
1947; GFX7LESS-NEXT:    buffer_wbinvl1
1948; GFX7LESS-NEXT:  .LBB4_2:
1949; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[6:7]
1950; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1951; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1952; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1953; GFX7LESS-NEXT:    v_readfirstlane_b32 s6, v1
1954; GFX7LESS-NEXT:    v_readfirstlane_b32 s7, v0
1955; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
1956; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s5, v2
1957; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s4, v2
1958; GFX7LESS-NEXT:    v_mul_lo_u32 v2, s4, v2
1959; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
1960; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s6
1961; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s7, v2
1962; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
1963; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1964; GFX7LESS-NEXT:    s_endpgm
1965;
1966; GFX8-LABEL: add_i64_uniform:
1967; GFX8:       ; %bb.0: ; %entry
1968; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1969; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1970; GFX8-NEXT:    s_mov_b64 s[8:9], exec
1971; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
1972; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
1973; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1974; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
1975; GFX8-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1976; GFX8-NEXT:    s_cbranch_execz .LBB4_2
1977; GFX8-NEXT:  ; %bb.1:
1978; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1979; GFX8-NEXT:    s_mov_b32 s12, s2
1980; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[8:9]
1981; GFX8-NEXT:    v_mov_b32_e32 v0, s2
1982; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], s4, v0, 0
1983; GFX8-NEXT:    s_mul_i32 s2, s5, s2
1984; GFX8-NEXT:    s_mov_b32 s15, 0xf000
1985; GFX8-NEXT:    s_mov_b32 s14, -1
1986; GFX8-NEXT:    s_mov_b32 s13, s3
1987; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s2, v1
1988; GFX8-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc
1989; GFX8-NEXT:    s_waitcnt vmcnt(0)
1990; GFX8-NEXT:    buffer_wbinvl1_vol
1991; GFX8-NEXT:  .LBB4_2:
1992; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
1993; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1994; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
1995; GFX8-NEXT:    v_readfirstlane_b32 s3, v0
1996; GFX8-NEXT:    v_mov_b32_e32 v0, s3
1997; GFX8-NEXT:    v_mov_b32_e32 v1, s2
1998; GFX8-NEXT:    v_mul_lo_u32 v3, s5, v2
1999; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], s4, v2, v[0:1]
2000; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2001; GFX8-NEXT:    s_mov_b32 s2, -1
2002; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
2003; GFX8-NEXT:    s_nop 1
2004; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2005; GFX8-NEXT:    s_endpgm
2006;
2007; GFX9-LABEL: add_i64_uniform:
2008; GFX9:       ; %bb.0: ; %entry
2009; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2010; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2011; GFX9-NEXT:    s_mov_b64 s[8:9], exec
2012; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
2013; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
2014; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2015; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
2016; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2017; GFX9-NEXT:    s_cbranch_execz .LBB4_2
2018; GFX9-NEXT:  ; %bb.1:
2019; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2020; GFX9-NEXT:    s_mov_b32 s12, s2
2021; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[8:9]
2022; GFX9-NEXT:    s_mov_b32 s13, s3
2023; GFX9-NEXT:    s_mul_i32 s3, s7, s2
2024; GFX9-NEXT:    s_mul_hi_u32 s8, s6, s2
2025; GFX9-NEXT:    s_add_i32 s8, s8, s3
2026; GFX9-NEXT:    s_mul_i32 s2, s6, s2
2027; GFX9-NEXT:    s_mov_b32 s15, 0xf000
2028; GFX9-NEXT:    s_mov_b32 s14, -1
2029; GFX9-NEXT:    v_mov_b32_e32 v0, s2
2030; GFX9-NEXT:    v_mov_b32_e32 v1, s8
2031; GFX9-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc
2032; GFX9-NEXT:    s_waitcnt vmcnt(0)
2033; GFX9-NEXT:    buffer_wbinvl1_vol
2034; GFX9-NEXT:  .LBB4_2:
2035; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
2036; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2037; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
2038; GFX9-NEXT:    v_readfirstlane_b32 s3, v0
2039; GFX9-NEXT:    v_mov_b32_e32 v0, s3
2040; GFX9-NEXT:    v_mov_b32_e32 v1, s2
2041; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], s6, v2, v[0:1]
2042; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[2:3], s7, v2, v[1:2]
2043; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2044; GFX9-NEXT:    s_mov_b32 s2, -1
2045; GFX9-NEXT:    s_nop 2
2046; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2047; GFX9-NEXT:    s_endpgm
2048;
2049; GFX1064-LABEL: add_i64_uniform:
2050; GFX1064:       ; %bb.0: ; %entry
2051; GFX1064-NEXT:    s_clause 0x1
2052; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2053; GFX1064-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2054; GFX1064-NEXT:    s_mov_b64 s[8:9], exec
2055; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
2056; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
2057; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
2058; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2059; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2060; GFX1064-NEXT:    s_cbranch_execz .LBB4_2
2061; GFX1064-NEXT:  ; %bb.1:
2062; GFX1064-NEXT:    s_bcnt1_i32_b64 s8, s[8:9]
2063; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
2064; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2065; GFX1064-NEXT:    s_mul_i32 s9, s7, s8
2066; GFX1064-NEXT:    s_mul_hi_u32 s10, s6, s8
2067; GFX1064-NEXT:    s_mul_i32 s8, s6, s8
2068; GFX1064-NEXT:    s_add_i32 s10, s10, s9
2069; GFX1064-NEXT:    v_mov_b32_e32 v0, s8
2070; GFX1064-NEXT:    v_mov_b32_e32 v1, s10
2071; GFX1064-NEXT:    s_mov_b32 s10, -1
2072; GFX1064-NEXT:    s_mov_b32 s8, s2
2073; GFX1064-NEXT:    s_mov_b32 s9, s3
2074; GFX1064-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
2075; GFX1064-NEXT:    s_waitcnt vmcnt(0)
2076; GFX1064-NEXT:    buffer_gl1_inv
2077; GFX1064-NEXT:    buffer_gl0_inv
2078; GFX1064-NEXT:  .LBB4_2:
2079; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2080; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2081; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2082; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
2083; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
2084; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], s6, v2, s[2:3]
2085; GFX1064-NEXT:    v_mad_u64_u32 v[1:2], s[2:3], s7, v2, v[1:2]
2086; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2087; GFX1064-NEXT:    s_mov_b32 s2, -1
2088; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2089; GFX1064-NEXT:    s_endpgm
2090;
2091; GFX1032-LABEL: add_i64_uniform:
2092; GFX1032:       ; %bb.0: ; %entry
2093; GFX1032-NEXT:    s_clause 0x1
2094; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2095; GFX1032-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2096; GFX1032-NEXT:    s_mov_b32 s8, exec_lo
2097; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
2098; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s8, 0
2099; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
2100; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
2101; GFX1032-NEXT:    s_cbranch_execz .LBB4_2
2102; GFX1032-NEXT:  ; %bb.1:
2103; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s8
2104; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
2105; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2106; GFX1032-NEXT:    s_mul_i32 s8, s7, s5
2107; GFX1032-NEXT:    s_mul_hi_u32 s9, s6, s5
2108; GFX1032-NEXT:    s_mul_i32 s5, s6, s5
2109; GFX1032-NEXT:    s_add_i32 s9, s9, s8
2110; GFX1032-NEXT:    v_mov_b32_e32 v0, s5
2111; GFX1032-NEXT:    v_mov_b32_e32 v1, s9
2112; GFX1032-NEXT:    s_mov_b32 s10, -1
2113; GFX1032-NEXT:    s_mov_b32 s8, s2
2114; GFX1032-NEXT:    s_mov_b32 s9, s3
2115; GFX1032-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
2116; GFX1032-NEXT:    s_waitcnt vmcnt(0)
2117; GFX1032-NEXT:    buffer_gl1_inv
2118; GFX1032-NEXT:    buffer_gl0_inv
2119; GFX1032-NEXT:  .LBB4_2:
2120; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2121; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2122; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2123; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
2124; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
2125; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s2, s6, v2, s[2:3]
2126; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2127; GFX1032-NEXT:    v_mad_u64_u32 v[1:2], s2, s7, v2, v[1:2]
2128; GFX1032-NEXT:    s_mov_b32 s2, -1
2129; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2130; GFX1032-NEXT:    s_endpgm
2131;
2132; GFX1164-LABEL: add_i64_uniform:
2133; GFX1164:       ; %bb.0: ; %entry
2134; GFX1164-NEXT:    s_clause 0x1
2135; GFX1164-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2136; GFX1164-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
2137; GFX1164-NEXT:    s_mov_b64 s[8:9], exec
2138; GFX1164-NEXT:    s_mov_b64 s[6:7], exec
2139; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
2140; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2141; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
2142; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
2143; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
2144; GFX1164-NEXT:    s_cbranch_execz .LBB4_2
2145; GFX1164-NEXT:  ; %bb.1:
2146; GFX1164-NEXT:    s_bcnt1_i32_b64 s8, s[8:9]
2147; GFX1164-NEXT:    s_mov_b32 s11, 0x31016000
2148; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2149; GFX1164-NEXT:    s_mul_i32 s9, s5, s8
2150; GFX1164-NEXT:    s_mul_hi_u32 s10, s4, s8
2151; GFX1164-NEXT:    s_mul_i32 s8, s4, s8
2152; GFX1164-NEXT:    s_add_i32 s10, s10, s9
2153; GFX1164-NEXT:    v_mov_b32_e32 v0, s8
2154; GFX1164-NEXT:    v_mov_b32_e32 v1, s10
2155; GFX1164-NEXT:    s_mov_b32 s10, -1
2156; GFX1164-NEXT:    s_mov_b32 s8, s2
2157; GFX1164-NEXT:    s_mov_b32 s9, s3
2158; GFX1164-NEXT:    buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc
2159; GFX1164-NEXT:    s_waitcnt vmcnt(0)
2160; GFX1164-NEXT:    buffer_gl1_inv
2161; GFX1164-NEXT:    buffer_gl0_inv
2162; GFX1164-NEXT:  .LBB4_2:
2163; GFX1164-NEXT:    s_or_b64 exec, exec, s[6:7]
2164; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2165; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
2166; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
2167; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
2168; GFX1164-NEXT:    v_mad_u64_u32 v[0:1], null, s4, v2, s[2:3]
2169; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
2170; GFX1164-NEXT:    s_mov_b32 s2, -1
2171; GFX1164-NEXT:    v_mad_u64_u32 v[3:4], null, s5, v2, v[1:2]
2172; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2173; GFX1164-NEXT:    v_mov_b32_e32 v1, v3
2174; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
2175; GFX1164-NEXT:    s_endpgm
2176;
2177; GFX1132-LABEL: add_i64_uniform:
2178; GFX1132:       ; %bb.0: ; %entry
2179; GFX1132-NEXT:    s_clause 0x1
2180; GFX1132-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2181; GFX1132-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
2182; GFX1132-NEXT:    s_mov_b32 s7, exec_lo
2183; GFX1132-NEXT:    s_mov_b32 s6, exec_lo
2184; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, s7, 0
2185; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
2186; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2187; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
2188; GFX1132-NEXT:    s_cbranch_execz .LBB4_2
2189; GFX1132-NEXT:  ; %bb.1:
2190; GFX1132-NEXT:    s_bcnt1_i32_b32 s7, s7
2191; GFX1132-NEXT:    s_mov_b32 s11, 0x31016000
2192; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2193; GFX1132-NEXT:    s_mul_i32 s8, s5, s7
2194; GFX1132-NEXT:    s_mul_hi_u32 s9, s4, s7
2195; GFX1132-NEXT:    s_mul_i32 s7, s4, s7
2196; GFX1132-NEXT:    s_add_i32 s9, s9, s8
2197; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2198; GFX1132-NEXT:    v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v1, s9
2199; GFX1132-NEXT:    s_mov_b32 s10, -1
2200; GFX1132-NEXT:    s_mov_b32 s8, s2
2201; GFX1132-NEXT:    s_mov_b32 s9, s3
2202; GFX1132-NEXT:    buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc
2203; GFX1132-NEXT:    s_waitcnt vmcnt(0)
2204; GFX1132-NEXT:    buffer_gl1_inv
2205; GFX1132-NEXT:    buffer_gl0_inv
2206; GFX1132-NEXT:  .LBB4_2:
2207; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s6
2208; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2209; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
2210; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
2211; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
2212; GFX1132-NEXT:    v_mad_u64_u32 v[0:1], null, s4, v2, s[2:3]
2213; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
2214; GFX1132-NEXT:    s_mov_b32 s2, -1
2215; GFX1132-NEXT:    v_mad_u64_u32 v[3:4], null, s5, v2, v[1:2]
2216; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2217; GFX1132-NEXT:    v_mov_b32_e32 v1, v3
2218; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
2219; GFX1132-NEXT:    s_endpgm
2220;
2221; GFX1264-LABEL: add_i64_uniform:
2222; GFX1264:       ; %bb.0: ; %entry
2223; GFX1264-NEXT:    s_clause 0x1
2224; GFX1264-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2225; GFX1264-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
2226; GFX1264-NEXT:    s_mov_b64 s[8:9], exec
2227; GFX1264-NEXT:    s_mov_b32 s11, 0
2228; GFX1264-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
2229; GFX1264-NEXT:    s_mov_b64 s[6:7], exec
2230; GFX1264-NEXT:    s_wait_alu 0xfffe
2231; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2232; GFX1264-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
2233; GFX1264-NEXT:    ; implicit-def: $vgpr0_vgpr1
2234; GFX1264-NEXT:    v_cmpx_eq_u32_e32 0, v2
2235; GFX1264-NEXT:    s_cbranch_execz .LBB4_2
2236; GFX1264-NEXT:  ; %bb.1:
2237; GFX1264-NEXT:    s_bcnt1_i32_b64 s10, s[8:9]
2238; GFX1264-NEXT:    s_wait_kmcnt 0x0
2239; GFX1264-NEXT:    s_mul_u64 s[8:9], s[4:5], s[10:11]
2240; GFX1264-NEXT:    s_mov_b32 s11, 0x31016000
2241; GFX1264-NEXT:    s_wait_alu 0xfffe
2242; GFX1264-NEXT:    v_mov_b32_e32 v0, s8
2243; GFX1264-NEXT:    v_mov_b32_e32 v1, s9
2244; GFX1264-NEXT:    s_mov_b32 s10, -1
2245; GFX1264-NEXT:    s_mov_b32 s8, s2
2246; GFX1264-NEXT:    s_mov_b32 s9, s3
2247; GFX1264-NEXT:    buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
2248; GFX1264-NEXT:    s_wait_loadcnt 0x0
2249; GFX1264-NEXT:    global_inv scope:SCOPE_DEV
2250; GFX1264-NEXT:  .LBB4_2:
2251; GFX1264-NEXT:    s_or_b64 exec, exec, s[6:7]
2252; GFX1264-NEXT:    s_wait_kmcnt 0x0
2253; GFX1264-NEXT:    v_readfirstlane_b32 s3, v1
2254; GFX1264-NEXT:    v_readfirstlane_b32 s2, v0
2255; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
2256; GFX1264-NEXT:    v_mad_co_u64_u32 v[0:1], null, s4, v2, s[2:3]
2257; GFX1264-NEXT:    s_mov_b32 s3, 0x31016000
2258; GFX1264-NEXT:    s_mov_b32 s2, -1
2259; GFX1264-NEXT:    v_mad_co_u64_u32 v[1:2], null, s5, v2, v[1:2]
2260; GFX1264-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
2261; GFX1264-NEXT:    s_endpgm
2262;
2263; GFX1232-LABEL: add_i64_uniform:
2264; GFX1232:       ; %bb.0: ; %entry
2265; GFX1232-NEXT:    s_clause 0x1
2266; GFX1232-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2267; GFX1232-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
2268; GFX1232-NEXT:    s_mov_b32 s6, exec_lo
2269; GFX1232-NEXT:    s_mov_b32 s7, 0
2270; GFX1232-NEXT:    v_mbcnt_lo_u32_b32 v2, s6, 0
2271; GFX1232-NEXT:    s_mov_b32 s8, exec_lo
2272; GFX1232-NEXT:    ; implicit-def: $vgpr0_vgpr1
2273; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2274; GFX1232-NEXT:    v_cmpx_eq_u32_e32 0, v2
2275; GFX1232-NEXT:    s_cbranch_execz .LBB4_2
2276; GFX1232-NEXT:  ; %bb.1:
2277; GFX1232-NEXT:    s_wait_alu 0xfffe
2278; GFX1232-NEXT:    s_bcnt1_i32_b32 s6, s6
2279; GFX1232-NEXT:    s_mov_b32 s15, 0x31016000
2280; GFX1232-NEXT:    s_wait_kmcnt 0x0
2281; GFX1232-NEXT:    s_wait_alu 0xfffe
2282; GFX1232-NEXT:    s_mul_u64 s[6:7], s[4:5], s[6:7]
2283; GFX1232-NEXT:    s_mov_b32 s14, -1
2284; GFX1232-NEXT:    s_wait_alu 0xfffe
2285; GFX1232-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
2286; GFX1232-NEXT:    s_mov_b32 s12, s2
2287; GFX1232-NEXT:    s_mov_b32 s13, s3
2288; GFX1232-NEXT:    buffer_atomic_add_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
2289; GFX1232-NEXT:    s_wait_loadcnt 0x0
2290; GFX1232-NEXT:    global_inv scope:SCOPE_DEV
2291; GFX1232-NEXT:  .LBB4_2:
2292; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s8
2293; GFX1232-NEXT:    s_wait_kmcnt 0x0
2294; GFX1232-NEXT:    v_readfirstlane_b32 s3, v1
2295; GFX1232-NEXT:    v_readfirstlane_b32 s2, v0
2296; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
2297; GFX1232-NEXT:    v_mad_co_u64_u32 v[0:1], null, s4, v2, s[2:3]
2298; GFX1232-NEXT:    s_mov_b32 s3, 0x31016000
2299; GFX1232-NEXT:    s_mov_b32 s2, -1
2300; GFX1232-NEXT:    v_mad_co_u64_u32 v[1:2], null, s5, v2, v[1:2]
2301; GFX1232-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
2302; GFX1232-NEXT:    s_endpgm
2303entry:
  ; %old is the value the atomic returns, i.e. the memory contents before the add.
2304  %old = atomicrmw add ptr addrspace(1) %inout, i64 %additive  syncscope("agent") acq_rel
  ; Storing %old keeps the atomic's result live, so the returning form of the atomic is emitted.
2305  store i64 %old, ptr addrspace(1) %out
2306  ret void
2307}
2308
2309define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(1) %inout) {
2310; GFX7LESS_ITERATIVE-LABEL: add_i64_varying:
2311; GFX7LESS_ITERATIVE:       ; %bb.0: ; %entry
2312; GFX7LESS_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
2313; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
2314; GFX7LESS_ITERATIVE-NEXT:    s_mov_b64 s[6:7], 0
2315; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
2316; GFX7LESS_ITERATIVE-NEXT:  .LBB5_1: ; %ComputeLoop
2317; GFX7LESS_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
2318; GFX7LESS_ITERATIVE-NEXT:    s_ff1_i32_b64 s2, s[0:1]
2319; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, s2
2320; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s3, v3, s2
2321; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s2
2322; GFX7LESS_ITERATIVE-NEXT:    v_writelane_b32 v2, s7, m0
2323; GFX7LESS_ITERATIVE-NEXT:    v_writelane_b32 v1, s6, m0
2324; GFX7LESS_ITERATIVE-NEXT:    s_add_u32 s6, s6, s8
2325; GFX7LESS_ITERATIVE-NEXT:    s_addc_u32 s7, s7, s3
2326; GFX7LESS_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
2327; GFX7LESS_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
2328; GFX7LESS_ITERATIVE-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
2329; GFX7LESS_ITERATIVE-NEXT:    s_and_b64 vcc, exec, s[2:3]
2330; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_vccnz .LBB5_1
2331; GFX7LESS_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
2332; GFX7LESS_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2333; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
2334; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
2335; GFX7LESS_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2336; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
2337; GFX7LESS_ITERATIVE-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2338; GFX7LESS_ITERATIVE-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
2339; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_execz .LBB5_4
2340; GFX7LESS_ITERATIVE-NEXT:  ; %bb.3:
2341; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s11, 0xf000
2342; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s10, -1
2343; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
2344; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s8, s2
2345; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s9, s3
2346; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s6
2347; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s7
2348; GFX7LESS_ITERATIVE-NEXT:    buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc
2349; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
2350; GFX7LESS_ITERATIVE-NEXT:    buffer_wbinvl1
2351; GFX7LESS_ITERATIVE-NEXT:  .LBB5_4:
2352; GFX7LESS_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[4:5]
2353; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
2354; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
2355; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s2, -1
2356; GFX7LESS_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v4
2357; GFX7LESS_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v3
2358; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt expcnt(0)
2359; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s4
2360; GFX7LESS_ITERATIVE-NEXT:    v_add_i32_e32 v0, vcc, s5, v1
2361; GFX7LESS_ITERATIVE-NEXT:    v_addc_u32_e32 v1, vcc, v3, v2, vcc
2362; GFX7LESS_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2363; GFX7LESS_ITERATIVE-NEXT:    s_endpgm
2364;
2365; GFX8_ITERATIVE-LABEL: add_i64_varying:
2366; GFX8_ITERATIVE:       ; %bb.0: ; %entry
2367; GFX8_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
2368; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
2369; GFX8_ITERATIVE-NEXT:    s_mov_b64 s[6:7], 0
2370; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
2371; GFX8_ITERATIVE-NEXT:  .LBB5_1: ; %ComputeLoop
2372; GFX8_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
2373; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s2, s[0:1]
2374; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s2
2375; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s2
2376; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s3, v3, s2
2377; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v1, s6, m0
2378; GFX8_ITERATIVE-NEXT:    s_add_u32 s6, s6, s8
2379; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v2, s7, m0
2380; GFX8_ITERATIVE-NEXT:    s_addc_u32 s7, s7, s3
2381; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
2382; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
2383; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
2384; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB5_1
2385; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
2386; GFX8_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2387; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2388; GFX8_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2389; GFX8_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2390; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
2391; GFX8_ITERATIVE-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2392; GFX8_ITERATIVE-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
2393; GFX8_ITERATIVE-NEXT:    s_cbranch_execz .LBB5_4
2394; GFX8_ITERATIVE-NEXT:  ; %bb.3:
2395; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s6
2396; GFX8_ITERATIVE-NEXT:    s_mov_b32 s11, 0xf000
2397; GFX8_ITERATIVE-NEXT:    s_mov_b32 s10, -1
2398; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
2399; GFX8_ITERATIVE-NEXT:    s_mov_b32 s8, s2
2400; GFX8_ITERATIVE-NEXT:    s_mov_b32 s9, s3
2401; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s7
2402; GFX8_ITERATIVE-NEXT:    buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc
2403; GFX8_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
2404; GFX8_ITERATIVE-NEXT:    buffer_wbinvl1_vol
2405; GFX8_ITERATIVE-NEXT:  .LBB5_4:
2406; GFX8_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[4:5]
2407; GFX8_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v4
2408; GFX8_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v3
2409; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s4
2410; GFX8_ITERATIVE-NEXT:    v_add_u32_e32 v0, vcc, s5, v1
2411; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
2412; GFX8_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
2413; GFX8_ITERATIVE-NEXT:    s_mov_b32 s2, -1
2414; GFX8_ITERATIVE-NEXT:    v_addc_u32_e32 v1, vcc, v3, v2, vcc
2415; GFX8_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2416; GFX8_ITERATIVE-NEXT:    s_endpgm
2417;
2418; GFX9_ITERATIVE-LABEL: add_i64_varying:
2419; GFX9_ITERATIVE:       ; %bb.0: ; %entry
2420; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
2421; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
2422; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[6:7], 0
2423; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
2424; GFX9_ITERATIVE-NEXT:  .LBB5_1: ; %ComputeLoop
2425; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
2426; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s2, s[0:1]
2427; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s2
2428; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s2
2429; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s3, v3, s2
2430; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s6, m0
2431; GFX9_ITERATIVE-NEXT:    s_add_u32 s6, s6, s8
2432; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v2, s7, m0
2433; GFX9_ITERATIVE-NEXT:    s_addc_u32 s7, s7, s3
2434; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
2435; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
2436; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
2437; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB5_1
2438; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
2439; GFX9_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2440; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2441; GFX9_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2442; GFX9_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2443; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
2444; GFX9_ITERATIVE-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2445; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
2446; GFX9_ITERATIVE-NEXT:    s_cbranch_execz .LBB5_4
2447; GFX9_ITERATIVE-NEXT:  ; %bb.3:
2448; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s6
2449; GFX9_ITERATIVE-NEXT:    s_mov_b32 s11, 0xf000
2450; GFX9_ITERATIVE-NEXT:    s_mov_b32 s10, -1
2451; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
2452; GFX9_ITERATIVE-NEXT:    s_mov_b32 s8, s2
2453; GFX9_ITERATIVE-NEXT:    s_mov_b32 s9, s3
2454; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s7
2455; GFX9_ITERATIVE-NEXT:    buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc
2456; GFX9_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
2457; GFX9_ITERATIVE-NEXT:    buffer_wbinvl1_vol
2458; GFX9_ITERATIVE-NEXT:  .LBB5_4:
2459; GFX9_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[4:5]
2460; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v4
2461; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v3
2462; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s4
2463; GFX9_ITERATIVE-NEXT:    v_add_co_u32_e32 v0, vcc, s5, v1
2464; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
2465; GFX9_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
2466; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, -1
2467; GFX9_ITERATIVE-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v2, vcc
2468; GFX9_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2469; GFX9_ITERATIVE-NEXT:    s_endpgm
2470;
2471; GFX1064_ITERATIVE-LABEL: add_i64_varying:
2472; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
2473; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
2474; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
2475; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[6:7], 0
2476; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
2477; GFX1064_ITERATIVE-NEXT:  .LBB5_1: ; %ComputeLoop
2478; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
2479; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s2, s[0:1]
2480; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
2481; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s8, v3, s2
2482; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s6, s2
2483; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v2, s7, s2
2484; GFX1064_ITERATIVE-NEXT:    s_add_u32 s6, s6, s3
2485; GFX1064_ITERATIVE-NEXT:    s_addc_u32 s7, s7, s8
2486; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
2487; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
2488; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
2489; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB5_1
2490; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
2491; GFX1064_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2492; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2493; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
2494; GFX1064_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2495; GFX1064_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2496; GFX1064_ITERATIVE-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2497; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
2498; GFX1064_ITERATIVE-NEXT:    s_cbranch_execz .LBB5_4
2499; GFX1064_ITERATIVE-NEXT:  ; %bb.3:
2500; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s6
2501; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s7
2502; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
2503; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s10, -1
2504; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
2505; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s8, s2
2506; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s9, s3
2507; GFX1064_ITERATIVE-NEXT:    buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc
2508; GFX1064_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
2509; GFX1064_ITERATIVE-NEXT:    buffer_gl1_inv
2510; GFX1064_ITERATIVE-NEXT:    buffer_gl0_inv
2511; GFX1064_ITERATIVE-NEXT:  .LBB5_4:
2512; GFX1064_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
2513; GFX1064_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[4:5]
2514; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
2515; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
2516; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v4
2517; GFX1064_ITERATIVE-NEXT:    v_add_co_u32 v0, vcc, s2, v1
2518; GFX1064_ITERATIVE-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc
2519; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
2520; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, -1
2521; GFX1064_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2522; GFX1064_ITERATIVE-NEXT:    s_endpgm
2523;
2524; GFX1032_ITERATIVE-LABEL: add_i64_varying:
2525; GFX1032_ITERATIVE:       ; %bb.0: ; %entry
2526; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
2527; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s0, exec_lo
2528; GFX1032_ITERATIVE-NEXT:    s_mov_b64 s[6:7], 0
2529; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
2530; GFX1032_ITERATIVE-NEXT:  .LBB5_1: ; %ComputeLoop
2531; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
2532; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s1, s0
2533; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s2, v0, s1
2534; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s3, v3, s1
2535; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s6, s1
2536; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v2, s7, s1
2537; GFX1032_ITERATIVE-NEXT:    s_add_u32 s6, s6, s2
2538; GFX1032_ITERATIVE-NEXT:    s_addc_u32 s7, s7, s3
2539; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s1, 1, s1
2540; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s0, s0, s1
2541; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
2542; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB5_1
2543; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
2544; GFX1032_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2545; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2546; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
2547; GFX1032_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2548; GFX1032_ITERATIVE-NEXT:    s_and_saveexec_b32 s4, vcc_lo
2549; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s4, exec_lo, s4
2550; GFX1032_ITERATIVE-NEXT:    s_cbranch_execz .LBB5_4
2551; GFX1032_ITERATIVE-NEXT:  ; %bb.3:
2552; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s6
2553; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s7
2554; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
2555; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s10, -1
2556; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
2557; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s8, s2
2558; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s9, s3
2559; GFX1032_ITERATIVE-NEXT:    buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc
2560; GFX1032_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
2561; GFX1032_ITERATIVE-NEXT:    buffer_gl1_inv
2562; GFX1032_ITERATIVE-NEXT:    buffer_gl0_inv
2563; GFX1032_ITERATIVE-NEXT:  .LBB5_4:
2564; GFX1032_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
2565; GFX1032_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2566; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
2567; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
2568; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v4
2569; GFX1032_ITERATIVE-NEXT:    v_add_co_u32 v0, vcc_lo, s2, v1
2570; GFX1032_ITERATIVE-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
2571; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
2572; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, -1
2573; GFX1032_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2574; GFX1032_ITERATIVE-NEXT:    s_endpgm
2575;
2576; GFX1164_ITERATIVE-LABEL: add_i64_varying:
2577; GFX1164_ITERATIVE:       ; %bb.0: ; %entry
2578; GFX1164_ITERATIVE-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
2579; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
2580; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
2581; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[6:7], 0
2582; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
2583; GFX1164_ITERATIVE-NEXT:  .LBB5_1: ; %ComputeLoop
2584; GFX1164_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
2585; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s2, s[0:1]
2586; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
2587; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s3, v2, s2
2588; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s8, v3, s2
2589; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s2
2590; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v1, s7, s2
2591; GFX1164_ITERATIVE-NEXT:    s_add_u32 s6, s6, s3
2592; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
2593; GFX1164_ITERATIVE-NEXT:    s_addc_u32 s7, s7, s8
2594; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
2595; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
2596; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2597; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
2598; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB5_1
2599; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
2600; GFX1164_ITERATIVE-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2601; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
2602; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2603; GFX1164_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
2604; GFX1164_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2605; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr2_vgpr3
2606; GFX1164_ITERATIVE-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2607; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2608; GFX1164_ITERATIVE-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
2609; GFX1164_ITERATIVE-NEXT:    s_cbranch_execz .LBB5_4
2610; GFX1164_ITERATIVE-NEXT:  ; %bb.3:
2611; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s6
2612; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s7
2613; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
2614; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s10, -1
2615; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
2616; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s8, s2
2617; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s9, s3
2618; GFX1164_ITERATIVE-NEXT:    buffer_atomic_add_u64 v[2:3], off, s[8:11], 0 glc
2619; GFX1164_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
2620; GFX1164_ITERATIVE-NEXT:    buffer_gl1_inv
2621; GFX1164_ITERATIVE-NEXT:    buffer_gl0_inv
2622; GFX1164_ITERATIVE-NEXT:  .LBB5_4:
2623; GFX1164_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[4:5]
2624; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
2625; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v2
2626; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v3
2627; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2628; GFX1164_ITERATIVE-NEXT:    v_add_co_u32 v0, vcc, s2, v0
2629; GFX1164_ITERATIVE-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
2630; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
2631; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
2632; GFX1164_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
2633; GFX1164_ITERATIVE-NEXT:    s_endpgm
2634;
2635; GFX1132_ITERATIVE-LABEL: add_i64_varying:
2636; GFX1132_ITERATIVE:       ; %bb.0: ; %entry
2637; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0
2638; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s0, exec_lo
2639; GFX1132_ITERATIVE-NEXT:    s_mov_b64 s[6:7], 0
2640; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
2641; GFX1132_ITERATIVE-NEXT:  .LBB5_1: ; %ComputeLoop
2642; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
2643; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s1, s0
2644; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
2645; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s2, v2, s1
2646; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s3, v3, s1
2647; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s1
2648; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s7, s1
2649; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
2650; GFX1132_ITERATIVE-NEXT:    s_add_u32 s6, s6, s2
2651; GFX1132_ITERATIVE-NEXT:    s_addc_u32 s7, s7, s3
2652; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s1, 1, s1
2653; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
2654; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s0, s0, s1
2655; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
2656; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB5_1
2657; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
2658; GFX1132_ITERATIVE-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2659; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
2660; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
2661; GFX1132_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
2662; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr2_vgpr3
2663; GFX1132_ITERATIVE-NEXT:    s_and_saveexec_b32 s4, vcc_lo
2664; GFX1132_ITERATIVE-NEXT:    s_xor_b32 s4, exec_lo, s4
2665; GFX1132_ITERATIVE-NEXT:    s_cbranch_execz .LBB5_4
2666; GFX1132_ITERATIVE-NEXT:  ; %bb.3:
2667; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
2668; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
2669; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s10, -1
2670; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
2671; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s8, s2
2672; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s9, s3
2673; GFX1132_ITERATIVE-NEXT:    buffer_atomic_add_u64 v[2:3], off, s[8:11], 0 glc
2674; GFX1132_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
2675; GFX1132_ITERATIVE-NEXT:    buffer_gl1_inv
2676; GFX1132_ITERATIVE-NEXT:    buffer_gl0_inv
2677; GFX1132_ITERATIVE-NEXT:  .LBB5_4:
2678; GFX1132_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2679; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
2680; GFX1132_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v2
2681; GFX1132_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v3
2682; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2683; GFX1132_ITERATIVE-NEXT:    v_add_co_u32 v0, vcc_lo, s2, v0
2684; GFX1132_ITERATIVE-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
2685; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
2686; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
2687; GFX1132_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
2688; GFX1132_ITERATIVE-NEXT:    s_endpgm
2689;
2690; GFX1264_ITERATIVE-LABEL: add_i64_varying:
2691; GFX1264_ITERATIVE:       ; %bb.0: ; %entry
2692; GFX1264_ITERATIVE-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
2693; GFX1264_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
2694; GFX1264_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
2695; GFX1264_ITERATIVE-NEXT:    s_mov_b64 s[6:7], 0
2696; GFX1264_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
2697; GFX1264_ITERATIVE-NEXT:  .LBB5_1: ; %ComputeLoop
2698; GFX1264_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
2699; GFX1264_ITERATIVE-NEXT:    s_ctz_i32_b64 s10, s[0:1]
2700; GFX1264_ITERATIVE-NEXT:    s_wait_alu 0xfffe
2701; GFX1264_ITERATIVE-NEXT:    v_readlane_b32 s3, v3, s10
2702; GFX1264_ITERATIVE-NEXT:    v_readlane_b32 s2, v2, s10
2703; GFX1264_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s10
2704; GFX1264_ITERATIVE-NEXT:    v_writelane_b32 v1, s7, s10
2705; GFX1264_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s10
2706; GFX1264_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[8:9]
2707; GFX1264_ITERATIVE-NEXT:    s_add_nc_u64 s[6:7], s[6:7], s[2:3]
2708; GFX1264_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
2709; GFX1264_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB5_1
2710; GFX1264_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
2711; GFX1264_ITERATIVE-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2712; GFX1264_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
2713; GFX1264_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2714; GFX1264_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
2715; GFX1264_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2716; GFX1264_ITERATIVE-NEXT:    ; implicit-def: $vgpr2_vgpr3
2717; GFX1264_ITERATIVE-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2718; GFX1264_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2719; GFX1264_ITERATIVE-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
2720; GFX1264_ITERATIVE-NEXT:    s_cbranch_execz .LBB5_4
2721; GFX1264_ITERATIVE-NEXT:  ; %bb.3:
2722; GFX1264_ITERATIVE-NEXT:    s_wait_alu 0xfffe
2723; GFX1264_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s6
2724; GFX1264_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s7
2725; GFX1264_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
2726; GFX1264_ITERATIVE-NEXT:    s_mov_b32 s10, -1
2727; GFX1264_ITERATIVE-NEXT:    s_wait_kmcnt 0x0
2728; GFX1264_ITERATIVE-NEXT:    s_mov_b32 s8, s2
2729; GFX1264_ITERATIVE-NEXT:    s_mov_b32 s9, s3
2730; GFX1264_ITERATIVE-NEXT:    buffer_atomic_add_u64 v[2:3], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
2731; GFX1264_ITERATIVE-NEXT:    s_wait_loadcnt 0x0
2732; GFX1264_ITERATIVE-NEXT:    global_inv scope:SCOPE_DEV
2733; GFX1264_ITERATIVE-NEXT:  .LBB5_4:
2734; GFX1264_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[4:5]
2735; GFX1264_ITERATIVE-NEXT:    s_wait_kmcnt 0x0
2736; GFX1264_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v2
2737; GFX1264_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v3
2738; GFX1264_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2739; GFX1264_ITERATIVE-NEXT:    v_add_co_u32 v0, vcc, s2, v0
2740; GFX1264_ITERATIVE-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
2741; GFX1264_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
2742; GFX1264_ITERATIVE-NEXT:    s_mov_b32 s2, -1
2743; GFX1264_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
2744; GFX1264_ITERATIVE-NEXT:    s_endpgm
2745;
2746; GFX1232_ITERATIVE-LABEL: add_i64_varying:
2747; GFX1232_ITERATIVE:       ; %bb.0: ; %entry
2748; GFX1232_ITERATIVE-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0
2749; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s0, exec_lo
2750; GFX1232_ITERATIVE-NEXT:    s_mov_b64 s[6:7], 0
2751; GFX1232_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
2752; GFX1232_ITERATIVE-NEXT:  .LBB5_1: ; %ComputeLoop
2753; GFX1232_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
2754; GFX1232_ITERATIVE-NEXT:    s_wait_alu 0xfffe
2755; GFX1232_ITERATIVE-NEXT:    s_ctz_i32_b32 s1, s0
2756; GFX1232_ITERATIVE-NEXT:    s_wait_alu 0xfffe
2757; GFX1232_ITERATIVE-NEXT:    v_readlane_b32 s3, v3, s1
2758; GFX1232_ITERATIVE-NEXT:    v_readlane_b32 s2, v2, s1
2759; GFX1232_ITERATIVE-NEXT:    s_lshl_b32 s8, 1, s1
2760; GFX1232_ITERATIVE-NEXT:    v_writelane_b32 v1, s7, s1
2761; GFX1232_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s1
2762; GFX1232_ITERATIVE-NEXT:    s_and_not1_b32 s0, s0, s8
2763; GFX1232_ITERATIVE-NEXT:    s_add_nc_u64 s[6:7], s[6:7], s[2:3]
2764; GFX1232_ITERATIVE-NEXT:    s_wait_alu 0xfffe
2765; GFX1232_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
2766; GFX1232_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB5_1
2767; GFX1232_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
2768; GFX1232_ITERATIVE-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2769; GFX1232_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
2770; GFX1232_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
2771; GFX1232_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
2772; GFX1232_ITERATIVE-NEXT:    ; implicit-def: $vgpr2_vgpr3
2773; GFX1232_ITERATIVE-NEXT:    s_and_saveexec_b32 s4, vcc_lo
2774; GFX1232_ITERATIVE-NEXT:    s_xor_b32 s4, exec_lo, s4
2775; GFX1232_ITERATIVE-NEXT:    s_cbranch_execz .LBB5_4
2776; GFX1232_ITERATIVE-NEXT:  ; %bb.3:
2777; GFX1232_ITERATIVE-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
2778; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
2779; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s10, -1
2780; GFX1232_ITERATIVE-NEXT:    s_wait_kmcnt 0x0
2781; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s8, s2
2782; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s9, s3
2783; GFX1232_ITERATIVE-NEXT:    buffer_atomic_add_u64 v[2:3], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
2784; GFX1232_ITERATIVE-NEXT:    s_wait_loadcnt 0x0
2785; GFX1232_ITERATIVE-NEXT:    global_inv scope:SCOPE_DEV
2786; GFX1232_ITERATIVE-NEXT:  .LBB5_4:
2787; GFX1232_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2788; GFX1232_ITERATIVE-NEXT:    s_wait_kmcnt 0x0
2789; GFX1232_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v2
2790; GFX1232_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v3
2791; GFX1232_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2792; GFX1232_ITERATIVE-NEXT:    v_add_co_u32 v0, vcc_lo, s2, v0
2793; GFX1232_ITERATIVE-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
2794; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
2795; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s2, -1
2796; GFX1232_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
2797; GFX1232_ITERATIVE-NEXT:    s_endpgm
2798;
2799; GFX7LESS_DPP-LABEL: add_i64_varying:
2800; GFX7LESS_DPP:       ; %bb.0: ; %entry
2801; GFX7LESS_DPP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2802; GFX7LESS_DPP-NEXT:    s_mov_b32 s7, 0xf000
2803; GFX7LESS_DPP-NEXT:    s_mov_b32 s6, -1
2804; GFX7LESS_DPP-NEXT:    v_mov_b32_e32 v1, 0
2805; GFX7LESS_DPP-NEXT:    s_mov_b32 s10, s6
2806; GFX7LESS_DPP-NEXT:    s_mov_b32 s11, s7
2807; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
2808; GFX7LESS_DPP-NEXT:    s_mov_b32 s8, s2
2809; GFX7LESS_DPP-NEXT:    s_mov_b32 s9, s3
2810; GFX7LESS_DPP-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
2811; GFX7LESS_DPP-NEXT:    s_waitcnt vmcnt(0)
2812; GFX7LESS_DPP-NEXT:    buffer_wbinvl1
2813; GFX7LESS_DPP-NEXT:    s_mov_b32 s4, s0
2814; GFX7LESS_DPP-NEXT:    s_mov_b32 s5, s1
2815; GFX7LESS_DPP-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2816; GFX7LESS_DPP-NEXT:    s_endpgm
2817;
2818; GFX8_DPP-LABEL: add_i64_varying:
2819; GFX8_DPP:       ; %bb.0: ; %entry
2820; GFX8_DPP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2821; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
2822; GFX8_DPP-NEXT:    v_mov_b32_e32 v1, 0
2823; GFX8_DPP-NEXT:    s_mov_b64 exec, s[4:5]
2824; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v6, exec_lo, 0
2825; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v6, exec_hi, v6
2826; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
2827; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s[4:5]
2828; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, 0
2829; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, 0, s[4:5]
2830; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
2831; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
2832; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
2833; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2834; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, 0
2835; GFX8_DPP-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
2836; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
2837; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
2838; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
2839; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf
2840; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, 0
2841; GFX8_DPP-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
2842; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
2843; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
2844; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
2845; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
2846; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, 0
2847; GFX8_DPP-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
2848; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
2849; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
2850; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
2851; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf
2852; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, 0
2853; GFX8_DPP-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
2854; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
2855; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
2856; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
2857; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2858; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, 0
2859; GFX8_DPP-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
2860; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
2861; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
2862; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
2863; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2864; GFX8_DPP-NEXT:    v_addc_u32_e32 v4, vcc, v2, v4, vcc
2865; GFX8_DPP-NEXT:    v_mov_b32_e32 v2, 0
2866; GFX8_DPP-NEXT:    v_readlane_b32 s7, v4, 63
2867; GFX8_DPP-NEXT:    v_readlane_b32 s6, v3, 63
2868; GFX8_DPP-NEXT:    v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
2869; GFX8_DPP-NEXT:    v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
2870; GFX8_DPP-NEXT:    s_mov_b64 exec, s[4:5]
2871; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
2872; GFX8_DPP-NEXT:    ; implicit-def: $vgpr6_vgpr7
2873; GFX8_DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2874; GFX8_DPP-NEXT:    s_cbranch_execz .LBB5_2
2875; GFX8_DPP-NEXT:  ; %bb.1:
2876; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, s6
2877; GFX8_DPP-NEXT:    s_mov_b32 s11, 0xf000
2878; GFX8_DPP-NEXT:    s_mov_b32 s10, -1
2879; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
2880; GFX8_DPP-NEXT:    s_mov_b32 s8, s2
2881; GFX8_DPP-NEXT:    s_mov_b32 s9, s3
2882; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, s7
2883; GFX8_DPP-NEXT:    buffer_atomic_add_x2 v[6:7], off, s[8:11], 0 glc
2884; GFX8_DPP-NEXT:    s_waitcnt vmcnt(0)
2885; GFX8_DPP-NEXT:    buffer_wbinvl1_vol
2886; GFX8_DPP-NEXT:  .LBB5_2:
2887; GFX8_DPP-NEXT:    s_or_b64 exec, exec, s[4:5]
2888; GFX8_DPP-NEXT:    v_readfirstlane_b32 s4, v7
2889; GFX8_DPP-NEXT:    v_readfirstlane_b32 s5, v6
2890; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v1
2891; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, v2
2892; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, s4
2893; GFX8_DPP-NEXT:    v_add_u32_e32 v6, vcc, s5, v6
2894; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
2895; GFX8_DPP-NEXT:    s_mov_b32 s3, 0xf000
2896; GFX8_DPP-NEXT:    s_mov_b32 s2, -1
2897; GFX8_DPP-NEXT:    v_addc_u32_e32 v7, vcc, v0, v7, vcc
2898; GFX8_DPP-NEXT:    buffer_store_dwordx2 v[6:7], off, s[0:3], 0
2899; GFX8_DPP-NEXT:    s_endpgm
2900;
2901; GFX9_DPP-LABEL: add_i64_varying:
2902; GFX9_DPP:       ; %bb.0: ; %entry
2903; GFX9_DPP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2904; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
2905; GFX9_DPP-NEXT:    v_mov_b32_e32 v1, 0
2906; GFX9_DPP-NEXT:    s_mov_b64 exec, s[4:5]
2907; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v6, exec_lo, 0
2908; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v6, exec_hi, v6
2909; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
2910; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s[4:5]
2911; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, 0
2912; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, 0, s[4:5]
2913; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
2914; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
2915; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
2916; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2917; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, 0
2918; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v2, vcc, v2, v4, vcc
2919; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
2920; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
2921; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
2922; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf
2923; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, 0
2924; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v2, vcc, v2, v4, vcc
2925; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
2926; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
2927; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
2928; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
2929; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, 0
2930; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v2, vcc, v2, v4, vcc
2931; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
2932; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
2933; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
2934; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf
2935; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, 0
2936; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v2, vcc, v2, v4, vcc
2937; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
2938; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
2939; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
2940; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2941; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, 0
2942; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v2, vcc, v2, v4, vcc
2943; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
2944; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
2945; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
2946; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2947; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v4, vcc, v2, v4, vcc
2948; GFX9_DPP-NEXT:    v_mov_b32_e32 v2, 0
2949; GFX9_DPP-NEXT:    v_readlane_b32 s7, v4, 63
2950; GFX9_DPP-NEXT:    v_readlane_b32 s6, v3, 63
2951; GFX9_DPP-NEXT:    v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
2952; GFX9_DPP-NEXT:    v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
2953; GFX9_DPP-NEXT:    s_mov_b64 exec, s[4:5]
2954; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
2955; GFX9_DPP-NEXT:    ; implicit-def: $vgpr6_vgpr7
2956; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2957; GFX9_DPP-NEXT:    s_cbranch_execz .LBB5_2
2958; GFX9_DPP-NEXT:  ; %bb.1:
2959; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, s6
2960; GFX9_DPP-NEXT:    s_mov_b32 s11, 0xf000
2961; GFX9_DPP-NEXT:    s_mov_b32 s10, -1
2962; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
2963; GFX9_DPP-NEXT:    s_mov_b32 s8, s2
2964; GFX9_DPP-NEXT:    s_mov_b32 s9, s3
2965; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, s7
2966; GFX9_DPP-NEXT:    buffer_atomic_add_x2 v[6:7], off, s[8:11], 0 glc
2967; GFX9_DPP-NEXT:    s_waitcnt vmcnt(0)
2968; GFX9_DPP-NEXT:    buffer_wbinvl1_vol
2969; GFX9_DPP-NEXT:  .LBB5_2:
2970; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[4:5]
2971; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v7
2972; GFX9_DPP-NEXT:    v_readfirstlane_b32 s5, v6
2973; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v1
2974; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, v2
2975; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s4
2976; GFX9_DPP-NEXT:    v_add_co_u32_e32 v6, vcc, s5, v6
2977; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
2978; GFX9_DPP-NEXT:    s_mov_b32 s3, 0xf000
2979; GFX9_DPP-NEXT:    s_mov_b32 s2, -1
2980; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v7, vcc, v0, v7, vcc
2981; GFX9_DPP-NEXT:    buffer_store_dwordx2 v[6:7], off, s[0:3], 0
2982; GFX9_DPP-NEXT:    s_endpgm
2983;
2984; GFX1064_DPP-LABEL: add_i64_varying:
2985; GFX1064_DPP:       ; %bb.0: ; %entry
2986; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
2987; GFX1064_DPP-NEXT:    v_mov_b32_e32 v1, 0
2988; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[0:1]
2989; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, 0, s[0:1]
2990; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, 0
2991; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, 0
2992; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, 0
2993; GFX1064_DPP-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2994; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, 0
2995; GFX1064_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
2996; GFX1064_DPP-NEXT:    v_add_co_u32 v1, vcc, v2, v1
2997; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc, v3, v4, vcc
2998; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, 0
2999; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf
3000; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, 0
3001; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3002; GFX1064_DPP-NEXT:    v_add_co_u32 v1, vcc, v1, v6
3003; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc
3004; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, 0
3005; GFX1064_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:4 row_mask:0xf bank_mask:0xf
3006; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, 0
3007; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3008; GFX1064_DPP-NEXT:    v_add_co_u32 v1, vcc, v1, v4
3009; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc, v2, v3, vcc
3010; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, 0
3011; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf
3012; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3013; GFX1064_DPP-NEXT:    v_add_co_u32 v1, vcc, v1, v6
3014; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc
3015; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, 0
3016; GFX1064_DPP-NEXT:    v_permlanex16_b32 v4, v1, -1, -1
3017; GFX1064_DPP-NEXT:    v_permlanex16_b32 v6, v2, -1, -1
3018; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3019; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3020; GFX1064_DPP-NEXT:    v_add_co_u32 v1, vcc, v1, v3
3021; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc
3022; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, 0
3023; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 31
3024; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, 0
3025; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v2, 31
3026; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, s2
3027; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, s3
3028; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3029; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3030; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, 0
3031; GFX1064_DPP-NEXT:    v_add_co_u32 v1, vcc, v1, v3
3032; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc
3033; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
3034; GFX1064_DPP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3035; GFX1064_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3036; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
3037; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3038; GFX1064_DPP-NEXT:    v_mov_b32_dpp v7, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3039; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v2, 15
3040; GFX1064_DPP-NEXT:    v_readlane_b32 s7, v1, 15
3041; GFX1064_DPP-NEXT:    v_readlane_b32 s8, v2, 31
3042; GFX1064_DPP-NEXT:    v_readlane_b32 s9, v1, 31
3043; GFX1064_DPP-NEXT:    v_readlane_b32 s10, v1, 47
3044; GFX1064_DPP-NEXT:    v_writelane_b32 v7, s6, 16
3045; GFX1064_DPP-NEXT:    v_writelane_b32 v6, s7, 16
3046; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v1, 63
3047; GFX1064_DPP-NEXT:    v_readlane_b32 s11, v2, 47
3048; GFX1064_DPP-NEXT:    v_readlane_b32 s7, v2, 63
3049; GFX1064_DPP-NEXT:    v_writelane_b32 v7, s8, 32
3050; GFX1064_DPP-NEXT:    v_writelane_b32 v6, s9, 32
3051; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[4:5]
3052; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3053; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[8:9], -1
3054; GFX1064_DPP-NEXT:    s_mov_b64 s[4:5], s[6:7]
3055; GFX1064_DPP-NEXT:    v_writelane_b32 v7, s11, 48
3056; GFX1064_DPP-NEXT:    v_writelane_b32 v6, s10, 48
3057; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[8:9]
3058; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3059; GFX1064_DPP-NEXT:    s_mov_b32 s6, -1
3060; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr8_vgpr9
3061; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[8:9], vcc
3062; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB5_2
3063; GFX1064_DPP-NEXT:  ; %bb.1:
3064; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, s5
3065; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, s4
3066; GFX1064_DPP-NEXT:    s_mov_b32 s7, 0x31016000
3067; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
3068; GFX1064_DPP-NEXT:    s_mov_b32 s4, s2
3069; GFX1064_DPP-NEXT:    s_mov_b32 s5, s3
3070; GFX1064_DPP-NEXT:    buffer_atomic_add_x2 v[8:9], off, s[4:7], 0 glc
3071; GFX1064_DPP-NEXT:    s_waitcnt vmcnt(0)
3072; GFX1064_DPP-NEXT:    buffer_gl1_inv
3073; GFX1064_DPP-NEXT:    buffer_gl0_inv
3074; GFX1064_DPP-NEXT:  .LBB5_2:
3075; GFX1064_DPP-NEXT:    s_waitcnt_depctr 0xffe3
3076; GFX1064_DPP-NEXT:    s_or_b64 exec, exec, s[8:9]
3077; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
3078; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s2, v8
3079; GFX1064_DPP-NEXT:    v_mov_b32_e32 v10, v6
3080; GFX1064_DPP-NEXT:    v_mov_b32_e32 v11, v7
3081; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s3, v9
3082; GFX1064_DPP-NEXT:    v_add_co_u32 v8, vcc, s2, v10
3083; GFX1064_DPP-NEXT:    s_mov_b32 s2, s6
3084; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v9, vcc, s3, v11, vcc
3085; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
3086; GFX1064_DPP-NEXT:    buffer_store_dwordx2 v[8:9], off, s[0:3], 0
3087; GFX1064_DPP-NEXT:    s_endpgm
3088;
3089; GFX1032_DPP-LABEL: add_i64_varying:
3090; GFX1032_DPP:       ; %bb.0: ; %entry
3091; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
3092; GFX1032_DPP-NEXT:    v_mov_b32_e32 v1, 0
3093; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s0
3094; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, 0, s0
3095; GFX1032_DPP-NEXT:    v_mov_b32_e32 v4, 0
3096; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, 0
3097; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, 0
3098; GFX1032_DPP-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3099; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, 0
3100; GFX1032_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
3101; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, 0
3102; GFX1032_DPP-NEXT:    v_add_co_u32 v1, vcc_lo, v2, v1
3103; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v3, v4, vcc_lo
3104; GFX1032_DPP-NEXT:    v_mov_b32_e32 v4, 0
3105; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf
3106; GFX1032_DPP-NEXT:    v_mov_b32_e32 v3, 0
3107; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3108; GFX1032_DPP-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v6
3109; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo
3110; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, 0
3111; GFX1032_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:4 row_mask:0xf bank_mask:0xf
3112; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, 0
3113; GFX1032_DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3114; GFX1032_DPP-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v4
3115; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v3, vcc_lo
3116; GFX1032_DPP-NEXT:    v_mov_b32_e32 v3, 0
3117; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf
3118; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3119; GFX1032_DPP-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v6
3120; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo
3121; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, 0
3122; GFX1032_DPP-NEXT:    v_permlanex16_b32 v4, v1, -1, -1
3123; GFX1032_DPP-NEXT:    v_permlanex16_b32 v6, v2, -1, -1
3124; GFX1032_DPP-NEXT:    v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3125; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3126; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
3127; GFX1032_DPP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3128; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s6, -1
3129; GFX1032_DPP-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v3
3130; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo
3131; GFX1032_DPP-NEXT:    v_readlane_b32 s4, v1, 31
3132; GFX1032_DPP-NEXT:    v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3133; GFX1032_DPP-NEXT:    v_readlane_b32 s8, v2, 15
3134; GFX1032_DPP-NEXT:    v_readlane_b32 s5, v2, 31
3135; GFX1032_DPP-NEXT:    v_mov_b32_dpp v8, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3136; GFX1032_DPP-NEXT:    v_readlane_b32 s7, v1, 15
3137; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s6
3138; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3139; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s6, -1
3140; GFX1032_DPP-NEXT:    v_writelane_b32 v8, s8, 16
3141; GFX1032_DPP-NEXT:    v_writelane_b32 v7, s7, 16
3142; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s6
3143; GFX1032_DPP-NEXT:    s_mov_b32 s6, -1
3144; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3145; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
3146; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s8, vcc_lo
3147; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB5_2
3148; GFX1032_DPP-NEXT:  ; %bb.1:
3149; GFX1032_DPP-NEXT:    v_mov_b32_e32 v10, s5
3150; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, s4
3151; GFX1032_DPP-NEXT:    s_mov_b32 s7, 0x31016000
3152; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
3153; GFX1032_DPP-NEXT:    s_mov_b32 s4, s2
3154; GFX1032_DPP-NEXT:    s_mov_b32 s5, s3
3155; GFX1032_DPP-NEXT:    buffer_atomic_add_x2 v[9:10], off, s[4:7], 0 glc
3156; GFX1032_DPP-NEXT:    s_waitcnt vmcnt(0)
3157; GFX1032_DPP-NEXT:    buffer_gl1_inv
3158; GFX1032_DPP-NEXT:    buffer_gl0_inv
3159; GFX1032_DPP-NEXT:  .LBB5_2:
3160; GFX1032_DPP-NEXT:    s_waitcnt_depctr 0xffe3
3161; GFX1032_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s8
3162; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
3163; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s2, v9
3164; GFX1032_DPP-NEXT:    v_mov_b32_e32 v11, v7
3165; GFX1032_DPP-NEXT:    v_mov_b32_e32 v12, v8
3166; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s3, v10
3167; GFX1032_DPP-NEXT:    v_add_co_u32 v9, vcc_lo, s2, v11
3168; GFX1032_DPP-NEXT:    s_mov_b32 s2, s6
3169; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, s3, v12, vcc_lo
3170; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
3171; GFX1032_DPP-NEXT:    buffer_store_dwordx2 v[9:10], off, s[0:3], 0
3172; GFX1032_DPP-NEXT:    s_endpgm
3173;
3174; GFX1164_DPP-LABEL: add_i64_varying:
3175; GFX1164_DPP:       ; %bb.0: ; %entry
3176; GFX1164_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3177; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
3178; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
3179; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[0:1]
3180; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, 0
3181; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s[0:1]
3182; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, 0
3183; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
3184; GFX1164_DPP-NEXT:    v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3185; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3186; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
3187; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc
3188; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3189; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, 0
3190; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
3191; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf
3192; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc
3193; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
3194; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3195; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, 0
3196; GFX1164_DPP-NEXT:    v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf
3197; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
3198; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc
3199; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3200; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, 0
3201; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
3202; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf
3203; GFX1164_DPP-NEXT:    v_permlanex16_b32 v5, v2, -1, -1
3204; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
3205; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc
3206; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3207; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, 0
3208; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
3209; GFX1164_DPP-NEXT:    v_permlanex16_b32 v4, v1, -1, -1
3210; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3211; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3212; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc
3213; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, 0
3214; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v1, 31
3215; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3216; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, s2
3217; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v2, 31
3218; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3219; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
3220; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3221; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, 0
3222; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc
3223; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
3224; GFX1164_DPP-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3225; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3226; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
3227; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3228; GFX1164_DPP-NEXT:    v_readlane_b32 s6, v2, 15
3229; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3230; GFX1164_DPP-NEXT:    v_readlane_b32 s7, v1, 15
3231; GFX1164_DPP-NEXT:    v_readlane_b32 s8, v2, 31
3232; GFX1164_DPP-NEXT:    v_readlane_b32 s9, v1, 31
3233; GFX1164_DPP-NEXT:    v_writelane_b32 v4, s6, 16
3234; GFX1164_DPP-NEXT:    v_readlane_b32 s6, v2, 63
3235; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s7, 16
3236; GFX1164_DPP-NEXT:    v_readlane_b32 s10, v2, 47
3237; GFX1164_DPP-NEXT:    v_readlane_b32 s11, v1, 47
3238; GFX1164_DPP-NEXT:    v_readlane_b32 s7, v1, 63
3239; GFX1164_DPP-NEXT:    v_writelane_b32 v4, s8, 32
3240; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s9, 32
3241; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[4:5]
3242; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
3243; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3244; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[8:9], -1
3245; GFX1164_DPP-NEXT:    s_mov_b64 s[4:5], s[6:7]
3246; GFX1164_DPP-NEXT:    v_writelane_b32 v4, s10, 48
3247; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s11, 48
3248; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[8:9]
3249; GFX1164_DPP-NEXT:    s_mov_b32 s6, -1
3250; GFX1164_DPP-NEXT:    s_mov_b64 s[8:9], exec
3251; GFX1164_DPP-NEXT:    ; implicit-def: $vgpr6_vgpr7
3252; GFX1164_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
3253; GFX1164_DPP-NEXT:    s_cbranch_execz .LBB5_2
3254; GFX1164_DPP-NEXT:  ; %bb.1:
3255; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, s5
3256; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, s4
3257; GFX1164_DPP-NEXT:    s_mov_b32 s7, 0x31016000
3258; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
3259; GFX1164_DPP-NEXT:    s_mov_b32 s4, s2
3260; GFX1164_DPP-NEXT:    s_mov_b32 s5, s3
3261; GFX1164_DPP-NEXT:    buffer_atomic_add_u64 v[6:7], off, s[4:7], 0 glc
3262; GFX1164_DPP-NEXT:    s_waitcnt vmcnt(0)
3263; GFX1164_DPP-NEXT:    buffer_gl1_inv
3264; GFX1164_DPP-NEXT:    buffer_gl0_inv
3265; GFX1164_DPP-NEXT:  .LBB5_2:
3266; GFX1164_DPP-NEXT:    s_or_b64 exec, exec, s[8:9]
3267; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
3268; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s2, v6
3269; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, v4
3270; GFX1164_DPP-NEXT:    v_mov_b32_e32 v9, v5
3271; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s3, v7
3272; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3273; GFX1164_DPP-NEXT:    v_add_co_u32 v6, vcc, s2, v8
3274; GFX1164_DPP-NEXT:    s_mov_b32 s2, s6
3275; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v7, vcc, s3, v9, vcc
3276; GFX1164_DPP-NEXT:    s_mov_b32 s3, 0x31016000
3277; GFX1164_DPP-NEXT:    buffer_store_b64 v[6:7], off, s[0:3], 0
3278; GFX1164_DPP-NEXT:    s_endpgm
3279;
3280; GFX1132_DPP-LABEL: add_i64_varying:
3281; GFX1132_DPP:       ; %bb.0: ; %entry
3282; GFX1132_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3283; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
3284; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
3285; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s0
3286; GFX1132_DPP-NEXT:    v_mov_b32_e32 v2, 0
3287; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s0
3288; GFX1132_DPP-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v7, 0
3289; GFX1132_DPP-NEXT:    v_mov_b32_e32 v6, 0
3290; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
3291; GFX1132_DPP-NEXT:    v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3292; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3293; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
3294; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
3295; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3296; GFX1132_DPP-NEXT:    v_mov_b32_e32 v2, 0
3297; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
3298; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf
3299; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
3300; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
3301; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3302; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, 0
3303; GFX1132_DPP-NEXT:    v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf
3304; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
3305; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
3306; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v2, vcc_lo, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3307; GFX1132_DPP-NEXT:    v_mov_b32_e32 v3, 0
3308; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
3309; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf
3310; GFX1132_DPP-NEXT:    v_permlanex16_b32 v5, v2, -1, -1
3311; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
3312; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
3313; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v2, vcc_lo, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3314; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
3315; GFX1132_DPP-NEXT:    v_permlanex16_b32 v4, v1, -1, -1
3316; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3317; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
3318; GFX1132_DPP-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3319; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s6, -1
3320; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
3321; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
3322; GFX1132_DPP-NEXT:    v_readlane_b32 s4, v2, 31
3323; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3324; GFX1132_DPP-NEXT:    v_readlane_b32 s7, v2, 15
3325; GFX1132_DPP-NEXT:    v_readlane_b32 s8, v1, 15
3326; GFX1132_DPP-NEXT:    v_readlane_b32 s5, v1, 31
3327; GFX1132_DPP-NEXT:    v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3328; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s6
3329; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
3330; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3331; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s6, -1
3332; GFX1132_DPP-NEXT:    v_writelane_b32 v6, s7, 16
3333; GFX1132_DPP-NEXT:    v_writelane_b32 v7, s8, 16
3334; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s6
3335; GFX1132_DPP-NEXT:    s_mov_b32 s6, -1
3336; GFX1132_DPP-NEXT:    s_mov_b32 s8, exec_lo
3337; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr8_vgpr9
3338; GFX1132_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
3339; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB5_2
3340; GFX1132_DPP-NEXT:  ; %bb.1:
3341; GFX1132_DPP-NEXT:    v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4
3342; GFX1132_DPP-NEXT:    s_mov_b32 s7, 0x31016000
3343; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
3344; GFX1132_DPP-NEXT:    s_mov_b32 s4, s2
3345; GFX1132_DPP-NEXT:    s_mov_b32 s5, s3
3346; GFX1132_DPP-NEXT:    buffer_atomic_add_u64 v[8:9], off, s[4:7], 0 glc
3347; GFX1132_DPP-NEXT:    s_waitcnt vmcnt(0)
3348; GFX1132_DPP-NEXT:    buffer_gl1_inv
3349; GFX1132_DPP-NEXT:    buffer_gl0_inv
3350; GFX1132_DPP-NEXT:  .LBB5_2:
3351; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s8
3352; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
3353; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s2, v8
3354; GFX1132_DPP-NEXT:    v_mov_b32_e32 v10, v6
3355; GFX1132_DPP-NEXT:    v_mov_b32_e32 v11, v7
3356; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s3, v9
3357; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3358; GFX1132_DPP-NEXT:    v_add_co_u32 v8, vcc_lo, s2, v10
3359; GFX1132_DPP-NEXT:    s_mov_b32 s2, s6
3360; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo
3361; GFX1132_DPP-NEXT:    s_mov_b32 s3, 0x31016000
3362; GFX1132_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[0:3], 0
3363; GFX1132_DPP-NEXT:    s_endpgm
3364;
3365; GFX1264_DPP-LABEL: add_i64_varying:
3366; GFX1264_DPP:       ; %bb.0: ; %entry
3367; GFX1264_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3368; GFX1264_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
3369; GFX1264_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
3370; GFX1264_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[0:1]
3371; GFX1264_DPP-NEXT:    v_mov_b32_e32 v2, 0
3372; GFX1264_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s[0:1]
3373; GFX1264_DPP-NEXT:    v_mov_b32_e32 v4, 0
3374; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
3375; GFX1264_DPP-NEXT:    v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3376; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3377; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
3378; GFX1264_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc
3379; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3380; GFX1264_DPP-NEXT:    v_mov_b32_e32 v2, 0
3381; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
3382; GFX1264_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf
3383; GFX1264_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc
3384; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
3385; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3386; GFX1264_DPP-NEXT:    v_mov_b32_e32 v4, 0
3387; GFX1264_DPP-NEXT:    v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf
3388; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
3389; GFX1264_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc
3390; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3391; GFX1264_DPP-NEXT:    v_mov_b32_e32 v3, 0
3392; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
3393; GFX1264_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf
3394; GFX1264_DPP-NEXT:    v_permlanex16_b32 v5, v2, -1, -1
3395; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
3396; GFX1264_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc
3397; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3398; GFX1264_DPP-NEXT:    v_mov_b32_e32 v5, 0
3399; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
3400; GFX1264_DPP-NEXT:    v_permlanex16_b32 v4, v1, -1, -1
3401; GFX1264_DPP-NEXT:    v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3402; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3403; GFX1264_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc
3404; GFX1264_DPP-NEXT:    v_mov_b32_e32 v3, 0
3405; GFX1264_DPP-NEXT:    v_readlane_b32 s2, v1, 31
3406; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3407; GFX1264_DPP-NEXT:    v_mov_b32_e32 v4, s2
3408; GFX1264_DPP-NEXT:    v_readlane_b32 s2, v2, 31
3409; GFX1264_DPP-NEXT:    v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3410; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
3411; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
3412; GFX1264_DPP-NEXT:    v_mov_b32_e32 v4, 0
3413; GFX1264_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc
3414; GFX1264_DPP-NEXT:    s_mov_b64 exec, s[0:1]
3415; GFX1264_DPP-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3416; GFX1264_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3417; GFX1264_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
3418; GFX1264_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3419; GFX1264_DPP-NEXT:    v_readlane_b32 s6, v2, 15
3420; GFX1264_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3421; GFX1264_DPP-NEXT:    v_readlane_b32 s7, v1, 15
3422; GFX1264_DPP-NEXT:    v_readlane_b32 s8, v2, 31
3423; GFX1264_DPP-NEXT:    v_readlane_b32 s9, v1, 31
3424; GFX1264_DPP-NEXT:    v_writelane_b32 v4, s6, 16
3425; GFX1264_DPP-NEXT:    v_readlane_b32 s6, v2, 63
3426; GFX1264_DPP-NEXT:    v_writelane_b32 v5, s7, 16
3427; GFX1264_DPP-NEXT:    v_readlane_b32 s10, v2, 47
3428; GFX1264_DPP-NEXT:    v_readlane_b32 s11, v1, 47
3429; GFX1264_DPP-NEXT:    v_readlane_b32 s7, v1, 63
3430; GFX1264_DPP-NEXT:    v_writelane_b32 v4, s8, 32
3431; GFX1264_DPP-NEXT:    v_writelane_b32 v5, s9, 32
3432; GFX1264_DPP-NEXT:    s_mov_b64 exec, s[4:5]
3433; GFX1264_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
3434; GFX1264_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3435; GFX1264_DPP-NEXT:    s_or_saveexec_b64 s[8:9], -1
3436; GFX1264_DPP-NEXT:    s_mov_b64 s[4:5], s[6:7]
3437; GFX1264_DPP-NEXT:    v_writelane_b32 v4, s10, 48
3438; GFX1264_DPP-NEXT:    v_writelane_b32 v5, s11, 48
3439; GFX1264_DPP-NEXT:    s_wait_alu 0xfffe
3440; GFX1264_DPP-NEXT:    s_mov_b64 exec, s[8:9]
3441; GFX1264_DPP-NEXT:    s_mov_b32 s6, -1
3442; GFX1264_DPP-NEXT:    s_mov_b64 s[8:9], exec
3443; GFX1264_DPP-NEXT:    ; implicit-def: $vgpr6_vgpr7
3444; GFX1264_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
3445; GFX1264_DPP-NEXT:    s_cbranch_execz .LBB5_2
3446; GFX1264_DPP-NEXT:  ; %bb.1:
3447; GFX1264_DPP-NEXT:    v_mov_b32_e32 v7, s5
3448; GFX1264_DPP-NEXT:    v_mov_b32_e32 v6, s4
3449; GFX1264_DPP-NEXT:    s_mov_b32 s7, 0x31016000
3450; GFX1264_DPP-NEXT:    s_wait_kmcnt 0x0
3451; GFX1264_DPP-NEXT:    s_mov_b32 s4, s2
3452; GFX1264_DPP-NEXT:    s_mov_b32 s5, s3
3453; GFX1264_DPP-NEXT:    buffer_atomic_add_u64 v[6:7], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
3454; GFX1264_DPP-NEXT:    s_wait_loadcnt 0x0
3455; GFX1264_DPP-NEXT:    global_inv scope:SCOPE_DEV
3456; GFX1264_DPP-NEXT:  .LBB5_2:
3457; GFX1264_DPP-NEXT:    s_wait_alu 0xfffe
3458; GFX1264_DPP-NEXT:    s_or_b64 exec, exec, s[8:9]
3459; GFX1264_DPP-NEXT:    s_wait_kmcnt 0x0
3460; GFX1264_DPP-NEXT:    v_readfirstlane_b32 s2, v6
3461; GFX1264_DPP-NEXT:    v_mov_b32_e32 v8, v4
3462; GFX1264_DPP-NEXT:    v_mov_b32_e32 v9, v5
3463; GFX1264_DPP-NEXT:    v_readfirstlane_b32 s3, v7
3464; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3465; GFX1264_DPP-NEXT:    v_add_co_u32 v6, vcc, s2, v8
3466; GFX1264_DPP-NEXT:    s_mov_b32 s2, s6
3467; GFX1264_DPP-NEXT:    v_add_co_ci_u32_e32 v7, vcc, s3, v9, vcc
3468; GFX1264_DPP-NEXT:    s_mov_b32 s3, 0x31016000
3469; GFX1264_DPP-NEXT:    buffer_store_b64 v[6:7], off, s[0:3], null
3470; GFX1264_DPP-NEXT:    s_endpgm
3471;
3472; GFX1232_DPP-LABEL: add_i64_varying:
3473; GFX1232_DPP:       ; %bb.0: ; %entry
3474; GFX1232_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3475; GFX1232_DPP-NEXT:    s_or_saveexec_b32 s0, -1
3476; GFX1232_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
3477; GFX1232_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s0
3478; GFX1232_DPP-NEXT:    v_mov_b32_e32 v2, 0
3479; GFX1232_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s0
3480; GFX1232_DPP-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v7, 0
3481; GFX1232_DPP-NEXT:    v_mov_b32_e32 v6, 0
3482; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
3483; GFX1232_DPP-NEXT:    v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3484; GFX1232_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3485; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
3486; GFX1232_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
3487; GFX1232_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3488; GFX1232_DPP-NEXT:    v_mov_b32_e32 v2, 0
3489; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
3490; GFX1232_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf
3491; GFX1232_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
3492; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
3493; GFX1232_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3494; GFX1232_DPP-NEXT:    v_mov_b32_e32 v4, 0
3495; GFX1232_DPP-NEXT:    v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf
3496; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
3497; GFX1232_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
3498; GFX1232_DPP-NEXT:    v_add_co_u32_e64_dpp v2, vcc_lo, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3499; GFX1232_DPP-NEXT:    v_mov_b32_e32 v3, 0
3500; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
3501; GFX1232_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf
3502; GFX1232_DPP-NEXT:    v_permlanex16_b32 v5, v2, -1, -1
3503; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
3504; GFX1232_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
3505; GFX1232_DPP-NEXT:    v_add_co_u32_e64_dpp v2, vcc_lo, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3506; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
3507; GFX1232_DPP-NEXT:    v_permlanex16_b32 v4, v1, -1, -1
3508; GFX1232_DPP-NEXT:    v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
3509; GFX1232_DPP-NEXT:    s_mov_b32 exec_lo, s0
3510; GFX1232_DPP-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3511; GFX1232_DPP-NEXT:    s_or_saveexec_b32 s6, -1
3512; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
3513; GFX1232_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
3514; GFX1232_DPP-NEXT:    v_readlane_b32 s4, v2, 31
3515; GFX1232_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3516; GFX1232_DPP-NEXT:    v_readlane_b32 s7, v2, 15
3517; GFX1232_DPP-NEXT:    v_readlane_b32 s8, v1, 15
3518; GFX1232_DPP-NEXT:    v_readlane_b32 s5, v1, 31
3519; GFX1232_DPP-NEXT:    v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3520; GFX1232_DPP-NEXT:    s_mov_b32 exec_lo, s6
3521; GFX1232_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
3522; GFX1232_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3523; GFX1232_DPP-NEXT:    s_or_saveexec_b32 s6, -1
3524; GFX1232_DPP-NEXT:    v_writelane_b32 v6, s7, 16
3525; GFX1232_DPP-NEXT:    v_writelane_b32 v7, s8, 16
3526; GFX1232_DPP-NEXT:    s_wait_alu 0xfffe
3527; GFX1232_DPP-NEXT:    s_mov_b32 exec_lo, s6
3528; GFX1232_DPP-NEXT:    s_mov_b32 s6, -1
3529; GFX1232_DPP-NEXT:    s_mov_b32 s8, exec_lo
3530; GFX1232_DPP-NEXT:    ; implicit-def: $vgpr8_vgpr9
3531; GFX1232_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
3532; GFX1232_DPP-NEXT:    s_cbranch_execz .LBB5_2
3533; GFX1232_DPP-NEXT:  ; %bb.1:
3534; GFX1232_DPP-NEXT:    v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4
3535; GFX1232_DPP-NEXT:    s_mov_b32 s7, 0x31016000
3536; GFX1232_DPP-NEXT:    s_wait_kmcnt 0x0
3537; GFX1232_DPP-NEXT:    s_mov_b32 s4, s2
3538; GFX1232_DPP-NEXT:    s_mov_b32 s5, s3
3539; GFX1232_DPP-NEXT:    buffer_atomic_add_u64 v[8:9], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
3540; GFX1232_DPP-NEXT:    s_wait_loadcnt 0x0
3541; GFX1232_DPP-NEXT:    global_inv scope:SCOPE_DEV
3542; GFX1232_DPP-NEXT:  .LBB5_2:
3543; GFX1232_DPP-NEXT:    s_wait_alu 0xfffe
3544; GFX1232_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s8
3545; GFX1232_DPP-NEXT:    s_wait_kmcnt 0x0
3546; GFX1232_DPP-NEXT:    v_readfirstlane_b32 s2, v8
3547; GFX1232_DPP-NEXT:    v_mov_b32_e32 v10, v6
3548; GFX1232_DPP-NEXT:    v_mov_b32_e32 v11, v7
3549; GFX1232_DPP-NEXT:    v_readfirstlane_b32 s3, v9
3550; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3551; GFX1232_DPP-NEXT:    v_add_co_u32 v8, vcc_lo, s2, v10
3552; GFX1232_DPP-NEXT:    s_mov_b32 s2, s6
3553; GFX1232_DPP-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo
3554; GFX1232_DPP-NEXT:    s_mov_b32 s3, 0x31016000
3555; GFX1232_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[0:3], null
3556; GFX1232_DPP-NEXT:    s_endpgm
3557entry:
3558  %lane = call i32 @llvm.amdgcn.workitem.id.x()
3559  %zext = zext i32 %lane to i64
3560  %old = atomicrmw add ptr addrspace(1) %inout, i64 %zext syncscope("agent") acq_rel
3561  store i64 %old, ptr addrspace(1) %out
3562  ret void
3563}
3564
; Test kernel sub_i32_constant - an agent-scope, acq_rel `atomicrmw sub` of the
; literal 5 on %inout, whose returned old value is stored to %out.
; In the assertions below, only the lane whose mbcnt over the saved exec mask
; is 0 issues a single buffer_atomic_sub of popcount(exec) * 5; afterwards each
; lane subtracts 5 * mbcnt from the readfirstlane of that result and stores it.
; The assertions are autogenerated (see the NOTE at the top of the file), so
; regenerate them with the script rather than editing them by hand.
3565define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace(1) %inout) {
3566; GFX7LESS-LABEL: sub_i32_constant:
3567; GFX7LESS:       ; %bb.0: ; %entry
3568; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
3569; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3570; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
3571; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
3572; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3573; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
3574; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3575; GFX7LESS-NEXT:    s_cbranch_execz .LBB6_2
3576; GFX7LESS-NEXT:  ; %bb.1:
3577; GFX7LESS-NEXT:    s_mov_b32 s11, 0xf000
3578; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
3579; GFX7LESS-NEXT:    s_mul_i32 s6, s6, 5
3580; GFX7LESS-NEXT:    s_mov_b32 s10, -1
3581; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3582; GFX7LESS-NEXT:    s_mov_b32 s8, s2
3583; GFX7LESS-NEXT:    s_mov_b32 s9, s3
3584; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
3585; GFX7LESS-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
3586; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
3587; GFX7LESS-NEXT:    buffer_wbinvl1
3588; GFX7LESS-NEXT:  .LBB6_2:
3589; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
3590; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3591; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3592; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3593; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
3594; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
3595; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
3596; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3597; GFX7LESS-NEXT:    s_endpgm
3598;
3599; GFX8-LABEL: sub_i32_constant:
3600; GFX8:       ; %bb.0: ; %entry
3601; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3602; GFX8-NEXT:    s_mov_b64 s[6:7], exec
3603; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
3604; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
3605; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3606; GFX8-NEXT:    ; implicit-def: $vgpr1
3607; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3608; GFX8-NEXT:    s_cbranch_execz .LBB6_2
3609; GFX8-NEXT:  ; %bb.1:
3610; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3611; GFX8-NEXT:    s_mov_b32 s8, s2
3612; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
3613; GFX8-NEXT:    s_mul_i32 s2, s2, 5
3614; GFX8-NEXT:    s_mov_b32 s11, 0xf000
3615; GFX8-NEXT:    s_mov_b32 s10, -1
3616; GFX8-NEXT:    s_mov_b32 s9, s3
3617; GFX8-NEXT:    v_mov_b32_e32 v1, s2
3618; GFX8-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
3619; GFX8-NEXT:    s_waitcnt vmcnt(0)
3620; GFX8-NEXT:    buffer_wbinvl1_vol
3621; GFX8-NEXT:  .LBB6_2:
3622; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
3623; GFX8-NEXT:    v_readfirstlane_b32 s4, v1
3624; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
3625; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3626; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3627; GFX8-NEXT:    s_mov_b32 s2, -1
3628; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
3629; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3630; GFX8-NEXT:    s_endpgm
3631;
3632; GFX9-LABEL: sub_i32_constant:
3633; GFX9:       ; %bb.0: ; %entry
3634; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3635; GFX9-NEXT:    s_mov_b64 s[6:7], exec
3636; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
3637; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
3638; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3639; GFX9-NEXT:    ; implicit-def: $vgpr1
3640; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3641; GFX9-NEXT:    s_cbranch_execz .LBB6_2
3642; GFX9-NEXT:  ; %bb.1:
3643; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3644; GFX9-NEXT:    s_mov_b32 s8, s2
3645; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
3646; GFX9-NEXT:    s_mul_i32 s2, s2, 5
3647; GFX9-NEXT:    s_mov_b32 s11, 0xf000
3648; GFX9-NEXT:    s_mov_b32 s10, -1
3649; GFX9-NEXT:    s_mov_b32 s9, s3
3650; GFX9-NEXT:    v_mov_b32_e32 v1, s2
3651; GFX9-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
3652; GFX9-NEXT:    s_waitcnt vmcnt(0)
3653; GFX9-NEXT:    buffer_wbinvl1_vol
3654; GFX9-NEXT:  .LBB6_2:
3655; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
3656; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
3657; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
3658; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3659; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3660; GFX9-NEXT:    s_mov_b32 s2, -1
3661; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
3662; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3663; GFX9-NEXT:    s_endpgm
3664;
3665; GFX1064-LABEL: sub_i32_constant:
3666; GFX1064:       ; %bb.0: ; %entry
3667; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3668; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
3669; GFX1064-NEXT:    ; implicit-def: $vgpr1
3670; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
3671; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
3672; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3673; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3674; GFX1064-NEXT:    s_cbranch_execz .LBB6_2
3675; GFX1064-NEXT:  ; %bb.1:
3676; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
3677; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
3678; GFX1064-NEXT:    s_mul_i32 s6, s6, 5
3679; GFX1064-NEXT:    s_mov_b32 s10, -1
3680; GFX1064-NEXT:    v_mov_b32_e32 v1, s6
3681; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3682; GFX1064-NEXT:    s_mov_b32 s8, s2
3683; GFX1064-NEXT:    s_mov_b32 s9, s3
3684; GFX1064-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
3685; GFX1064-NEXT:    s_waitcnt vmcnt(0)
3686; GFX1064-NEXT:    buffer_gl1_inv
3687; GFX1064-NEXT:    buffer_gl0_inv
3688; GFX1064-NEXT:  .LBB6_2:
3689; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3690; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
3691; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3692; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
3693; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
3694; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3695; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
3696; GFX1064-NEXT:    s_mov_b32 s2, -1
3697; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3698; GFX1064-NEXT:    s_endpgm
3699;
3700; GFX1032-LABEL: sub_i32_constant:
3701; GFX1032:       ; %bb.0: ; %entry
3702; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3703; GFX1032-NEXT:    s_mov_b32 s6, exec_lo
3704; GFX1032-NEXT:    ; implicit-def: $vgpr1
3705; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
3706; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3707; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
3708; GFX1032-NEXT:    s_cbranch_execz .LBB6_2
3709; GFX1032-NEXT:  ; %bb.1:
3710; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s6
3711; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
3712; GFX1032-NEXT:    s_mul_i32 s5, s5, 5
3713; GFX1032-NEXT:    s_mov_b32 s10, -1
3714; GFX1032-NEXT:    v_mov_b32_e32 v1, s5
3715; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3716; GFX1032-NEXT:    s_mov_b32 s8, s2
3717; GFX1032-NEXT:    s_mov_b32 s9, s3
3718; GFX1032-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
3719; GFX1032-NEXT:    s_waitcnt vmcnt(0)
3720; GFX1032-NEXT:    buffer_gl1_inv
3721; GFX1032-NEXT:    buffer_gl0_inv
3722; GFX1032-NEXT:  .LBB6_2:
3723; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3724; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3725; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3726; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
3727; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
3728; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3729; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
3730; GFX1032-NEXT:    s_mov_b32 s2, -1
3731; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3732; GFX1032-NEXT:    s_endpgm
3733;
3734; GFX1164-LABEL: sub_i32_constant:
3735; GFX1164:       ; %bb.0: ; %entry
3736; GFX1164-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3737; GFX1164-NEXT:    s_mov_b64 s[6:7], exec
3738; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
3739; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
3740; GFX1164-NEXT:    ; implicit-def: $vgpr1
3741; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3742; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
3743; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
3744; GFX1164-NEXT:    s_cbranch_execz .LBB6_2
3745; GFX1164-NEXT:  ; %bb.1:
3746; GFX1164-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
3747; GFX1164-NEXT:    s_mov_b32 s11, 0x31016000
3748; GFX1164-NEXT:    s_mul_i32 s6, s6, 5
3749; GFX1164-NEXT:    s_mov_b32 s10, -1
3750; GFX1164-NEXT:    v_mov_b32_e32 v1, s6
3751; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
3752; GFX1164-NEXT:    s_mov_b32 s8, s2
3753; GFX1164-NEXT:    s_mov_b32 s9, s3
3754; GFX1164-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
3755; GFX1164-NEXT:    s_waitcnt vmcnt(0)
3756; GFX1164-NEXT:    buffer_gl1_inv
3757; GFX1164-NEXT:    buffer_gl0_inv
3758; GFX1164-NEXT:  .LBB6_2:
3759; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
3760; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
3761; GFX1164-NEXT:    v_readfirstlane_b32 s2, v1
3762; GFX1164-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
3763; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
3764; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3765; GFX1164-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
3766; GFX1164-NEXT:    s_mov_b32 s2, -1
3767; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
3768; GFX1164-NEXT:    s_endpgm
3769;
3770; GFX1132-LABEL: sub_i32_constant:
3771; GFX1132:       ; %bb.0: ; %entry
3772; GFX1132-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3773; GFX1132-NEXT:    s_mov_b32 s6, exec_lo
3774; GFX1132-NEXT:    s_mov_b32 s4, exec_lo
3775; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
3776; GFX1132-NEXT:    ; implicit-def: $vgpr1
3777; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3778; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
3779; GFX1132-NEXT:    s_cbranch_execz .LBB6_2
3780; GFX1132-NEXT:  ; %bb.1:
3781; GFX1132-NEXT:    s_bcnt1_i32_b32 s5, s6
3782; GFX1132-NEXT:    s_mov_b32 s11, 0x31016000
3783; GFX1132-NEXT:    s_mul_i32 s5, s5, 5
3784; GFX1132-NEXT:    s_mov_b32 s10, -1
3785; GFX1132-NEXT:    v_mov_b32_e32 v1, s5
3786; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
3787; GFX1132-NEXT:    s_mov_b32 s8, s2
3788; GFX1132-NEXT:    s_mov_b32 s9, s3
3789; GFX1132-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
3790; GFX1132-NEXT:    s_waitcnt vmcnt(0)
3791; GFX1132-NEXT:    buffer_gl1_inv
3792; GFX1132-NEXT:    buffer_gl0_inv
3793; GFX1132-NEXT:  .LBB6_2:
3794; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3795; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
3796; GFX1132-NEXT:    v_readfirstlane_b32 s2, v1
3797; GFX1132-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
3798; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
3799; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3800; GFX1132-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
3801; GFX1132-NEXT:    s_mov_b32 s2, -1
3802; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
3803; GFX1132-NEXT:    s_endpgm
3804;
3805; GFX1264-LABEL: sub_i32_constant:
3806; GFX1264:       ; %bb.0: ; %entry
3807; GFX1264-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3808; GFX1264-NEXT:    s_mov_b64 s[6:7], exec
3809; GFX1264-NEXT:    s_mov_b64 s[4:5], exec
3810; GFX1264-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
3811; GFX1264-NEXT:    ; implicit-def: $vgpr1
3812; GFX1264-NEXT:    s_wait_alu 0xfffe
3813; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3814; GFX1264-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
3815; GFX1264-NEXT:    v_cmpx_eq_u32_e32 0, v0
3816; GFX1264-NEXT:    s_cbranch_execz .LBB6_2
3817; GFX1264-NEXT:  ; %bb.1:
3818; GFX1264-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
3819; GFX1264-NEXT:    s_mov_b32 s11, 0x31016000
3820; GFX1264-NEXT:    s_wait_alu 0xfffe
3821; GFX1264-NEXT:    s_mul_i32 s6, s6, 5
3822; GFX1264-NEXT:    s_mov_b32 s10, -1
3823; GFX1264-NEXT:    s_wait_alu 0xfffe
3824; GFX1264-NEXT:    v_mov_b32_e32 v1, s6
3825; GFX1264-NEXT:    s_wait_kmcnt 0x0
3826; GFX1264-NEXT:    s_mov_b32 s8, s2
3827; GFX1264-NEXT:    s_mov_b32 s9, s3
3828; GFX1264-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
3829; GFX1264-NEXT:    s_wait_loadcnt 0x0
3830; GFX1264-NEXT:    global_inv scope:SCOPE_DEV
3831; GFX1264-NEXT:  .LBB6_2:
3832; GFX1264-NEXT:    s_or_b64 exec, exec, s[4:5]
3833; GFX1264-NEXT:    s_wait_kmcnt 0x0
3834; GFX1264-NEXT:    v_readfirstlane_b32 s2, v1
3835; GFX1264-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
3836; GFX1264-NEXT:    s_mov_b32 s3, 0x31016000
3837; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3838; GFX1264-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
3839; GFX1264-NEXT:    s_mov_b32 s2, -1
3840; GFX1264-NEXT:    buffer_store_b32 v0, off, s[0:3], null
3841; GFX1264-NEXT:    s_endpgm
3842;
3843; GFX1232-LABEL: sub_i32_constant:
3844; GFX1232:       ; %bb.0: ; %entry
3845; GFX1232-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3846; GFX1232-NEXT:    s_mov_b32 s6, exec_lo
3847; GFX1232-NEXT:    s_mov_b32 s4, exec_lo
3848; GFX1232-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
3849; GFX1232-NEXT:    ; implicit-def: $vgpr1
3850; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3851; GFX1232-NEXT:    v_cmpx_eq_u32_e32 0, v0
3852; GFX1232-NEXT:    s_cbranch_execz .LBB6_2
3853; GFX1232-NEXT:  ; %bb.1:
3854; GFX1232-NEXT:    s_bcnt1_i32_b32 s5, s6
3855; GFX1232-NEXT:    s_mov_b32 s11, 0x31016000
3856; GFX1232-NEXT:    s_mul_i32 s5, s5, 5
3857; GFX1232-NEXT:    s_mov_b32 s10, -1
3858; GFX1232-NEXT:    v_mov_b32_e32 v1, s5
3859; GFX1232-NEXT:    s_wait_kmcnt 0x0
3860; GFX1232-NEXT:    s_mov_b32 s8, s2
3861; GFX1232-NEXT:    s_mov_b32 s9, s3
3862; GFX1232-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
3863; GFX1232-NEXT:    s_wait_loadcnt 0x0
3864; GFX1232-NEXT:    global_inv scope:SCOPE_DEV
3865; GFX1232-NEXT:  .LBB6_2:
3866; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3867; GFX1232-NEXT:    s_wait_kmcnt 0x0
3868; GFX1232-NEXT:    v_readfirstlane_b32 s2, v1
3869; GFX1232-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
3870; GFX1232-NEXT:    s_mov_b32 s3, 0x31016000
3871; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3872; GFX1232-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
3873; GFX1232-NEXT:    s_mov_b32 s2, -1
3874; GFX1232-NEXT:    buffer_store_b32 v0, off, s[0:3], null
3875; GFX1232-NEXT:    s_endpgm
3876entry:
  ; The subtrahend is the immediate 5; %old receives the value returned by the atomic.
3877  %old = atomicrmw sub ptr addrspace(1) %inout, i32 5 syncscope("agent") acq_rel
3878  store i32 %old, ptr addrspace(1) %out
3879  ret void
3880}
3881
; Test kernel sub_i32_uniform - an agent-scope, acq_rel `atomicrmw sub` on
; %inout whose operand is the i32 kernel argument %subitive; the returned old
; value is stored to %out.
; In the assertions below, %subitive is fetched with a scalar load, and only
; the lane whose mbcnt over the saved exec mask is 0 issues a single
; buffer_atomic_sub of %subitive * popcount(exec); afterwards each lane
; subtracts %subitive * mbcnt from the readfirstlane of that result and stores
; it.
; The assertions are autogenerated (see the NOTE at the top of the file), so
; regenerate them with the script rather than editing them by hand.
3882define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(1) %inout, i32 %subitive) {
3883; GFX7LESS-LABEL: sub_i32_uniform:
3884; GFX7LESS:       ; %bb.0: ; %entry
3885; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
3886; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3887; GFX7LESS-NEXT:    s_load_dword s8, s[4:5], 0xd
3888; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
3889; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
3890; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3891; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
3892; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3893; GFX7LESS-NEXT:    s_cbranch_execz .LBB7_2
3894; GFX7LESS-NEXT:  ; %bb.1:
3895; GFX7LESS-NEXT:    s_mov_b32 s15, 0xf000
3896; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
3897; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3898; GFX7LESS-NEXT:    s_mul_i32 s6, s8, s6
3899; GFX7LESS-NEXT:    s_mov_b32 s14, -1
3900; GFX7LESS-NEXT:    s_mov_b32 s12, s2
3901; GFX7LESS-NEXT:    s_mov_b32 s13, s3
3902; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
3903; GFX7LESS-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
3904; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
3905; GFX7LESS-NEXT:    buffer_wbinvl1
3906; GFX7LESS-NEXT:  .LBB7_2:
3907; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
3908; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3909; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3910; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3911; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
3912; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s8, v0
3913; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
3914; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3915; GFX7LESS-NEXT:    s_endpgm
3916;
3917; GFX8-LABEL: sub_i32_uniform:
3918; GFX8:       ; %bb.0: ; %entry
3919; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3920; GFX8-NEXT:    s_load_dword s8, s[4:5], 0x34
3921; GFX8-NEXT:    s_mov_b64 s[6:7], exec
3922; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
3923; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
3924; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3925; GFX8-NEXT:    ; implicit-def: $vgpr1
3926; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3927; GFX8-NEXT:    s_cbranch_execz .LBB7_2
3928; GFX8-NEXT:  ; %bb.1:
3929; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3930; GFX8-NEXT:    s_mov_b32 s12, s2
3931; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
3932; GFX8-NEXT:    s_mul_i32 s2, s8, s2
3933; GFX8-NEXT:    s_mov_b32 s15, 0xf000
3934; GFX8-NEXT:    s_mov_b32 s14, -1
3935; GFX8-NEXT:    s_mov_b32 s13, s3
3936; GFX8-NEXT:    v_mov_b32_e32 v1, s2
3937; GFX8-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
3938; GFX8-NEXT:    s_waitcnt vmcnt(0)
3939; GFX8-NEXT:    buffer_wbinvl1_vol
3940; GFX8-NEXT:  .LBB7_2:
3941; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
3942; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3943; GFX8-NEXT:    v_mul_lo_u32 v0, s8, v0
3944; GFX8-NEXT:    v_readfirstlane_b32 s4, v1
3945; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3946; GFX8-NEXT:    s_mov_b32 s2, -1
3947; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
3948; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3949; GFX8-NEXT:    s_endpgm
3950;
3951; GFX9-LABEL: sub_i32_uniform:
3952; GFX9:       ; %bb.0: ; %entry
3953; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3954; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x34
3955; GFX9-NEXT:    s_mov_b64 s[6:7], exec
3956; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
3957; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
3958; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3959; GFX9-NEXT:    ; implicit-def: $vgpr1
3960; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3961; GFX9-NEXT:    s_cbranch_execz .LBB7_2
3962; GFX9-NEXT:  ; %bb.1:
3963; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3964; GFX9-NEXT:    s_mov_b32 s12, s2
3965; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
3966; GFX9-NEXT:    s_mul_i32 s2, s8, s2
3967; GFX9-NEXT:    s_mov_b32 s15, 0xf000
3968; GFX9-NEXT:    s_mov_b32 s14, -1
3969; GFX9-NEXT:    s_mov_b32 s13, s3
3970; GFX9-NEXT:    v_mov_b32_e32 v1, s2
3971; GFX9-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
3972; GFX9-NEXT:    s_waitcnt vmcnt(0)
3973; GFX9-NEXT:    buffer_wbinvl1_vol
3974; GFX9-NEXT:  .LBB7_2:
3975; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
3976; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3977; GFX9-NEXT:    v_mul_lo_u32 v0, s8, v0
3978; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
3979; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3980; GFX9-NEXT:    s_mov_b32 s2, -1
3981; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
3982; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3983; GFX9-NEXT:    s_endpgm
3984;
3985; GFX1064-LABEL: sub_i32_uniform:
3986; GFX1064:       ; %bb.0: ; %entry
3987; GFX1064-NEXT:    s_clause 0x1
3988; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3989; GFX1064-NEXT:    s_load_dword s8, s[4:5], 0x34
3990; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
3991; GFX1064-NEXT:    ; implicit-def: $vgpr1
3992; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
3993; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
3994; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3995; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3996; GFX1064-NEXT:    s_cbranch_execz .LBB7_2
3997; GFX1064-NEXT:  ; %bb.1:
3998; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
3999; GFX1064-NEXT:    s_mov_b32 s15, 0x31016000
4000; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4001; GFX1064-NEXT:    s_mul_i32 s6, s8, s6
4002; GFX1064-NEXT:    s_mov_b32 s14, -1
4003; GFX1064-NEXT:    v_mov_b32_e32 v1, s6
4004; GFX1064-NEXT:    s_mov_b32 s12, s2
4005; GFX1064-NEXT:    s_mov_b32 s13, s3
4006; GFX1064-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
4007; GFX1064-NEXT:    s_waitcnt vmcnt(0)
4008; GFX1064-NEXT:    buffer_gl1_inv
4009; GFX1064-NEXT:    buffer_gl0_inv
4010; GFX1064-NEXT:  .LBB7_2:
4011; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
4012; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
4013; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
4014; GFX1064-NEXT:    v_mul_lo_u32 v0, s8, v0
4015; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
4016; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
4017; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
4018; GFX1064-NEXT:    s_mov_b32 s2, -1
4019; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4020; GFX1064-NEXT:    s_endpgm
4021;
4022; GFX1032-LABEL: sub_i32_uniform:
4023; GFX1032:       ; %bb.0: ; %entry
4024; GFX1032-NEXT:    s_clause 0x1
4025; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4026; GFX1032-NEXT:    s_load_dword s6, s[4:5], 0x34
4027; GFX1032-NEXT:    s_mov_b32 s7, exec_lo
4028; GFX1032-NEXT:    ; implicit-def: $vgpr1
4029; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s7, 0
4030; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4031; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
4032; GFX1032-NEXT:    s_cbranch_execz .LBB7_2
4033; GFX1032-NEXT:  ; %bb.1:
4034; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s7
4035; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
4036; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4037; GFX1032-NEXT:    s_mul_i32 s5, s6, s5
4038; GFX1032-NEXT:    s_mov_b32 s10, -1
4039; GFX1032-NEXT:    v_mov_b32_e32 v1, s5
4040; GFX1032-NEXT:    s_mov_b32 s8, s2
4041; GFX1032-NEXT:    s_mov_b32 s9, s3
4042; GFX1032-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
4043; GFX1032-NEXT:    s_waitcnt vmcnt(0)
4044; GFX1032-NEXT:    buffer_gl1_inv
4045; GFX1032-NEXT:    buffer_gl0_inv
4046; GFX1032-NEXT:  .LBB7_2:
4047; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
4048; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
4049; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
4050; GFX1032-NEXT:    v_mul_lo_u32 v0, s6, v0
4051; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
4052; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
4053; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
4054; GFX1032-NEXT:    s_mov_b32 s2, -1
4055; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4056; GFX1032-NEXT:    s_endpgm
4057;
4058; GFX1164-LABEL: sub_i32_uniform:
4059; GFX1164:       ; %bb.0: ; %entry
4060; GFX1164-NEXT:    s_clause 0x1
4061; GFX1164-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
4062; GFX1164-NEXT:    s_load_b32 s8, s[4:5], 0x34
4063; GFX1164-NEXT:    s_mov_b64 s[6:7], exec
4064; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
4065; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
4066; GFX1164-NEXT:    ; implicit-def: $vgpr1
4067; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4068; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
4069; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
4070; GFX1164-NEXT:    s_cbranch_execz .LBB7_2
4071; GFX1164-NEXT:  ; %bb.1:
4072; GFX1164-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
4073; GFX1164-NEXT:    s_mov_b32 s15, 0x31016000
4074; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
4075; GFX1164-NEXT:    s_mul_i32 s6, s8, s6
4076; GFX1164-NEXT:    s_mov_b32 s14, -1
4077; GFX1164-NEXT:    v_mov_b32_e32 v1, s6
4078; GFX1164-NEXT:    s_mov_b32 s12, s2
4079; GFX1164-NEXT:    s_mov_b32 s13, s3
4080; GFX1164-NEXT:    buffer_atomic_sub_u32 v1, off, s[12:15], 0 glc
4081; GFX1164-NEXT:    s_waitcnt vmcnt(0)
4082; GFX1164-NEXT:    buffer_gl1_inv
4083; GFX1164-NEXT:    buffer_gl0_inv
4084; GFX1164-NEXT:  .LBB7_2:
4085; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
4086; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
4087; GFX1164-NEXT:    v_mul_lo_u32 v0, s8, v0
4088; GFX1164-NEXT:    v_readfirstlane_b32 s2, v1
4089; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
4090; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4091; GFX1164-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
4092; GFX1164-NEXT:    s_mov_b32 s2, -1
4093; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
4094; GFX1164-NEXT:    s_endpgm
4095;
4096; GFX1132-LABEL: sub_i32_uniform:
4097; GFX1132:       ; %bb.0: ; %entry
4098; GFX1132-NEXT:    s_clause 0x1
4099; GFX1132-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
4100; GFX1132-NEXT:    s_load_b32 s4, s[4:5], 0x34
4101; GFX1132-NEXT:    s_mov_b32 s6, exec_lo
4102; GFX1132-NEXT:    s_mov_b32 s5, exec_lo
4103; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
4104; GFX1132-NEXT:    ; implicit-def: $vgpr1
4105; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4106; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
4107; GFX1132-NEXT:    s_cbranch_execz .LBB7_2
4108; GFX1132-NEXT:  ; %bb.1:
4109; GFX1132-NEXT:    s_bcnt1_i32_b32 s6, s6
4110; GFX1132-NEXT:    s_mov_b32 s11, 0x31016000
4111; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
4112; GFX1132-NEXT:    s_mul_i32 s6, s4, s6
4113; GFX1132-NEXT:    s_mov_b32 s10, -1
4114; GFX1132-NEXT:    v_mov_b32_e32 v1, s6
4115; GFX1132-NEXT:    s_mov_b32 s8, s2
4116; GFX1132-NEXT:    s_mov_b32 s9, s3
4117; GFX1132-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
4118; GFX1132-NEXT:    s_waitcnt vmcnt(0)
4119; GFX1132-NEXT:    buffer_gl1_inv
4120; GFX1132-NEXT:    buffer_gl0_inv
4121; GFX1132-NEXT:  .LBB7_2:
4122; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s5
4123; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
4124; GFX1132-NEXT:    v_mul_lo_u32 v0, s4, v0
4125; GFX1132-NEXT:    v_readfirstlane_b32 s2, v1
4126; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
4127; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4128; GFX1132-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
4129; GFX1132-NEXT:    s_mov_b32 s2, -1
4130; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
4131; GFX1132-NEXT:    s_endpgm
4132;
4133; GFX1264-LABEL: sub_i32_uniform:
4134; GFX1264:       ; %bb.0: ; %entry
4135; GFX1264-NEXT:    s_clause 0x1
4136; GFX1264-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
4137; GFX1264-NEXT:    s_load_b32 s8, s[4:5], 0x34
4138; GFX1264-NEXT:    s_mov_b64 s[6:7], exec
4139; GFX1264-NEXT:    s_mov_b64 s[4:5], exec
4140; GFX1264-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
4141; GFX1264-NEXT:    ; implicit-def: $vgpr1
4142; GFX1264-NEXT:    s_wait_alu 0xfffe
4143; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4144; GFX1264-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
4145; GFX1264-NEXT:    v_cmpx_eq_u32_e32 0, v0
4146; GFX1264-NEXT:    s_cbranch_execz .LBB7_2
4147; GFX1264-NEXT:  ; %bb.1:
4148; GFX1264-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
4149; GFX1264-NEXT:    s_mov_b32 s15, 0x31016000
4150; GFX1264-NEXT:    s_wait_kmcnt 0x0
4151; GFX1264-NEXT:    s_wait_alu 0xfffe
4152; GFX1264-NEXT:    s_mul_i32 s6, s8, s6
4153; GFX1264-NEXT:    s_mov_b32 s14, -1
4154; GFX1264-NEXT:    s_wait_alu 0xfffe
4155; GFX1264-NEXT:    v_mov_b32_e32 v1, s6
4156; GFX1264-NEXT:    s_mov_b32 s12, s2
4157; GFX1264-NEXT:    s_mov_b32 s13, s3
4158; GFX1264-NEXT:    buffer_atomic_sub_u32 v1, off, s[12:15], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
4159; GFX1264-NEXT:    s_wait_loadcnt 0x0
4160; GFX1264-NEXT:    global_inv scope:SCOPE_DEV
4161; GFX1264-NEXT:  .LBB7_2:
4162; GFX1264-NEXT:    s_or_b64 exec, exec, s[4:5]
4163; GFX1264-NEXT:    s_wait_kmcnt 0x0
4164; GFX1264-NEXT:    v_mul_lo_u32 v0, s8, v0
4165; GFX1264-NEXT:    v_readfirstlane_b32 s2, v1
4166; GFX1264-NEXT:    s_mov_b32 s3, 0x31016000
4167; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4168; GFX1264-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
4169; GFX1264-NEXT:    s_mov_b32 s2, -1
4170; GFX1264-NEXT:    buffer_store_b32 v0, off, s[0:3], null
4171; GFX1264-NEXT:    s_endpgm
4172;
4173; GFX1232-LABEL: sub_i32_uniform:
4174; GFX1232:       ; %bb.0: ; %entry
4175; GFX1232-NEXT:    s_clause 0x1
4176; GFX1232-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
4177; GFX1232-NEXT:    s_load_b32 s4, s[4:5], 0x34
4178; GFX1232-NEXT:    s_mov_b32 s6, exec_lo
4179; GFX1232-NEXT:    s_mov_b32 s5, exec_lo
4180; GFX1232-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
4181; GFX1232-NEXT:    ; implicit-def: $vgpr1
4182; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4183; GFX1232-NEXT:    v_cmpx_eq_u32_e32 0, v0
4184; GFX1232-NEXT:    s_cbranch_execz .LBB7_2
4185; GFX1232-NEXT:  ; %bb.1:
4186; GFX1232-NEXT:    s_wait_alu 0xfffe
4187; GFX1232-NEXT:    s_bcnt1_i32_b32 s6, s6
4188; GFX1232-NEXT:    s_mov_b32 s11, 0x31016000
4189; GFX1232-NEXT:    s_wait_kmcnt 0x0
4190; GFX1232-NEXT:    s_wait_alu 0xfffe
4191; GFX1232-NEXT:    s_mul_i32 s6, s4, s6
4192; GFX1232-NEXT:    s_mov_b32 s10, -1
4193; GFX1232-NEXT:    s_wait_alu 0xfffe
4194; GFX1232-NEXT:    v_mov_b32_e32 v1, s6
4195; GFX1232-NEXT:    s_mov_b32 s8, s2
4196; GFX1232-NEXT:    s_mov_b32 s9, s3
4197; GFX1232-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
4198; GFX1232-NEXT:    s_wait_loadcnt 0x0
4199; GFX1232-NEXT:    global_inv scope:SCOPE_DEV
4200; GFX1232-NEXT:  .LBB7_2:
4201; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s5
4202; GFX1232-NEXT:    s_wait_kmcnt 0x0
4203; GFX1232-NEXT:    v_mul_lo_u32 v0, s4, v0
4204; GFX1232-NEXT:    v_readfirstlane_b32 s2, v1
4205; GFX1232-NEXT:    s_mov_b32 s3, 0x31016000
4206; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4207; GFX1232-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
4208; GFX1232-NEXT:    s_mov_b32 s2, -1
4209; GFX1232-NEXT:    buffer_store_b32 v0, off, s[0:3], null
4210; GFX1232-NEXT:    s_endpgm
4211entry:
  ; The subtrahend is the kernel argument %subitive; %old receives the value returned by the atomic.
4212  %old = atomicrmw sub ptr addrspace(1) %inout, i32 %subitive syncscope("agent") acq_rel
4213  store i32 %old, ptr addrspace(1) %out
4214  ret void
4215}
4216
4217define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(1) %inout) {
4218; GFX7LESS_ITERATIVE-LABEL: sub_i32_varying:
4219; GFX7LESS_ITERATIVE:       ; %bb.0: ; %entry
4220; GFX7LESS_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
4221; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s6, 0
4222; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
4223; GFX7LESS_ITERATIVE-NEXT:  .LBB8_1: ; %ComputeLoop
4224; GFX7LESS_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
4225; GFX7LESS_ITERATIVE-NEXT:    s_ff1_i32_b64 s2, s[0:1]
4226; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, s2
4227; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s7, v0, s2
4228; GFX7LESS_ITERATIVE-NEXT:    v_writelane_b32 v1, s6, m0
4229; GFX7LESS_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
4230; GFX7LESS_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
4231; GFX7LESS_ITERATIVE-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
4232; GFX7LESS_ITERATIVE-NEXT:    s_and_b64 vcc, exec, s[2:3]
4233; GFX7LESS_ITERATIVE-NEXT:    s_add_i32 s6, s6, s7
4234; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_vccnz .LBB8_1
4235; GFX7LESS_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
4236; GFX7LESS_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
4237; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4238; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
4239; GFX7LESS_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4240; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
4241; GFX7LESS_ITERATIVE-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4242; GFX7LESS_ITERATIVE-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
4243; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_execz .LBB8_4
4244; GFX7LESS_ITERATIVE-NEXT:  ; %bb.3:
4245; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s11, 0xf000
4246; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s10, -1
4247; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
4248; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s8, s2
4249; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s9, s3
4250; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s6
4251; GFX7LESS_ITERATIVE-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
4252; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
4253; GFX7LESS_ITERATIVE-NEXT:    buffer_wbinvl1
4254; GFX7LESS_ITERATIVE-NEXT:  .LBB8_4:
4255; GFX7LESS_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[4:5]
4256; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
4257; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
4258; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s2, -1
4259; GFX7LESS_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v0
4260; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt expcnt(0)
4261; GFX7LESS_ITERATIVE-NEXT:    v_sub_i32_e32 v0, vcc, s4, v1
4262; GFX7LESS_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4263; GFX7LESS_ITERATIVE-NEXT:    s_endpgm
4264;
; Expected tonga (-mcpu=tonga) output for sub_i32_varying with the Iterative
; optimizer strategy. A scalar loop gives each active lane the sum of the inputs of
; the lanes visited before it (v1) and leaves the total in s6; a single
; buffer_atomic_sub is issued with that total, and every lane finally stores
; readfirstlane(v0) - v1.
4265; GFX8_ITERATIVE-LABEL: sub_i32_varying:
4266; GFX8_ITERATIVE:       ; %bb.0: ; %entry
4267; GFX8_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
4268; GFX8_ITERATIVE-NEXT:    s_mov_b32 s6, 0
4269; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
; Loop body: pick the lowest set bit of the remaining mask s[0:1], read that lane's
; input from v0, write the sum so far (s6) into that lane of v1 (lane select goes
; through m0 on this target), add the input to s6, then clear the bit.
4270; GFX8_ITERATIVE-NEXT:  .LBB8_1: ; %ComputeLoop
4271; GFX8_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
4272; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s2, s[0:1]
4273; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s2
4274; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s7, v0, s2
4275; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
4276; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v1, s6, m0
4277; GFX8_ITERATIVE-NEXT:    s_add_i32 s6, s6, s7
4278; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
4279; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
4280; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB8_1
; After the loop: the mbcnt pair is compared against 0 so that only lanes with an
; mbcnt result of 0 enter %bb.3 and perform the atomic.
4281; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
4282; GFX8_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4283; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4284; GFX8_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4285; GFX8_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4286; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
4287; GFX8_ITERATIVE-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4288; GFX8_ITERATIVE-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
4289; GFX8_ITERATIVE-NEXT:    s_cbranch_execz .LBB8_4
; Atomic block: the descriptor s[8:11] is built from s[2:3] of the loaded kernel
; arguments plus two constants; the operand is the wave total from s6.
4290; GFX8_ITERATIVE-NEXT:  ; %bb.3:
4291; GFX8_ITERATIVE-NEXT:    s_mov_b32 s11, 0xf000
4292; GFX8_ITERATIVE-NEXT:    s_mov_b32 s10, -1
4293; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
4294; GFX8_ITERATIVE-NEXT:    s_mov_b32 s8, s2
4295; GFX8_ITERATIVE-NEXT:    s_mov_b32 s9, s3
4296; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s6
4297; GFX8_ITERATIVE-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
4298; GFX8_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
4299; GFX8_ITERATIVE-NEXT:    buffer_wbinvl1_vol
; Rejoin: restore exec, move the first active lane's v0 into s4, subtract each lane's
; v1 from it, and store the result through s[0:3].
4300; GFX8_ITERATIVE-NEXT:  .LBB8_4:
4301; GFX8_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[4:5]
4302; GFX8_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v0
4303; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
4304; GFX8_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
4305; GFX8_ITERATIVE-NEXT:    s_mov_b32 s2, -1
4306; GFX8_ITERATIVE-NEXT:    v_sub_u32_e32 v0, vcc, s4, v1
4307; GFX8_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4308; GFX8_ITERATIVE-NEXT:    s_endpgm
4309;
; Expected gfx900 output for sub_i32_varying with the Iterative optimizer strategy.
; Same shape as the other wave64 iterative expectations in this file: scalar loop
; producing per-lane partial sums in v1 and a total in s6, one buffer_atomic_sub,
; then a per-lane subtract and store. The final subtract here has no vcc operand.
4310; GFX9_ITERATIVE-LABEL: sub_i32_varying:
4311; GFX9_ITERATIVE:       ; %bb.0: ; %entry
4312; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
4313; GFX9_ITERATIVE-NEXT:    s_mov_b32 s6, 0
4314; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
; Loop body: lowest set bit of s[0:1] selects a lane; its input is read from v0, the
; sum so far (s6) is written to that lane of v1 via m0, s6 is updated, the bit cleared.
4315; GFX9_ITERATIVE-NEXT:  .LBB8_1: ; %ComputeLoop
4316; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
4317; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s2, s[0:1]
4318; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s2
4319; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s7, v0, s2
4320; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
4321; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s6, m0
4322; GFX9_ITERATIVE-NEXT:    s_add_i32 s6, s6, s7
4323; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
4324; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
4325; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB8_1
; After the loop: only lanes whose mbcnt result is 0 go on to %bb.3.
4326; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
4327; GFX9_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4328; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4329; GFX9_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4330; GFX9_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4331; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
4332; GFX9_ITERATIVE-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4333; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
4334; GFX9_ITERATIVE-NEXT:    s_cbranch_execz .LBB8_4
; Atomic block: operand is the wave total from s6; descriptor base comes from s[2:3].
4335; GFX9_ITERATIVE-NEXT:  ; %bb.3:
4336; GFX9_ITERATIVE-NEXT:    s_mov_b32 s11, 0xf000
4337; GFX9_ITERATIVE-NEXT:    s_mov_b32 s10, -1
4338; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
4339; GFX9_ITERATIVE-NEXT:    s_mov_b32 s8, s2
4340; GFX9_ITERATIVE-NEXT:    s_mov_b32 s9, s3
4341; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s6
4342; GFX9_ITERATIVE-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
4343; GFX9_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
4344; GFX9_ITERATIVE-NEXT:    buffer_wbinvl1_vol
; Rejoin: restore exec, take v0 from the first active lane, subtract each lane's v1,
; store through s[0:3].
4345; GFX9_ITERATIVE-NEXT:  .LBB8_4:
4346; GFX9_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[4:5]
4347; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v0
4348; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
4349; GFX9_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
4350; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, -1
4351; GFX9_ITERATIVE-NEXT:    v_sub_u32_e32 v0, s4, v1
4352; GFX9_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4353; GFX9_ITERATIVE-NEXT:    s_endpgm
4354;
; Expected gfx1010 wave64 output for sub_i32_varying with the Iterative optimizer
; strategy. Scalar loop over a 64-bit lane mask builds per-lane partial sums in v1
; and the total in s6; one buffer_atomic_sub uses the total; each lane then stores
; readfirstlane(v0) - v1. Unlike the gfx8/gfx9 expectations, v_writelane takes the
; lane index directly from an SGPR (no m0 copy).
4355; GFX1064_ITERATIVE-LABEL: sub_i32_varying:
4356; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
4357; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
4358; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s6, 0
4359; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
; Loop body: s7 = index of lowest set bit in s[0:1]; read that lane of v0 into s8,
; write the sum so far (s6) into that lane of v1, clear the bit, add s8 to s6.
4360; GFX1064_ITERATIVE-NEXT:  .LBB8_1: ; %ComputeLoop
4361; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
4362; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s7, s[0:1]
4363; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s7
4364; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s7
4365; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s6, s7
4366; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
4367; GFX1064_ITERATIVE-NEXT:    s_add_i32 s6, s6, s8
4368; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
4369; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB8_1
; After the loop: only lanes whose mbcnt result is 0 go on to %bb.3.
4370; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
4371; GFX1064_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4372; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4373; GFX1064_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4374; GFX1064_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4375; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
4376; GFX1064_ITERATIVE-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4377; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
4378; GFX1064_ITERATIVE-NEXT:    s_cbranch_execz .LBB8_4
; Atomic block: operand is the wave total from s6; the atomic is followed by
; buffer_gl1_inv and buffer_gl0_inv on this target.
4379; GFX1064_ITERATIVE-NEXT:  ; %bb.3:
4380; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s6
4381; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
4382; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s10, -1
4383; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
4384; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s8, s2
4385; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s9, s3
4386; GFX1064_ITERATIVE-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
4387; GFX1064_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
4388; GFX1064_ITERATIVE-NEXT:    buffer_gl1_inv
4389; GFX1064_ITERATIVE-NEXT:    buffer_gl0_inv
; Rejoin: restore exec, take v0 from the first active lane into s2, subtract each
; lane's v1, then reuse s2 as part of the store descriptor s[0:3].
4390; GFX1064_ITERATIVE-NEXT:  .LBB8_4:
4391; GFX1064_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
4392; GFX1064_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[4:5]
4393; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
4394; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v0
4395; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
4396; GFX1064_ITERATIVE-NEXT:    v_sub_nc_u32_e32 v0, s2, v1
4397; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, -1
4398; GFX1064_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4399; GFX1064_ITERATIVE-NEXT:    s_endpgm
4400;
; Expected gfx1010 wave32 output for sub_i32_varying with the Iterative optimizer
; strategy. Same algorithm as the wave64 expectation, but the lane mask is a single
; 32-bit SGPR (s0, seeded from exec_lo) and only v_mbcnt_lo is used.
4401; GFX1032_ITERATIVE-LABEL: sub_i32_varying:
4402; GFX1032_ITERATIVE:       ; %bb.0: ; %entry
4403; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s0, exec_lo
4404; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s6, 0
4405; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
; Loop body: s1 = index of lowest set bit in s0; read that lane of v0 into s2, write
; the sum so far (s6) into that lane of v1, clear the bit in s0, add s2 to s6.
4406; GFX1032_ITERATIVE-NEXT:  .LBB8_1: ; %ComputeLoop
4407; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
4408; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s1, s0
4409; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s2, v0, s1
4410; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s1
4411; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s6, s1
4412; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s0, s0, s3
4413; GFX1032_ITERATIVE-NEXT:    s_add_i32 s6, s6, s2
4414; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
4415; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB8_1
; After the loop: only lanes whose mbcnt result is 0 go on to %bb.3.
4416; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
4417; GFX1032_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4418; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4419; GFX1032_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4420; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
4421; GFX1032_ITERATIVE-NEXT:    s_and_saveexec_b32 s4, vcc_lo
4422; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s4, exec_lo, s4
4423; GFX1032_ITERATIVE-NEXT:    s_cbranch_execz .LBB8_4
; Atomic block: operand is the wave total from s6.
4424; GFX1032_ITERATIVE-NEXT:  ; %bb.3:
4425; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s6
4426; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
4427; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s10, -1
4428; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
4429; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s8, s2
4430; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s9, s3
4431; GFX1032_ITERATIVE-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
4432; GFX1032_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
4433; GFX1032_ITERATIVE-NEXT:    buffer_gl1_inv
4434; GFX1032_ITERATIVE-NEXT:    buffer_gl0_inv
; Rejoin: restore exec_lo, take v0 from the first active lane, subtract each lane's
; v1, store through s[0:3].
4435; GFX1032_ITERATIVE-NEXT:  .LBB8_4:
4436; GFX1032_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
4437; GFX1032_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s4
4438; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
4439; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v0
4440; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
4441; GFX1032_ITERATIVE-NEXT:    v_sub_nc_u32_e32 v0, s2, v1
4442; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, -1
4443; GFX1032_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4444; GFX1032_ITERATIVE-NEXT:    s_endpgm
4445;
; Expected gfx1100 wave64 output for sub_i32_varying with the Iterative optimizer
; strategy. The per-lane input is v0 & 0x3ff (presumably the workitem-id field of a
; packed v0 -- confirm against the IR, which is outside this excerpt). Register roles
; are swapped relative to the gfx10 expectations: v1 holds the input / atomic result
; and v0 receives the per-lane partial sums.
4446; GFX1164_ITERATIVE-LABEL: sub_i32_varying:
4447; GFX1164_ITERATIVE:       ; %bb.0: ; %entry
4448; GFX1164_ITERATIVE-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
4449; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
4450; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s6, 0
4451; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
; Loop body: s7 = index of lowest set bit in s[0:1] (s_ctz here); read that lane of
; v1 into s8, write the sum so far (s6) into that lane of v0, clear the bit, add s8.
4452; GFX1164_ITERATIVE-NEXT:  .LBB8_1: ; %ComputeLoop
4453; GFX1164_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
4454; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s7, s[0:1]
4455; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
4456; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s8, v1, s7
4457; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s7
4458; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s7
4459; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
4460; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2)
4461; GFX1164_ITERATIVE-NEXT:    s_add_i32 s6, s6, s8
4462; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
4463; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB8_1
; After the loop: only lanes whose mbcnt result is 0 go on to %bb.3.
4464; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
4465; GFX1164_ITERATIVE-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
4466; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
4467; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4468; GFX1164_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
4469; GFX1164_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
4470; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
4471; GFX1164_ITERATIVE-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4472; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
4473; GFX1164_ITERATIVE-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
4474; GFX1164_ITERATIVE-NEXT:    s_cbranch_execz .LBB8_4
; Atomic block: operand is the wave total from s6; mnemonic is buffer_atomic_sub_u32.
4475; GFX1164_ITERATIVE-NEXT:  ; %bb.3:
4476; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v1, s6
4477; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
4478; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s10, -1
4479; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
4480; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s8, s2
4481; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s9, s3
4482; GFX1164_ITERATIVE-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
4483; GFX1164_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
4484; GFX1164_ITERATIVE-NEXT:    buffer_gl1_inv
4485; GFX1164_ITERATIVE-NEXT:    buffer_gl0_inv
; Rejoin: restore exec, take v1 from the first active lane, subtract each lane's
; partial sum (v0), store through s[0:3].
4486; GFX1164_ITERATIVE-NEXT:  .LBB8_4:
4487; GFX1164_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[4:5]
4488; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
4489; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v1
4490; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
4491; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4492; GFX1164_ITERATIVE-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
4493; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
4494; GFX1164_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
4495; GFX1164_ITERATIVE-NEXT:    s_endpgm
4496;
; Expected gfx1100 wave32 output for sub_i32_varying with the Iterative optimizer
; strategy. Same algorithm as the gfx1100 wave64 expectation with a 32-bit lane mask
; in s0 and only v_mbcnt_lo. The per-lane input is v0 & 0x3ff (presumably the
; workitem-id field of a packed v0 -- confirm against the IR).
4497; GFX1132_ITERATIVE-LABEL: sub_i32_varying:
4498; GFX1132_ITERATIVE:       ; %bb.0: ; %entry
4499; GFX1132_ITERATIVE-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
4500; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s0, exec_lo
4501; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s6, 0
4502; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
; Loop body: s1 = index of lowest set bit in s0; read that lane of v1 into s2, write
; the sum so far (s6) into that lane of v0, clear the bit in s0, add s2 to s6.
4503; GFX1132_ITERATIVE-NEXT:  .LBB8_1: ; %ComputeLoop
4504; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
4505; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s1, s0
4506; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
4507; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s2, v1, s1
4508; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s1
4509; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s1
4510; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s0, s0, s3
4511; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2)
4512; GFX1132_ITERATIVE-NEXT:    s_add_i32 s6, s6, s2
4513; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
4514; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB8_1
; After the loop: only lanes whose mbcnt result is 0 go on to %bb.3.
4515; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
4516; GFX1132_ITERATIVE-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
4517; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
4518; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
4519; GFX1132_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
4520; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
4521; GFX1132_ITERATIVE-NEXT:    s_and_saveexec_b32 s4, vcc_lo
4522; GFX1132_ITERATIVE-NEXT:    s_xor_b32 s4, exec_lo, s4
4523; GFX1132_ITERATIVE-NEXT:    s_cbranch_execz .LBB8_4
; Atomic block: operand is the wave total from s6.
4524; GFX1132_ITERATIVE-NEXT:  ; %bb.3:
4525; GFX1132_ITERATIVE-NEXT:    v_mov_b32_e32 v1, s6
4526; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
4527; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s10, -1
4528; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
4529; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s8, s2
4530; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s9, s3
4531; GFX1132_ITERATIVE-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
4532; GFX1132_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
4533; GFX1132_ITERATIVE-NEXT:    buffer_gl1_inv
4534; GFX1132_ITERATIVE-NEXT:    buffer_gl0_inv
; Rejoin: restore exec_lo, take v1 from the first active lane, subtract each lane's
; partial sum (v0), store through s[0:3].
4535; GFX1132_ITERATIVE-NEXT:  .LBB8_4:
4536; GFX1132_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s4
4537; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
4538; GFX1132_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v1
4539; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
4540; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4541; GFX1132_ITERATIVE-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
4542; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
4543; GFX1132_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
4544; GFX1132_ITERATIVE-NEXT:    s_endpgm
4545;
; Expected gfx1200 wave64 output for sub_i32_varying with the Iterative optimizer
; strategy. Same algorithm as the gfx1100 wave64 expectation; visible differences are
; s_wait_alu / s_wait_kmcnt / s_wait_loadcnt in place of s_waitcnt, s_add_co_i32 for
; the accumulation, th:/scope: modifiers on the atomic, a single global_inv after it,
; and `null` as the buffer offset operand.
4546; GFX1264_ITERATIVE-LABEL: sub_i32_varying:
4547; GFX1264_ITERATIVE:       ; %bb.0: ; %entry
4548; GFX1264_ITERATIVE-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
4549; GFX1264_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
4550; GFX1264_ITERATIVE-NEXT:    s_mov_b32 s6, 0
4551; GFX1264_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
; Loop body: s7 = index of lowest set bit in s[0:1]; read that lane of v1 into s8,
; write the sum so far (s6) into that lane of v0, clear the bit, add s8 to s6.
4552; GFX1264_ITERATIVE-NEXT:  .LBB8_1: ; %ComputeLoop
4553; GFX1264_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
4554; GFX1264_ITERATIVE-NEXT:    s_ctz_i32_b64 s7, s[0:1]
4555; GFX1264_ITERATIVE-NEXT:    s_wait_alu 0xfffe
4556; GFX1264_ITERATIVE-NEXT:    v_readlane_b32 s8, v1, s7
4557; GFX1264_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s7
4558; GFX1264_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s7
4559; GFX1264_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
4560; GFX1264_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2)
4561; GFX1264_ITERATIVE-NEXT:    s_add_co_i32 s6, s6, s8
4562; GFX1264_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
4563; GFX1264_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB8_1
; After the loop: only lanes whose mbcnt result is 0 go on to %bb.3.
4564; GFX1264_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
4565; GFX1264_ITERATIVE-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
4566; GFX1264_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
4567; GFX1264_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4568; GFX1264_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
4569; GFX1264_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
4570; GFX1264_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
4571; GFX1264_ITERATIVE-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4572; GFX1264_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
4573; GFX1264_ITERATIVE-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
4574; GFX1264_ITERATIVE-NEXT:    s_cbranch_execz .LBB8_4
; Atomic block: operand is the wave total from s6; the atomic carries
; th:TH_ATOMIC_RETURN and scope:SCOPE_DEV.
4575; GFX1264_ITERATIVE-NEXT:  ; %bb.3:
4576; GFX1264_ITERATIVE-NEXT:    s_wait_alu 0xfffe
4577; GFX1264_ITERATIVE-NEXT:    v_mov_b32_e32 v1, s6
4578; GFX1264_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
4579; GFX1264_ITERATIVE-NEXT:    s_mov_b32 s10, -1
4580; GFX1264_ITERATIVE-NEXT:    s_wait_kmcnt 0x0
4581; GFX1264_ITERATIVE-NEXT:    s_mov_b32 s8, s2
4582; GFX1264_ITERATIVE-NEXT:    s_mov_b32 s9, s3
4583; GFX1264_ITERATIVE-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
4584; GFX1264_ITERATIVE-NEXT:    s_wait_loadcnt 0x0
4585; GFX1264_ITERATIVE-NEXT:    global_inv scope:SCOPE_DEV
; Rejoin: restore exec, take v1 from the first active lane, subtract each lane's
; partial sum (v0), store through s[0:3].
4586; GFX1264_ITERATIVE-NEXT:  .LBB8_4:
4587; GFX1264_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[4:5]
4588; GFX1264_ITERATIVE-NEXT:    s_wait_kmcnt 0x0
4589; GFX1264_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v1
4590; GFX1264_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
4591; GFX1264_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4592; GFX1264_ITERATIVE-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
4593; GFX1264_ITERATIVE-NEXT:    s_mov_b32 s2, -1
4594; GFX1264_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], null
4595; GFX1264_ITERATIVE-NEXT:    s_endpgm
4596;
; Expected gfx1200 wave32 output for sub_i32_varying with the Iterative optimizer
; strategy. Same algorithm as the gfx1200 wave64 expectation with a 32-bit lane mask
; in s0 and only v_mbcnt_lo. Note the loop here contains three s_wait_alu 0xfffe
; instructions (before s_ctz, before v_readlane, before s_cmp).
4597; GFX1232_ITERATIVE-LABEL: sub_i32_varying:
4598; GFX1232_ITERATIVE:       ; %bb.0: ; %entry
4599; GFX1232_ITERATIVE-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
4600; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s0, exec_lo
4601; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s6, 0
4602; GFX1232_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
; Loop body: s1 = index of lowest set bit in s0; read that lane of v1 into s2, write
; the sum so far (s6) into that lane of v0, clear the bit in s0, add s2 to s6.
4603; GFX1232_ITERATIVE-NEXT:  .LBB8_1: ; %ComputeLoop
4604; GFX1232_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
4605; GFX1232_ITERATIVE-NEXT:    s_wait_alu 0xfffe
4606; GFX1232_ITERATIVE-NEXT:    s_ctz_i32_b32 s1, s0
4607; GFX1232_ITERATIVE-NEXT:    s_wait_alu 0xfffe
4608; GFX1232_ITERATIVE-NEXT:    v_readlane_b32 s2, v1, s1
4609; GFX1232_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s1
4610; GFX1232_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s1
4611; GFX1232_ITERATIVE-NEXT:    s_and_not1_b32 s0, s0, s3
4612; GFX1232_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2)
4613; GFX1232_ITERATIVE-NEXT:    s_add_co_i32 s6, s6, s2
4614; GFX1232_ITERATIVE-NEXT:    s_wait_alu 0xfffe
4615; GFX1232_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
4616; GFX1232_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB8_1
; After the loop: only lanes whose mbcnt result is 0 go on to %bb.3.
4617; GFX1232_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
4618; GFX1232_ITERATIVE-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
4619; GFX1232_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
4620; GFX1232_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
4621; GFX1232_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
4622; GFX1232_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
4623; GFX1232_ITERATIVE-NEXT:    s_and_saveexec_b32 s4, vcc_lo
4624; GFX1232_ITERATIVE-NEXT:    s_xor_b32 s4, exec_lo, s4
4625; GFX1232_ITERATIVE-NEXT:    s_cbranch_execz .LBB8_4
; Atomic block: operand is the wave total from s6; the atomic carries
; th:TH_ATOMIC_RETURN and scope:SCOPE_DEV and is followed by global_inv.
4626; GFX1232_ITERATIVE-NEXT:  ; %bb.3:
4627; GFX1232_ITERATIVE-NEXT:    v_mov_b32_e32 v1, s6
4628; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
4629; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s10, -1
4630; GFX1232_ITERATIVE-NEXT:    s_wait_kmcnt 0x0
4631; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s8, s2
4632; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s9, s3
4633; GFX1232_ITERATIVE-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
4634; GFX1232_ITERATIVE-NEXT:    s_wait_loadcnt 0x0
4635; GFX1232_ITERATIVE-NEXT:    global_inv scope:SCOPE_DEV
; Rejoin: restore exec_lo, take v1 from the first active lane, subtract each lane's
; partial sum (v0), store through s[0:3].
4636; GFX1232_ITERATIVE-NEXT:  .LBB8_4:
4637; GFX1232_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s4
4638; GFX1232_ITERATIVE-NEXT:    s_wait_kmcnt 0x0
4639; GFX1232_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v1
4640; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
4641; GFX1232_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4642; GFX1232_ITERATIVE-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
4643; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s2, -1
4644; GFX1232_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], null
4645; GFX1232_ITERATIVE-NEXT:    s_endpgm
4646;
; Expected output for sub_i32_varying under the GFX7LESS DPP check prefix (its RUN
; line is outside this excerpt; presumably the default -mtriple=amdgcn target with
; the DPP strategy -- confirm). No cross-lane reduction appears in this expectation:
; the atomic is issued directly with each lane's own v0 and the value left in v0 is
; stored unchanged. s[8:11] (atomic) and s[4:7] (store) share the same upper two
; dwords, taken from s6/s7.
4647; GFX7LESS_DPP-LABEL: sub_i32_varying:
4648; GFX7LESS_DPP:       ; %bb.0: ; %entry
4649; GFX7LESS_DPP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
4650; GFX7LESS_DPP-NEXT:    s_mov_b32 s7, 0xf000
4651; GFX7LESS_DPP-NEXT:    s_mov_b32 s6, -1
4652; GFX7LESS_DPP-NEXT:    s_mov_b32 s10, s6
4653; GFX7LESS_DPP-NEXT:    s_mov_b32 s11, s7
4654; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
4655; GFX7LESS_DPP-NEXT:    s_mov_b32 s8, s2
4656; GFX7LESS_DPP-NEXT:    s_mov_b32 s9, s3
4657; GFX7LESS_DPP-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
4658; GFX7LESS_DPP-NEXT:    s_waitcnt vmcnt(0)
4659; GFX7LESS_DPP-NEXT:    buffer_wbinvl1
4660; GFX7LESS_DPP-NEXT:    s_mov_b32 s4, s0
4661; GFX7LESS_DPP-NEXT:    s_mov_b32 s5, s1
4662; GFX7LESS_DPP-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4663; GFX7LESS_DPP-NEXT:    s_endpgm
4664;
; Expected output for sub_i32_varying under the GFX8 DPP check prefix (its RUN line is
; outside this excerpt; presumably -mcpu=tonga with the DPP strategy -- confirm).
; Instead of a scalar loop, the running sum is built with DPP adds while exec is
; forced to all ones; lane 63 supplies the wave total (s6) and a one-lane wave shift
; into v1 (pre-zeroed) gives each lane its predecessor's running sum.
4665; GFX8_DPP-LABEL: sub_i32_varying:
4666; GFX8_DPP:       ; %bb.0: ; %entry
4667; GFX8_DPP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4668; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
4669; GFX8_DPP-NEXT:    v_mov_b32_e32 v1, 0
4670; GFX8_DPP-NEXT:    s_mov_b64 exec, s[4:5]
4671; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
4672; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; All-lanes section: v2 = v0 where the saved exec bit (s[4:5]) is set, else 0; then
; DPP adds with row_shr 1/2/4/8, row_bcast:15 and row_bcast:31 accumulate across the
; wave.
4673; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
4674; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[4:5]
4675; GFX8_DPP-NEXT:    s_nop 1
4676; GFX8_DPP-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4677; GFX8_DPP-NEXT:    s_nop 1
4678; GFX8_DPP-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4679; GFX8_DPP-NEXT:    s_nop 1
4680; GFX8_DPP-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4681; GFX8_DPP-NEXT:    s_nop 1
4682; GFX8_DPP-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4683; GFX8_DPP-NEXT:    s_nop 1
4684; GFX8_DPP-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4685; GFX8_DPP-NEXT:    s_nop 1
4686; GFX8_DPP-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4687; GFX8_DPP-NEXT:    v_readlane_b32 s6, v2, 63
4688; GFX8_DPP-NEXT:    s_nop 0
4689; GFX8_DPP-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4690; GFX8_DPP-NEXT:    s_mov_b64 exec, s[4:5]
; Original exec restored: only lanes whose mbcnt result (v3) is 0 perform the atomic.
4691; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
4692; GFX8_DPP-NEXT:    ; implicit-def: $vgpr0
4693; GFX8_DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4694; GFX8_DPP-NEXT:    s_cbranch_execz .LBB8_2
4695; GFX8_DPP-NEXT:  ; %bb.1:
4696; GFX8_DPP-NEXT:    s_mov_b32 s11, 0xf000
4697; GFX8_DPP-NEXT:    s_mov_b32 s10, -1
4698; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
4699; GFX8_DPP-NEXT:    s_mov_b32 s8, s2
4700; GFX8_DPP-NEXT:    s_mov_b32 s9, s3
4701; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, s6
4702; GFX8_DPP-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
4703; GFX8_DPP-NEXT:    s_waitcnt vmcnt(0)
4704; GFX8_DPP-NEXT:    buffer_wbinvl1_vol
; Rejoin: take v0 from the first active lane, copy the shifted running sum from v1,
; subtract, store through s[0:3].
4705; GFX8_DPP-NEXT:  .LBB8_2:
4706; GFX8_DPP-NEXT:    s_or_b64 exec, exec, s[4:5]
4707; GFX8_DPP-NEXT:    v_readfirstlane_b32 s4, v0
4708; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, v1
4709; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
4710; GFX8_DPP-NEXT:    s_mov_b32 s3, 0xf000
4711; GFX8_DPP-NEXT:    s_mov_b32 s2, -1
4712; GFX8_DPP-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
4713; GFX8_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4714; GFX8_DPP-NEXT:    s_endpgm
4715;
; Expected output for sub_i32_varying under the GFX9 DPP check prefix (its RUN line is
; outside this excerpt; presumably -mcpu=gfx900 with the DPP strategy -- confirm).
; Same sequence as the GFX8 DPP expectation, except the DPP adds and the final
; subtract have no vcc operand.
4716; GFX9_DPP-LABEL: sub_i32_varying:
4717; GFX9_DPP:       ; %bb.0: ; %entry
4718; GFX9_DPP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4719; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
4720; GFX9_DPP-NEXT:    v_mov_b32_e32 v1, 0
4721; GFX9_DPP-NEXT:    s_mov_b64 exec, s[4:5]
4722; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
4723; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; All-lanes section: v2 = v0 where the saved exec bit (s[4:5]) is set, else 0; DPP
; adds accumulate across the wave; lane 63 gives the total (s6); wave_shr:1 moves
; each lane's predecessor value into v1.
4724; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
4725; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[4:5]
4726; GFX9_DPP-NEXT:    s_nop 1
4727; GFX9_DPP-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4728; GFX9_DPP-NEXT:    s_nop 1
4729; GFX9_DPP-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4730; GFX9_DPP-NEXT:    s_nop 1
4731; GFX9_DPP-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4732; GFX9_DPP-NEXT:    s_nop 1
4733; GFX9_DPP-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4734; GFX9_DPP-NEXT:    s_nop 1
4735; GFX9_DPP-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
4736; GFX9_DPP-NEXT:    s_nop 1
4737; GFX9_DPP-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
4738; GFX9_DPP-NEXT:    v_readlane_b32 s6, v2, 63
4739; GFX9_DPP-NEXT:    s_nop 0
4740; GFX9_DPP-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
4741; GFX9_DPP-NEXT:    s_mov_b64 exec, s[4:5]
; Original exec restored: only lanes whose mbcnt result (v3) is 0 perform the atomic.
4742; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
4743; GFX9_DPP-NEXT:    ; implicit-def: $vgpr0
4744; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4745; GFX9_DPP-NEXT:    s_cbranch_execz .LBB8_2
4746; GFX9_DPP-NEXT:  ; %bb.1:
4747; GFX9_DPP-NEXT:    s_mov_b32 s11, 0xf000
4748; GFX9_DPP-NEXT:    s_mov_b32 s10, -1
4749; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
4750; GFX9_DPP-NEXT:    s_mov_b32 s8, s2
4751; GFX9_DPP-NEXT:    s_mov_b32 s9, s3
4752; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s6
4753; GFX9_DPP-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
4754; GFX9_DPP-NEXT:    s_waitcnt vmcnt(0)
4755; GFX9_DPP-NEXT:    buffer_wbinvl1_vol
; Rejoin: take v0 from the first active lane, copy the shifted running sum from v1,
; subtract, store through s[0:3].
4756; GFX9_DPP-NEXT:  .LBB8_2:
4757; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[4:5]
4758; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v0
4759; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, v1
4760; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
4761; GFX9_DPP-NEXT:    s_mov_b32 s3, 0xf000
4762; GFX9_DPP-NEXT:    s_mov_b32 s2, -1
4763; GFX9_DPP-NEXT:    v_sub_u32_e32 v0, s4, v0
4764; GFX9_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4765; GFX9_DPP-NEXT:    s_endpgm
4766;
; Expected output for sub_i32_varying under the GFX1064 DPP check prefix (its RUN line
; is outside this excerpt; presumably gfx1010 wave64 with the DPP strategy --
; confirm). There is no row_bcast/wave_shr here: rows are combined with
; v_permlanex16 and a readlane of lane 31, and the shifted result in v3 is patched at
; lanes 16, 32 and 48 with values read from lanes 15, 31 and 47. The wave total is
; read from lane 63 (s9, copied to s4).
4767; GFX1064_DPP-LABEL: sub_i32_varying:
4768; GFX1064_DPP:       ; %bb.0: ; %entry
; All-lanes section: v1 = v0 where the saved exec bit (s[0:1]) is set, else 0, then
; accumulated within rows (row_shr 1/2/4/8) and across rows.
4769; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
4770; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
4771; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, 0
4772; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4773; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4774; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4775; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4776; GFX1064_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
4777; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4778; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 31
4779; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, s2
4780; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4781; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v1, 15
4782; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4783; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
; The kernel-argument load and the mbcnt pair are interleaved with short all-lanes
; windows that carry out the readlane/writelane patching of v3.
4784; GFX1064_DPP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4785; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
4786; GFX1064_DPP-NEXT:    v_readlane_b32 s7, v1, 31
4787; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s6, 16
4788; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[4:5]
4789; GFX1064_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4790; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
4791; GFX1064_DPP-NEXT:    v_readlane_b32 s8, v1, 47
4792; GFX1064_DPP-NEXT:    v_readlane_b32 s9, v1, 63
4793; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s7, 32
4794; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[4:5]
4795; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4796; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
4797; GFX1064_DPP-NEXT:    s_mov_b32 s4, s9
4798; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s8, 48
4799; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[6:7]
; Only lanes whose mbcnt result is 0 perform the atomic; s6 = -1 is set up before the
; branch and is used by both the atomic descriptor s[4:7] and the final store.
4800; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4801; GFX1064_DPP-NEXT:    s_mov_b32 s6, -1
4802; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr0
4803; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[8:9], vcc
4804; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB8_2
4805; GFX1064_DPP-NEXT:  ; %bb.1:
4806; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, s4
4807; GFX1064_DPP-NEXT:    s_mov_b32 s7, 0x31016000
4808; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
4809; GFX1064_DPP-NEXT:    s_mov_b32 s4, s2
4810; GFX1064_DPP-NEXT:    s_mov_b32 s5, s3
4811; GFX1064_DPP-NEXT:    buffer_atomic_sub v0, off, s[4:7], 0 glc
4812; GFX1064_DPP-NEXT:    s_waitcnt vmcnt(0)
4813; GFX1064_DPP-NEXT:    buffer_gl1_inv
4814; GFX1064_DPP-NEXT:    buffer_gl0_inv
; Rejoin: take v0 from the first active lane, subtract each lane's v3 value, store
; through s[0:3] (s2 copied from s6).
4815; GFX1064_DPP-NEXT:  .LBB8_2:
4816; GFX1064_DPP-NEXT:    s_waitcnt_depctr 0xffe3
4817; GFX1064_DPP-NEXT:    s_or_b64 exec, exec, s[8:9]
4818; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
4819; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s2, v0
4820; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, v3
4821; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
4822; GFX1064_DPP-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
4823; GFX1064_DPP-NEXT:    s_mov_b32 s2, s6
4824; GFX1064_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4825; GFX1064_DPP-NEXT:    s_endpgm
4826;
; Expected output for sub_i32_varying under the GFX1032 DPP check prefix (its RUN line
; is outside this excerpt; presumably gfx1010 wave32 with the DPP strategy --
; confirm). Two rows only: after the in-row DPP adds, v_permlanex16 plus one masked
; add combines them; lane 31 supplies the wave total (s6, copied to s4) and lane 15's
; value (s5) is written into lane 16 of the shifted result v3.
4827; GFX1032_DPP-LABEL: sub_i32_varying:
4828; GFX1032_DPP:       ; %bb.0: ; %entry
; All-lanes section: v1 = v0 where the saved exec bit (s0) is set, else 0.
4829; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
4830; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
4831; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4832; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4833; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4834; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4835; GFX1032_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
4836; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
; The kernel-argument load is issued between two all-lanes windows.
4837; GFX1032_DPP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4838; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s4, -1
4839; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4840; GFX1032_DPP-NEXT:    v_mov_b32_e32 v3, 0
4841; GFX1032_DPP-NEXT:    v_readlane_b32 s6, v1, 31
4842; GFX1032_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4843; GFX1032_DPP-NEXT:    v_readlane_b32 s5, v1, 15
4844; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s4
4845; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4846; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s4, -1
4847; GFX1032_DPP-NEXT:    v_writelane_b32 v3, s5, 16
4848; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s4
; s4 takes the wave total before s6 is reused as -1 for the buffer descriptors; only
; lanes whose mbcnt result is 0 perform the atomic.
4849; GFX1032_DPP-NEXT:    s_mov_b32 s4, s6
4850; GFX1032_DPP-NEXT:    s_mov_b32 s6, -1
4851; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4852; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr0
4853; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s8, vcc_lo
4854; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB8_2
4855; GFX1032_DPP-NEXT:  ; %bb.1:
4856; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, s4
4857; GFX1032_DPP-NEXT:    s_mov_b32 s7, 0x31016000
4858; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
4859; GFX1032_DPP-NEXT:    s_mov_b32 s4, s2
4860; GFX1032_DPP-NEXT:    s_mov_b32 s5, s3
4861; GFX1032_DPP-NEXT:    buffer_atomic_sub v0, off, s[4:7], 0 glc
4862; GFX1032_DPP-NEXT:    s_waitcnt vmcnt(0)
4863; GFX1032_DPP-NEXT:    buffer_gl1_inv
4864; GFX1032_DPP-NEXT:    buffer_gl0_inv
; Rejoin: take v0 from the first active lane, subtract each lane's v3 value, store
; through s[0:3] (s2 copied from s6).
4865; GFX1032_DPP-NEXT:  .LBB8_2:
4866; GFX1032_DPP-NEXT:    s_waitcnt_depctr 0xffe3
4867; GFX1032_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s8
4868; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
4869; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s2, v0
4870; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, v3
4871; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
4872; GFX1032_DPP-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
4873; GFX1032_DPP-NEXT:    s_mov_b32 s2, s6
4874; GFX1032_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4875; GFX1032_DPP-NEXT:    s_endpgm
4876;
4877; GFX1164_DPP-LABEL: sub_i32_varying:
4878; GFX1164_DPP:       ; %bb.0: ; %entry
4879; GFX1164_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
4880; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
4881; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
4882; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
4883; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, 0
4884; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
4885; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4886; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4887; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4888; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4889; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4890; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4891; GFX1164_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
4892; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4893; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4894; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v1, 31
4895; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, s2
4896; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4897; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4898; GFX1164_DPP-NEXT:    v_readlane_b32 s6, v1, 15
4899; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4900; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
4901; GFX1164_DPP-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
4902; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
4903; GFX1164_DPP-NEXT:    v_readlane_b32 s7, v1, 31
4904; GFX1164_DPP-NEXT:    v_writelane_b32 v3, s6, 16
4905; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[4:5]
4906; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
4907; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4908; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
4909; GFX1164_DPP-NEXT:    v_readlane_b32 s8, v1, 47
4910; GFX1164_DPP-NEXT:    v_readlane_b32 s9, v1, 63
4911; GFX1164_DPP-NEXT:    v_writelane_b32 v3, s7, 32
4912; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[4:5]
4913; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
4914; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4915; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
4916; GFX1164_DPP-NEXT:    s_mov_b32 s4, s9
4917; GFX1164_DPP-NEXT:    v_writelane_b32 v3, s8, 48
4918; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[6:7]
4919; GFX1164_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4920; GFX1164_DPP-NEXT:    s_mov_b32 s6, -1
4921; GFX1164_DPP-NEXT:    ; implicit-def: $vgpr0
4922; GFX1164_DPP-NEXT:    s_and_saveexec_b64 s[8:9], vcc
4923; GFX1164_DPP-NEXT:    s_cbranch_execz .LBB8_2
4924; GFX1164_DPP-NEXT:  ; %bb.1:
4925; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, s4
4926; GFX1164_DPP-NEXT:    s_mov_b32 s7, 0x31016000
4927; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
4928; GFX1164_DPP-NEXT:    s_mov_b32 s4, s2
4929; GFX1164_DPP-NEXT:    s_mov_b32 s5, s3
4930; GFX1164_DPP-NEXT:    buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc
4931; GFX1164_DPP-NEXT:    s_waitcnt vmcnt(0)
4932; GFX1164_DPP-NEXT:    buffer_gl1_inv
4933; GFX1164_DPP-NEXT:    buffer_gl0_inv
4934; GFX1164_DPP-NEXT:  .LBB8_2:
4935; GFX1164_DPP-NEXT:    s_or_b64 exec, exec, s[8:9]
4936; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
4937; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s2, v0
4938; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, v3
4939; GFX1164_DPP-NEXT:    s_mov_b32 s3, 0x31016000
4940; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4941; GFX1164_DPP-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
4942; GFX1164_DPP-NEXT:    s_mov_b32 s2, s6
4943; GFX1164_DPP-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
4944; GFX1164_DPP-NEXT:    s_endpgm
4945;
4946; GFX1132_DPP-LABEL: sub_i32_varying:
4947; GFX1132_DPP:       ; %bb.0: ; %entry
4948; GFX1132_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
4949; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
4950; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
4951; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
4952; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4953; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4954; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4955; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4956; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4957; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4958; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
4959; GFX1132_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
4960; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
4961; GFX1132_DPP-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
4962; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s4, -1
4963; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4964; GFX1132_DPP-NEXT:    v_mov_b32_e32 v3, 0
4965; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
4966; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v1, 31
4967; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4968; GFX1132_DPP-NEXT:    v_readlane_b32 s5, v1, 15
4969; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s4
4970; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
4971; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4972; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s4, -1
4973; GFX1132_DPP-NEXT:    v_writelane_b32 v3, s5, 16
4974; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s4
4975; GFX1132_DPP-NEXT:    s_mov_b32 s4, s6
4976; GFX1132_DPP-NEXT:    s_mov_b32 s6, -1
4977; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4978; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr0
4979; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s8, vcc_lo
4980; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB8_2
4981; GFX1132_DPP-NEXT:  ; %bb.1:
4982; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, s4
4983; GFX1132_DPP-NEXT:    s_mov_b32 s7, 0x31016000
4984; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
4985; GFX1132_DPP-NEXT:    s_mov_b32 s4, s2
4986; GFX1132_DPP-NEXT:    s_mov_b32 s5, s3
4987; GFX1132_DPP-NEXT:    buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc
4988; GFX1132_DPP-NEXT:    s_waitcnt vmcnt(0)
4989; GFX1132_DPP-NEXT:    buffer_gl1_inv
4990; GFX1132_DPP-NEXT:    buffer_gl0_inv
4991; GFX1132_DPP-NEXT:  .LBB8_2:
4992; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s8
4993; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
4994; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s2, v0
4995; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, v3
4996; GFX1132_DPP-NEXT:    s_mov_b32 s3, 0x31016000
4997; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4998; GFX1132_DPP-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
4999; GFX1132_DPP-NEXT:    s_mov_b32 s2, s6
5000; GFX1132_DPP-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
5001; GFX1132_DPP-NEXT:    s_endpgm
5002;
5003; GFX1264_DPP-LABEL: sub_i32_varying:
5004; GFX1264_DPP:       ; %bb.0: ; %entry
5005; GFX1264_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
5006; GFX1264_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
5007; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
5008; GFX1264_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
5009; GFX1264_DPP-NEXT:    v_mov_b32_e32 v3, 0
5010; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
5011; GFX1264_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
5012; GFX1264_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
5013; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5014; GFX1264_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
5015; GFX1264_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
5016; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5017; GFX1264_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
5018; GFX1264_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
5019; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5020; GFX1264_DPP-NEXT:    v_readlane_b32 s2, v1, 31
5021; GFX1264_DPP-NEXT:    v_mov_b32_e32 v2, s2
5022; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5023; GFX1264_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
5024; GFX1264_DPP-NEXT:    v_readlane_b32 s6, v1, 15
5025; GFX1264_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5026; GFX1264_DPP-NEXT:    s_mov_b64 exec, s[0:1]
5027; GFX1264_DPP-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
5028; GFX1264_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
5029; GFX1264_DPP-NEXT:    v_readlane_b32 s7, v1, 31
5030; GFX1264_DPP-NEXT:    v_writelane_b32 v3, s6, 16
5031; GFX1264_DPP-NEXT:    s_mov_b64 exec, s[4:5]
5032; GFX1264_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
5033; GFX1264_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5034; GFX1264_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
5035; GFX1264_DPP-NEXT:    v_readlane_b32 s8, v1, 47
5036; GFX1264_DPP-NEXT:    v_readlane_b32 s9, v1, 63
5037; GFX1264_DPP-NEXT:    v_writelane_b32 v3, s7, 32
5038; GFX1264_DPP-NEXT:    s_mov_b64 exec, s[4:5]
5039; GFX1264_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
5040; GFX1264_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5041; GFX1264_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
5042; GFX1264_DPP-NEXT:    s_mov_b32 s4, s9
5043; GFX1264_DPP-NEXT:    v_writelane_b32 v3, s8, 48
5044; GFX1264_DPP-NEXT:    s_wait_alu 0xfffe
5045; GFX1264_DPP-NEXT:    s_mov_b64 exec, s[6:7]
5046; GFX1264_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5047; GFX1264_DPP-NEXT:    s_mov_b32 s6, -1
5048; GFX1264_DPP-NEXT:    ; implicit-def: $vgpr0
5049; GFX1264_DPP-NEXT:    s_and_saveexec_b64 s[8:9], vcc
5050; GFX1264_DPP-NEXT:    s_cbranch_execz .LBB8_2
5051; GFX1264_DPP-NEXT:  ; %bb.1:
5052; GFX1264_DPP-NEXT:    v_mov_b32_e32 v0, s4
5053; GFX1264_DPP-NEXT:    s_mov_b32 s7, 0x31016000
5054; GFX1264_DPP-NEXT:    s_wait_kmcnt 0x0
5055; GFX1264_DPP-NEXT:    s_mov_b32 s4, s2
5056; GFX1264_DPP-NEXT:    s_mov_b32 s5, s3
5057; GFX1264_DPP-NEXT:    buffer_atomic_sub_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
5058; GFX1264_DPP-NEXT:    s_wait_loadcnt 0x0
5059; GFX1264_DPP-NEXT:    global_inv scope:SCOPE_DEV
5060; GFX1264_DPP-NEXT:  .LBB8_2:
5061; GFX1264_DPP-NEXT:    s_wait_alu 0xfffe
5062; GFX1264_DPP-NEXT:    s_or_b64 exec, exec, s[8:9]
5063; GFX1264_DPP-NEXT:    s_wait_kmcnt 0x0
5064; GFX1264_DPP-NEXT:    v_readfirstlane_b32 s2, v0
5065; GFX1264_DPP-NEXT:    v_mov_b32_e32 v0, v3
5066; GFX1264_DPP-NEXT:    s_mov_b32 s3, 0x31016000
5067; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5068; GFX1264_DPP-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
5069; GFX1264_DPP-NEXT:    s_mov_b32 s2, s6
5070; GFX1264_DPP-NEXT:    buffer_store_b32 v0, off, s[0:3], null
5071; GFX1264_DPP-NEXT:    s_endpgm
5072;
5073; GFX1232_DPP-LABEL: sub_i32_varying:
5074; GFX1232_DPP:       ; %bb.0: ; %entry
5075; GFX1232_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
5076; GFX1232_DPP-NEXT:    s_or_saveexec_b32 s0, -1
5077; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
5078; GFX1232_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
5079; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5080; GFX1232_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
5081; GFX1232_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
5082; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5083; GFX1232_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
5084; GFX1232_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
5085; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
5086; GFX1232_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
5087; GFX1232_DPP-NEXT:    s_mov_b32 exec_lo, s0
5088; GFX1232_DPP-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
5089; GFX1232_DPP-NEXT:    s_or_saveexec_b32 s4, -1
5090; GFX1232_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
5091; GFX1232_DPP-NEXT:    v_mov_b32_e32 v3, 0
5092; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
5093; GFX1232_DPP-NEXT:    v_readlane_b32 s6, v1, 31
5094; GFX1232_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5095; GFX1232_DPP-NEXT:    v_readlane_b32 s5, v1, 15
5096; GFX1232_DPP-NEXT:    s_mov_b32 exec_lo, s4
5097; GFX1232_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
5098; GFX1232_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5099; GFX1232_DPP-NEXT:    s_or_saveexec_b32 s4, -1
5100; GFX1232_DPP-NEXT:    v_writelane_b32 v3, s5, 16
5101; GFX1232_DPP-NEXT:    s_wait_alu 0xfffe
5102; GFX1232_DPP-NEXT:    s_mov_b32 exec_lo, s4
5103; GFX1232_DPP-NEXT:    s_mov_b32 s4, s6
5104; GFX1232_DPP-NEXT:    s_mov_b32 s6, -1
5105; GFX1232_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
5106; GFX1232_DPP-NEXT:    ; implicit-def: $vgpr0
5107; GFX1232_DPP-NEXT:    s_and_saveexec_b32 s8, vcc_lo
5108; GFX1232_DPP-NEXT:    s_cbranch_execz .LBB8_2
5109; GFX1232_DPP-NEXT:  ; %bb.1:
5110; GFX1232_DPP-NEXT:    s_wait_alu 0xfffe
5111; GFX1232_DPP-NEXT:    v_mov_b32_e32 v0, s4
5112; GFX1232_DPP-NEXT:    s_mov_b32 s7, 0x31016000
5113; GFX1232_DPP-NEXT:    s_wait_kmcnt 0x0
5114; GFX1232_DPP-NEXT:    s_mov_b32 s4, s2
5115; GFX1232_DPP-NEXT:    s_mov_b32 s5, s3
5116; GFX1232_DPP-NEXT:    buffer_atomic_sub_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
5117; GFX1232_DPP-NEXT:    s_wait_loadcnt 0x0
5118; GFX1232_DPP-NEXT:    global_inv scope:SCOPE_DEV
5119; GFX1232_DPP-NEXT:  .LBB8_2:
5120; GFX1232_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s8
5121; GFX1232_DPP-NEXT:    s_wait_kmcnt 0x0
5122; GFX1232_DPP-NEXT:    v_readfirstlane_b32 s2, v0
5123; GFX1232_DPP-NEXT:    v_mov_b32_e32 v0, v3
5124; GFX1232_DPP-NEXT:    s_mov_b32 s3, 0x31016000
5125; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5126; GFX1232_DPP-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
5127; GFX1232_DPP-NEXT:    s_mov_b32 s2, s6
5128; GFX1232_DPP-NEXT:    buffer_store_b32 v0, off, s[0:3], null
5129; GFX1232_DPP-NEXT:    s_endpgm
5130entry:
5131  %lane = call i32 @llvm.amdgcn.workitem.id.x()
5132  %old = atomicrmw sub ptr addrspace(1) %inout, i32 %lane syncscope("agent") acq_rel
5133  store i32 %old, ptr addrspace(1) %out
5134  ret void
5135}
5136
; sub_i64_constant: kernel that performs an agent-scope, acq_rel
; `atomicrmw sub` of the constant i64 5 on the addrspace(1) pointer %inout and
; stores the value returned by the atomic to %out.
;
; The expected code for every check prefix below has the same shape:
;   * v_mbcnt_lo (and v_mbcnt_hi where exec is 64 bits wide) is applied to a
;     copy of exec and the result is compared against 0, so only lanes whose
;     count is zero fall through to %bb.1;
;   * %bb.1 issues a single 64-bit buffer atomic sub whose operand is
;     s_bcnt1(exec copy) * 5, built with s_mul_i32 and a zero high dword, or
;     with s_mul_u64 on the gfx1200 runs;
;   * after exec is restored, v_readfirstlane_b32 moves the atomic's result
;     into SGPRs and each lane subtracts 5 * (its mbcnt value) from it before
;     the final 64-bit store.
; These assertions are autogenerated (see the first line of this file);
; regenerate them with the update script instead of editing them by hand.
5137define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace(1) %inout) {
5138; GFX7LESS-LABEL: sub_i64_constant:
5139; GFX7LESS:       ; %bb.0: ; %entry
5140; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
5141; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
5142; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
5143; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s7, v0
5144; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
5145; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
5146; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5147; GFX7LESS-NEXT:    s_cbranch_execz .LBB9_2
5148; GFX7LESS-NEXT:  ; %bb.1:
5149; GFX7LESS-NEXT:    s_mov_b32 s11, 0xf000
5150; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
5151; GFX7LESS-NEXT:    s_mul_i32 s6, s6, 5
5152; GFX7LESS-NEXT:    s_mov_b32 s10, -1
5153; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5154; GFX7LESS-NEXT:    s_mov_b32 s8, s2
5155; GFX7LESS-NEXT:    s_mov_b32 s9, s3
5156; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
5157; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
5158; GFX7LESS-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
5159; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
5160; GFX7LESS-NEXT:    buffer_wbinvl1
5161; GFX7LESS-NEXT:  .LBB9_2:
5162; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
5163; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5164; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
5165; GFX7LESS-NEXT:    s_mov_b32 s2, -1
5166; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
5167; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v0
5168; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
5169; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
5170; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
5171; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
5172; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s5, v0
5173; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
5174; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5175; GFX7LESS-NEXT:    s_endpgm
5176;
5177; GFX8-LABEL: sub_i64_constant:
5178; GFX8:       ; %bb.0: ; %entry
5179; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5180; GFX8-NEXT:    s_mov_b64 s[6:7], exec
5181; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
5182; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
5183; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
5184; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
5185; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5186; GFX8-NEXT:    s_cbranch_execz .LBB9_2
5187; GFX8-NEXT:  ; %bb.1:
5188; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5189; GFX8-NEXT:    s_mov_b32 s8, s2
5190; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
5191; GFX8-NEXT:    s_mul_i32 s2, s2, 5
5192; GFX8-NEXT:    s_mov_b32 s11, 0xf000
5193; GFX8-NEXT:    s_mov_b32 s10, -1
5194; GFX8-NEXT:    s_mov_b32 s9, s3
5195; GFX8-NEXT:    v_mov_b32_e32 v0, s2
5196; GFX8-NEXT:    v_mov_b32_e32 v1, 0
5197; GFX8-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
5198; GFX8-NEXT:    s_waitcnt vmcnt(0)
5199; GFX8-NEXT:    buffer_wbinvl1_vol
5200; GFX8-NEXT:  .LBB9_2:
5201; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
5202; GFX8-NEXT:    v_readfirstlane_b32 s4, v1
5203; GFX8-NEXT:    v_readfirstlane_b32 s5, v0
5204; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
5205; GFX8-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
5206; GFX8-NEXT:    v_mov_b32_e32 v2, s4
5207; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s5, v0
5208; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5209; GFX8-NEXT:    s_mov_b32 s3, 0xf000
5210; GFX8-NEXT:    s_mov_b32 s2, -1
5211; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
5212; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5213; GFX8-NEXT:    s_endpgm
5214;
5215; GFX9-LABEL: sub_i64_constant:
5216; GFX9:       ; %bb.0: ; %entry
5217; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5218; GFX9-NEXT:    s_mov_b64 s[6:7], exec
5219; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
5220; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
5221; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
5222; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
5223; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5224; GFX9-NEXT:    s_cbranch_execz .LBB9_2
5225; GFX9-NEXT:  ; %bb.1:
5226; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5227; GFX9-NEXT:    s_mov_b32 s8, s2
5228; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
5229; GFX9-NEXT:    s_mul_i32 s2, s2, 5
5230; GFX9-NEXT:    s_mov_b32 s11, 0xf000
5231; GFX9-NEXT:    s_mov_b32 s10, -1
5232; GFX9-NEXT:    s_mov_b32 s9, s3
5233; GFX9-NEXT:    v_mov_b32_e32 v0, s2
5234; GFX9-NEXT:    v_mov_b32_e32 v1, 0
5235; GFX9-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
5236; GFX9-NEXT:    s_waitcnt vmcnt(0)
5237; GFX9-NEXT:    buffer_wbinvl1_vol
5238; GFX9-NEXT:  .LBB9_2:
5239; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
5240; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
5241; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
5242; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
5243; GFX9-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
5244; GFX9-NEXT:    v_mov_b32_e32 v2, s4
5245; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s5, v0
5246; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5247; GFX9-NEXT:    s_mov_b32 s3, 0xf000
5248; GFX9-NEXT:    s_mov_b32 s2, -1
5249; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
5250; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5251; GFX9-NEXT:    s_endpgm
5252;
5253; GFX1064-LABEL: sub_i64_constant:
5254; GFX1064:       ; %bb.0: ; %entry
5255; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5256; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
5257; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
5258; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
5259; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
5260; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
5261; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5262; GFX1064-NEXT:    s_cbranch_execz .LBB9_2
5263; GFX1064-NEXT:  ; %bb.1:
5264; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
5265; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
5266; GFX1064-NEXT:    s_mul_i32 s6, s6, 5
5267; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
5268; GFX1064-NEXT:    v_mov_b32_e32 v0, s6
5269; GFX1064-NEXT:    s_mov_b32 s10, -1
5270; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5271; GFX1064-NEXT:    s_mov_b32 s8, s2
5272; GFX1064-NEXT:    s_mov_b32 s9, s3
5273; GFX1064-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
5274; GFX1064-NEXT:    s_waitcnt vmcnt(0)
5275; GFX1064-NEXT:    buffer_gl1_inv
5276; GFX1064-NEXT:    buffer_gl0_inv
5277; GFX1064-NEXT:  .LBB9_2:
5278; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
5279; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
5280; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5281; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
5282; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
5283; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
5284; GFX1064-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
5285; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v0
5286; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
5287; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
5288; GFX1064-NEXT:    s_mov_b32 s2, -1
5289; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5290; GFX1064-NEXT:    s_endpgm
5291;
5292; GFX1032-LABEL: sub_i64_constant:
5293; GFX1032:       ; %bb.0: ; %entry
5294; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5295; GFX1032-NEXT:    s_mov_b32 s6, exec_lo
5296; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
5297; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s6, 0
5298; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
5299; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
5300; GFX1032-NEXT:    s_cbranch_execz .LBB9_2
5301; GFX1032-NEXT:  ; %bb.1:
5302; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s6
5303; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
5304; GFX1032-NEXT:    s_mul_i32 s5, s5, 5
5305; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
5306; GFX1032-NEXT:    v_mov_b32_e32 v0, s5
5307; GFX1032-NEXT:    s_mov_b32 s10, -1
5308; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5309; GFX1032-NEXT:    s_mov_b32 s8, s2
5310; GFX1032-NEXT:    s_mov_b32 s9, s3
5311; GFX1032-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
5312; GFX1032-NEXT:    s_waitcnt vmcnt(0)
5313; GFX1032-NEXT:    buffer_gl1_inv
5314; GFX1032-NEXT:    buffer_gl0_inv
5315; GFX1032-NEXT:  .LBB9_2:
5316; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
5317; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
5318; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5319; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
5320; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
5321; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
5322; GFX1032-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
5323; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v0
5324; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
5325; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
5326; GFX1032-NEXT:    s_mov_b32 s2, -1
5327; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5328; GFX1032-NEXT:    s_endpgm
5329;
5330; GFX1164-LABEL: sub_i64_constant:
5331; GFX1164:       ; %bb.0: ; %entry
5332; GFX1164-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
5333; GFX1164-NEXT:    s_mov_b64 s[6:7], exec
5334; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
5335; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
5336; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5337; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
5338; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
5339; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
5340; GFX1164-NEXT:    s_cbranch_execz .LBB9_2
5341; GFX1164-NEXT:  ; %bb.1:
5342; GFX1164-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
5343; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
5344; GFX1164-NEXT:    s_mul_i32 s6, s6, 5
5345; GFX1164-NEXT:    s_mov_b32 s11, 0x31016000
5346; GFX1164-NEXT:    v_mov_b32_e32 v0, s6
5347; GFX1164-NEXT:    s_mov_b32 s10, -1
5348; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5349; GFX1164-NEXT:    s_mov_b32 s8, s2
5350; GFX1164-NEXT:    s_mov_b32 s9, s3
5351; GFX1164-NEXT:    buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc
5352; GFX1164-NEXT:    s_waitcnt vmcnt(0)
5353; GFX1164-NEXT:    buffer_gl1_inv
5354; GFX1164-NEXT:    buffer_gl0_inv
5355; GFX1164-NEXT:  .LBB9_2:
5356; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
5357; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5358; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
5359; GFX1164-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
5360; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
5361; GFX1164-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
5362; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
5363; GFX1164-NEXT:    v_sub_co_u32 v0, vcc, s2, v0
5364; GFX1164-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
5365; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
5366; GFX1164-NEXT:    s_mov_b32 s2, -1
5367; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
5368; GFX1164-NEXT:    s_endpgm
5369;
5370; GFX1132-LABEL: sub_i64_constant:
5371; GFX1132:       ; %bb.0: ; %entry
5372; GFX1132-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
5373; GFX1132-NEXT:    s_mov_b32 s6, exec_lo
5374; GFX1132-NEXT:    s_mov_b32 s4, exec_lo
5375; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, s6, 0
5376; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
5377; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5378; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
5379; GFX1132-NEXT:    s_cbranch_execz .LBB9_2
5380; GFX1132-NEXT:  ; %bb.1:
5381; GFX1132-NEXT:    s_bcnt1_i32_b32 s5, s6
5382; GFX1132-NEXT:    s_mov_b32 s11, 0x31016000
5383; GFX1132-NEXT:    s_mul_i32 s5, s5, 5
5384; GFX1132-NEXT:    s_mov_b32 s10, -1
5385; GFX1132-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, 0
5386; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
5387; GFX1132-NEXT:    s_mov_b32 s8, s2
5388; GFX1132-NEXT:    s_mov_b32 s9, s3
5389; GFX1132-NEXT:    buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc
5390; GFX1132-NEXT:    s_waitcnt vmcnt(0)
5391; GFX1132-NEXT:    buffer_gl1_inv
5392; GFX1132-NEXT:    buffer_gl0_inv
5393; GFX1132-NEXT:  .LBB9_2:
5394; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s4
5395; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
5396; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
5397; GFX1132-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
5398; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
5399; GFX1132-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
5400; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
5401; GFX1132-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v0
5402; GFX1132-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
5403; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
5404; GFX1132-NEXT:    s_mov_b32 s2, -1
5405; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
5406; GFX1132-NEXT:    s_endpgm
5407;
5408; GFX1264-LABEL: sub_i64_constant:
5409; GFX1264:       ; %bb.0: ; %entry
5410; GFX1264-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
5411; GFX1264-NEXT:    s_mov_b64 s[6:7], exec
5412; GFX1264-NEXT:    s_mov_b32 s9, 0
5413; GFX1264-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
5414; GFX1264-NEXT:    s_mov_b64 s[4:5], exec
5415; GFX1264-NEXT:    s_wait_alu 0xfffe
5416; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5417; GFX1264-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
5418; GFX1264-NEXT:    ; implicit-def: $vgpr0_vgpr1
5419; GFX1264-NEXT:    v_cmpx_eq_u32_e32 0, v2
5420; GFX1264-NEXT:    s_cbranch_execz .LBB9_2
5421; GFX1264-NEXT:  ; %bb.1:
5422; GFX1264-NEXT:    s_bcnt1_i32_b64 s8, s[6:7]
5423; GFX1264-NEXT:    s_mov_b32 s11, 0x31016000
5424; GFX1264-NEXT:    s_mul_u64 s[6:7], s[8:9], 5
5425; GFX1264-NEXT:    s_mov_b32 s10, -1
5426; GFX1264-NEXT:    s_wait_alu 0xfffe
5427; GFX1264-NEXT:    v_mov_b32_e32 v0, s6
5428; GFX1264-NEXT:    v_mov_b32_e32 v1, s7
5429; GFX1264-NEXT:    s_wait_kmcnt 0x0
5430; GFX1264-NEXT:    s_mov_b32 s8, s2
5431; GFX1264-NEXT:    s_mov_b32 s9, s3
5432; GFX1264-NEXT:    buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
5433; GFX1264-NEXT:    s_wait_loadcnt 0x0
5434; GFX1264-NEXT:    global_inv scope:SCOPE_DEV
5435; GFX1264-NEXT:  .LBB9_2:
5436; GFX1264-NEXT:    s_or_b64 exec, exec, s[4:5]
5437; GFX1264-NEXT:    s_wait_kmcnt 0x0
5438; GFX1264-NEXT:    v_readfirstlane_b32 s2, v0
5439; GFX1264-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
5440; GFX1264-NEXT:    v_readfirstlane_b32 s3, v1
5441; GFX1264-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
5442; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
5443; GFX1264-NEXT:    v_sub_co_u32 v0, vcc, s2, v0
5444; GFX1264-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
5445; GFX1264-NEXT:    s_mov_b32 s3, 0x31016000
5446; GFX1264-NEXT:    s_mov_b32 s2, -1
5447; GFX1264-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
5448; GFX1264-NEXT:    s_endpgm
5449;
5450; GFX1232-LABEL: sub_i64_constant:
5451; GFX1232:       ; %bb.0: ; %entry
5452; GFX1232-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
5453; GFX1232-NEXT:    s_mov_b32 s7, exec_lo
5454; GFX1232-NEXT:    s_mov_b32 s5, 0
5455; GFX1232-NEXT:    v_mbcnt_lo_u32_b32 v2, s7, 0
5456; GFX1232-NEXT:    s_mov_b32 s6, exec_lo
5457; GFX1232-NEXT:    ; implicit-def: $vgpr0_vgpr1
5458; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5459; GFX1232-NEXT:    v_cmpx_eq_u32_e32 0, v2
5460; GFX1232-NEXT:    s_cbranch_execz .LBB9_2
5461; GFX1232-NEXT:  ; %bb.1:
5462; GFX1232-NEXT:    s_wait_alu 0xfffe
5463; GFX1232-NEXT:    s_bcnt1_i32_b32 s4, s7
5464; GFX1232-NEXT:    s_mov_b32 s11, 0x31016000
5465; GFX1232-NEXT:    s_mul_u64 s[4:5], s[4:5], 5
5466; GFX1232-NEXT:    s_mov_b32 s10, -1
5467; GFX1232-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
5468; GFX1232-NEXT:    s_wait_kmcnt 0x0
5469; GFX1232-NEXT:    s_mov_b32 s8, s2
5470; GFX1232-NEXT:    s_mov_b32 s9, s3
5471; GFX1232-NEXT:    buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
5472; GFX1232-NEXT:    s_wait_loadcnt 0x0
5473; GFX1232-NEXT:    global_inv scope:SCOPE_DEV
5474; GFX1232-NEXT:  .LBB9_2:
5475; GFX1232-NEXT:    s_wait_alu 0xfffe
5476; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s6
5477; GFX1232-NEXT:    s_wait_kmcnt 0x0
5478; GFX1232-NEXT:    v_readfirstlane_b32 s2, v0
5479; GFX1232-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
5480; GFX1232-NEXT:    v_readfirstlane_b32 s3, v1
5481; GFX1232-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
5482; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
5483; GFX1232-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v0
5484; GFX1232-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
5485; GFX1232-NEXT:    s_mov_b32 s3, 0x31016000
5486; GFX1232-NEXT:    s_mov_b32 s2, -1
5487; GFX1232-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
5488; GFX1232-NEXT:    s_endpgm
5489entry:
  ; %old is the result of the atomicrmw (the value %inout held before the
  ; subtraction); it is the only value written to %out.
5490  %old = atomicrmw sub ptr addrspace(1) %inout, i64 5 syncscope("agent") acq_rel
5491  store i64 %old, ptr addrspace(1) %out
5492  ret void
5493}
5494
; sub_i64_uniform: 64-bit `atomicrmw sub` on a global (addrspace(1)) pointer
; whose operand %subitive is a kernel argument (loaded with scalar loads in the
; expected output, so it is the same value in every lane). The value returned
; by the atomic is stored to %out.
;
; What the check lines below pin down, per target:
;   * v_mbcnt_* over a saved copy of exec gives each lane its index among the
;     active lanes; only the lane whose index is 0 enters %bb.1.
;   * %bb.1 multiplies %subitive by the active-lane count (s_bcnt1_*) and
;     issues a single returning buffer_atomic_sub_x2 / buffer_atomic_sub_u64.
;   * After exec is restored, every lane computes
;     readfirstlane(old) - %subitive * mbcnt as its own result and stores it.
;
; All expectations here use the base prefixes (GFX7LESS, GFX8, ...), not the
; *_ITERATIVE ones.
; NOTE(review): presumably this means the output does not depend on
; -amdgpu-atomic-optimizer-strategy; only the Iterative RUN lines are visible
; from this excerpt, so confirm against the full RUN list.
;
; The check lines are autogenerated (see the note on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
5495define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(1) %inout, i64 %subitive) {
5496; GFX7LESS-LABEL: sub_i64_uniform:
5497; GFX7LESS:       ; %bb.0: ; %entry
5498; GFX7LESS-NEXT:    s_mov_b64 s[8:9], exec
5499; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
5500; GFX7LESS-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
5501; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s8, 0
5502; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s9, v0
5503; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
5504; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
5505; GFX7LESS-NEXT:    s_and_saveexec_b64 s[6:7], vcc
5506; GFX7LESS-NEXT:    s_cbranch_execz .LBB10_2
5507; GFX7LESS-NEXT:  ; %bb.1:
5508; GFX7LESS-NEXT:    s_mov_b32 s15, 0xf000
5509; GFX7LESS-NEXT:    s_mov_b32 s14, -1
5510; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5511; GFX7LESS-NEXT:    s_mov_b32 s12, s2
5512; GFX7LESS-NEXT:    s_mov_b32 s13, s3
5513; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[8:9]
5514; GFX7LESS-NEXT:    s_mul_i32 s3, s5, s2
5515; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s2
5516; GFX7LESS-NEXT:    v_mul_hi_u32 v0, s4, v0
5517; GFX7LESS-NEXT:    s_mul_i32 s2, s4, s2
5518; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, s3, v0
5519; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s2
5520; GFX7LESS-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc
5521; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
5522; GFX7LESS-NEXT:    buffer_wbinvl1
5523; GFX7LESS-NEXT:  .LBB10_2:
5524; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[6:7]
5525; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5526; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
5527; GFX7LESS-NEXT:    s_mov_b32 s2, -1
5528; GFX7LESS-NEXT:    v_readfirstlane_b32 s6, v1
5529; GFX7LESS-NEXT:    v_readfirstlane_b32 s7, v0
5530; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
5531; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s5, v2
5532; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s4, v2
5533; GFX7LESS-NEXT:    v_mul_lo_u32 v2, s4, v2
5534; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
5535; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s6
5536; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s7, v2
5537; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
5538; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5539; GFX7LESS-NEXT:    s_endpgm
5540;
5541; GFX8-LABEL: sub_i64_uniform:
5542; GFX8:       ; %bb.0: ; %entry
5543; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5544; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
5545; GFX8-NEXT:    s_mov_b64 s[8:9], exec
5546; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
5547; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
5548; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
5549; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
5550; GFX8-NEXT:    s_and_saveexec_b64 s[6:7], vcc
5551; GFX8-NEXT:    s_cbranch_execz .LBB10_2
5552; GFX8-NEXT:  ; %bb.1:
5553; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5554; GFX8-NEXT:    s_mov_b32 s12, s2
5555; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[8:9]
5556; GFX8-NEXT:    v_mov_b32_e32 v0, s2
5557; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], s4, v0, 0
5558; GFX8-NEXT:    s_mul_i32 s2, s5, s2
5559; GFX8-NEXT:    s_mov_b32 s15, 0xf000
5560; GFX8-NEXT:    s_mov_b32 s14, -1
5561; GFX8-NEXT:    s_mov_b32 s13, s3
5562; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s2, v1
5563; GFX8-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc
5564; GFX8-NEXT:    s_waitcnt vmcnt(0)
5565; GFX8-NEXT:    buffer_wbinvl1_vol
5566; GFX8-NEXT:  .LBB10_2:
5567; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
5568; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5569; GFX8-NEXT:    v_mul_lo_u32 v4, s5, v2
5570; GFX8-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], s4, v2, 0
5571; GFX8-NEXT:    v_readfirstlane_b32 s4, v1
5572; GFX8-NEXT:    v_readfirstlane_b32 s5, v0
5573; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v4
5574; GFX8-NEXT:    v_mov_b32_e32 v3, s4
5575; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s5, v2
5576; GFX8-NEXT:    s_mov_b32 s3, 0xf000
5577; GFX8-NEXT:    s_mov_b32 s2, -1
5578; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
5579; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5580; GFX8-NEXT:    s_endpgm
5581;
5582; GFX9-LABEL: sub_i64_uniform:
5583; GFX9:       ; %bb.0: ; %entry
5584; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5585; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
5586; GFX9-NEXT:    s_mov_b64 s[8:9], exec
5587; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
5588; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
5589; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
5590; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
5591; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5592; GFX9-NEXT:    s_cbranch_execz .LBB10_2
5593; GFX9-NEXT:  ; %bb.1:
5594; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5595; GFX9-NEXT:    s_mov_b32 s12, s2
5596; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[8:9]
5597; GFX9-NEXT:    s_mov_b32 s13, s3
5598; GFX9-NEXT:    s_mul_i32 s3, s7, s2
5599; GFX9-NEXT:    s_mul_hi_u32 s8, s6, s2
5600; GFX9-NEXT:    s_add_i32 s8, s8, s3
5601; GFX9-NEXT:    s_mul_i32 s2, s6, s2
5602; GFX9-NEXT:    s_mov_b32 s15, 0xf000
5603; GFX9-NEXT:    s_mov_b32 s14, -1
5604; GFX9-NEXT:    v_mov_b32_e32 v0, s2
5605; GFX9-NEXT:    v_mov_b32_e32 v1, s8
5606; GFX9-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc
5607; GFX9-NEXT:    s_waitcnt vmcnt(0)
5608; GFX9-NEXT:    buffer_wbinvl1_vol
5609; GFX9-NEXT:  .LBB10_2:
5610; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
5611; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5612; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[2:3], s6, v2, 0
5613; GFX9-NEXT:    s_mov_b32 s3, 0xf000
5614; GFX9-NEXT:    s_mov_b32 s2, -1
5615; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], s7, v2, v[4:5]
5616; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
5617; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
5618; GFX9-NEXT:    v_mov_b32_e32 v1, v4
5619; GFX9-NEXT:    v_mov_b32_e32 v2, s4
5620; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s5, v3
5621; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
5622; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5623; GFX9-NEXT:    s_endpgm
5624;
5625; GFX1064-LABEL: sub_i64_uniform:
5626; GFX1064:       ; %bb.0: ; %entry
5627; GFX1064-NEXT:    s_clause 0x1
5628; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5629; GFX1064-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
5630; GFX1064-NEXT:    s_mov_b64 s[8:9], exec
5631; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
5632; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
5633; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
5634; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
5635; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5636; GFX1064-NEXT:    s_cbranch_execz .LBB10_2
5637; GFX1064-NEXT:  ; %bb.1:
5638; GFX1064-NEXT:    s_bcnt1_i32_b64 s8, s[8:9]
5639; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
5640; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5641; GFX1064-NEXT:    s_mul_i32 s9, s7, s8
5642; GFX1064-NEXT:    s_mul_hi_u32 s10, s6, s8
5643; GFX1064-NEXT:    s_mul_i32 s8, s6, s8
5644; GFX1064-NEXT:    s_add_i32 s10, s10, s9
5645; GFX1064-NEXT:    v_mov_b32_e32 v0, s8
5646; GFX1064-NEXT:    v_mov_b32_e32 v1, s10
5647; GFX1064-NEXT:    s_mov_b32 s10, -1
5648; GFX1064-NEXT:    s_mov_b32 s8, s2
5649; GFX1064-NEXT:    s_mov_b32 s9, s3
5650; GFX1064-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
5651; GFX1064-NEXT:    s_waitcnt vmcnt(0)
5652; GFX1064-NEXT:    buffer_gl1_inv
5653; GFX1064-NEXT:    buffer_gl0_inv
5654; GFX1064-NEXT:  .LBB10_2:
5655; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
5656; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
5657; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5658; GFX1064-NEXT:    v_mad_u64_u32 v[3:4], s[2:3], s6, v2, 0
5659; GFX1064-NEXT:    v_mad_u64_u32 v[4:5], s[2:3], s7, v2, v[4:5]
5660; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
5661; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
5662; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v3
5663; GFX1064-NEXT:    v_mov_b32_e32 v1, v4
5664; GFX1064-NEXT:    s_mov_b32 s2, -1
5665; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
5666; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
5667; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5668; GFX1064-NEXT:    s_endpgm
5669;
5670; GFX1032-LABEL: sub_i64_uniform:
5671; GFX1032:       ; %bb.0: ; %entry
5672; GFX1032-NEXT:    s_clause 0x1
5673; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5674; GFX1032-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
5675; GFX1032-NEXT:    s_mov_b32 s8, exec_lo
5676; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
5677; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s8, 0
5678; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
5679; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
5680; GFX1032-NEXT:    s_cbranch_execz .LBB10_2
5681; GFX1032-NEXT:  ; %bb.1:
5682; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s8
5683; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
5684; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5685; GFX1032-NEXT:    s_mul_i32 s8, s7, s5
5686; GFX1032-NEXT:    s_mul_hi_u32 s9, s6, s5
5687; GFX1032-NEXT:    s_mul_i32 s5, s6, s5
5688; GFX1032-NEXT:    s_add_i32 s9, s9, s8
5689; GFX1032-NEXT:    v_mov_b32_e32 v0, s5
5690; GFX1032-NEXT:    v_mov_b32_e32 v1, s9
5691; GFX1032-NEXT:    s_mov_b32 s10, -1
5692; GFX1032-NEXT:    s_mov_b32 s8, s2
5693; GFX1032-NEXT:    s_mov_b32 s9, s3
5694; GFX1032-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
5695; GFX1032-NEXT:    s_waitcnt vmcnt(0)
5696; GFX1032-NEXT:    buffer_gl1_inv
5697; GFX1032-NEXT:    buffer_gl0_inv
5698; GFX1032-NEXT:  .LBB10_2:
5699; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
5700; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
5701; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5702; GFX1032-NEXT:    v_mad_u64_u32 v[3:4], s2, s6, v2, 0
5703; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
5704; GFX1032-NEXT:    v_mad_u64_u32 v[4:5], s2, s7, v2, v[4:5]
5705; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
5706; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v3
5707; GFX1032-NEXT:    v_mov_b32_e32 v1, v4
5708; GFX1032-NEXT:    s_mov_b32 s2, -1
5709; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
5710; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
5711; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5712; GFX1032-NEXT:    s_endpgm
5713;
5714; GFX1164-LABEL: sub_i64_uniform:
5715; GFX1164:       ; %bb.0: ; %entry
5716; GFX1164-NEXT:    s_clause 0x1
5717; GFX1164-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
5718; GFX1164-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
5719; GFX1164-NEXT:    s_mov_b64 s[8:9], exec
5720; GFX1164-NEXT:    s_mov_b64 s[6:7], exec
5721; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
5722; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5723; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
5724; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
5725; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
5726; GFX1164-NEXT:    s_cbranch_execz .LBB10_2
5727; GFX1164-NEXT:  ; %bb.1:
5728; GFX1164-NEXT:    s_bcnt1_i32_b64 s8, s[8:9]
5729; GFX1164-NEXT:    s_mov_b32 s11, 0x31016000
5730; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5731; GFX1164-NEXT:    s_mul_i32 s9, s5, s8
5732; GFX1164-NEXT:    s_mul_hi_u32 s10, s4, s8
5733; GFX1164-NEXT:    s_mul_i32 s8, s4, s8
5734; GFX1164-NEXT:    s_add_i32 s10, s10, s9
5735; GFX1164-NEXT:    v_mov_b32_e32 v0, s8
5736; GFX1164-NEXT:    v_mov_b32_e32 v1, s10
5737; GFX1164-NEXT:    s_mov_b32 s10, -1
5738; GFX1164-NEXT:    s_mov_b32 s8, s2
5739; GFX1164-NEXT:    s_mov_b32 s9, s3
5740; GFX1164-NEXT:    buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc
5741; GFX1164-NEXT:    s_waitcnt vmcnt(0)
5742; GFX1164-NEXT:    buffer_gl1_inv
5743; GFX1164-NEXT:    buffer_gl0_inv
5744; GFX1164-NEXT:  .LBB10_2:
5745; GFX1164-NEXT:    s_or_b64 exec, exec, s[6:7]
5746; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5747; GFX1164-NEXT:    v_mad_u64_u32 v[3:4], null, s4, v2, 0
5748; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
5749; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
5750; GFX1164-NEXT:    s_waitcnt_depctr 0xfff
5751; GFX1164-NEXT:    v_mad_u64_u32 v[5:6], null, s5, v2, v[4:5]
5752; GFX1164-NEXT:    v_sub_co_u32 v0, vcc, s2, v3
5753; GFX1164-NEXT:    s_mov_b32 s2, -1
5754; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
5755; GFX1164-NEXT:    v_mov_b32_e32 v1, v5
5756; GFX1164-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
5757; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
5758; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
5759; GFX1164-NEXT:    s_endpgm
5760;
5761; GFX1132-LABEL: sub_i64_uniform:
5762; GFX1132:       ; %bb.0: ; %entry
5763; GFX1132-NEXT:    s_clause 0x1
5764; GFX1132-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
5765; GFX1132-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
5766; GFX1132-NEXT:    s_mov_b32 s7, exec_lo
5767; GFX1132-NEXT:    s_mov_b32 s6, exec_lo
5768; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, s7, 0
5769; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
5770; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5771; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
5772; GFX1132-NEXT:    s_cbranch_execz .LBB10_2
5773; GFX1132-NEXT:  ; %bb.1:
5774; GFX1132-NEXT:    s_bcnt1_i32_b32 s7, s7
5775; GFX1132-NEXT:    s_mov_b32 s11, 0x31016000
5776; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
5777; GFX1132-NEXT:    s_mul_i32 s8, s5, s7
5778; GFX1132-NEXT:    s_mul_hi_u32 s9, s4, s7
5779; GFX1132-NEXT:    s_mul_i32 s7, s4, s7
5780; GFX1132-NEXT:    s_add_i32 s9, s9, s8
5781; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
5782; GFX1132-NEXT:    v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v1, s9
5783; GFX1132-NEXT:    s_mov_b32 s10, -1
5784; GFX1132-NEXT:    s_mov_b32 s8, s2
5785; GFX1132-NEXT:    s_mov_b32 s9, s3
5786; GFX1132-NEXT:    buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc
5787; GFX1132-NEXT:    s_waitcnt vmcnt(0)
5788; GFX1132-NEXT:    buffer_gl1_inv
5789; GFX1132-NEXT:    buffer_gl0_inv
5790; GFX1132-NEXT:  .LBB10_2:
5791; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s6
5792; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
5793; GFX1132-NEXT:    v_mad_u64_u32 v[3:4], null, s4, v2, 0
5794; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
5795; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
5796; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
5797; GFX1132-NEXT:    v_mad_u64_u32 v[5:6], null, s5, v2, v[4:5]
5798; GFX1132-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v3
5799; GFX1132-NEXT:    s_mov_b32 s2, -1
5800; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
5801; GFX1132-NEXT:    v_mov_b32_e32 v1, v5
5802; GFX1132-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
5803; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
5804; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
5805; GFX1132-NEXT:    s_endpgm
5806;
5807; GFX1264-LABEL: sub_i64_uniform:
5808; GFX1264:       ; %bb.0: ; %entry
5809; GFX1264-NEXT:    s_clause 0x1
5810; GFX1264-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
5811; GFX1264-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
5812; GFX1264-NEXT:    s_mov_b64 s[8:9], exec
5813; GFX1264-NEXT:    s_mov_b32 s11, 0
5814; GFX1264-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
5815; GFX1264-NEXT:    s_mov_b64 s[6:7], exec
5816; GFX1264-NEXT:    s_wait_alu 0xfffe
5817; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5818; GFX1264-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
5819; GFX1264-NEXT:    ; implicit-def: $vgpr0_vgpr1
5820; GFX1264-NEXT:    v_cmpx_eq_u32_e32 0, v2
5821; GFX1264-NEXT:    s_cbranch_execz .LBB10_2
5822; GFX1264-NEXT:  ; %bb.1:
5823; GFX1264-NEXT:    s_bcnt1_i32_b64 s10, s[8:9]
5824; GFX1264-NEXT:    s_wait_kmcnt 0x0
5825; GFX1264-NEXT:    s_mul_u64 s[8:9], s[4:5], s[10:11]
5826; GFX1264-NEXT:    s_mov_b32 s11, 0x31016000
5827; GFX1264-NEXT:    s_wait_alu 0xfffe
5828; GFX1264-NEXT:    v_mov_b32_e32 v0, s8
5829; GFX1264-NEXT:    v_mov_b32_e32 v1, s9
5830; GFX1264-NEXT:    s_mov_b32 s10, -1
5831; GFX1264-NEXT:    s_mov_b32 s8, s2
5832; GFX1264-NEXT:    s_mov_b32 s9, s3
5833; GFX1264-NEXT:    buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
5834; GFX1264-NEXT:    s_wait_loadcnt 0x0
5835; GFX1264-NEXT:    global_inv scope:SCOPE_DEV
5836; GFX1264-NEXT:  .LBB10_2:
5837; GFX1264-NEXT:    s_or_b64 exec, exec, s[6:7]
5838; GFX1264-NEXT:    s_wait_kmcnt 0x0
5839; GFX1264-NEXT:    v_mad_co_u64_u32 v[3:4], null, s4, v2, 0
5840; GFX1264-NEXT:    v_readfirstlane_b32 s2, v0
5841; GFX1264-NEXT:    v_readfirstlane_b32 s3, v1
5842; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
5843; GFX1264-NEXT:    v_mad_co_u64_u32 v[4:5], null, s5, v2, v[4:5]
5844; GFX1264-NEXT:    v_sub_co_u32 v0, vcc, s2, v3
5845; GFX1264-NEXT:    s_mov_b32 s2, -1
5846; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
5847; GFX1264-NEXT:    v_mov_b32_e32 v1, v4
5848; GFX1264-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
5849; GFX1264-NEXT:    s_mov_b32 s3, 0x31016000
5850; GFX1264-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
5851; GFX1264-NEXT:    s_endpgm
5852;
5853; GFX1232-LABEL: sub_i64_uniform:
5854; GFX1232:       ; %bb.0: ; %entry
5855; GFX1232-NEXT:    s_clause 0x1
5856; GFX1232-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
5857; GFX1232-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
5858; GFX1232-NEXT:    s_mov_b32 s6, exec_lo
5859; GFX1232-NEXT:    s_mov_b32 s7, 0
5860; GFX1232-NEXT:    v_mbcnt_lo_u32_b32 v2, s6, 0
5861; GFX1232-NEXT:    s_mov_b32 s8, exec_lo
5862; GFX1232-NEXT:    ; implicit-def: $vgpr0_vgpr1
5863; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5864; GFX1232-NEXT:    v_cmpx_eq_u32_e32 0, v2
5865; GFX1232-NEXT:    s_cbranch_execz .LBB10_2
5866; GFX1232-NEXT:  ; %bb.1:
5867; GFX1232-NEXT:    s_wait_alu 0xfffe
5868; GFX1232-NEXT:    s_bcnt1_i32_b32 s6, s6
5869; GFX1232-NEXT:    s_mov_b32 s15, 0x31016000
5870; GFX1232-NEXT:    s_wait_kmcnt 0x0
5871; GFX1232-NEXT:    s_wait_alu 0xfffe
5872; GFX1232-NEXT:    s_mul_u64 s[6:7], s[4:5], s[6:7]
5873; GFX1232-NEXT:    s_mov_b32 s14, -1
5874; GFX1232-NEXT:    s_wait_alu 0xfffe
5875; GFX1232-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
5876; GFX1232-NEXT:    s_mov_b32 s12, s2
5877; GFX1232-NEXT:    s_mov_b32 s13, s3
5878; GFX1232-NEXT:    buffer_atomic_sub_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
5879; GFX1232-NEXT:    s_wait_loadcnt 0x0
5880; GFX1232-NEXT:    global_inv scope:SCOPE_DEV
5881; GFX1232-NEXT:  .LBB10_2:
5882; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s8
5883; GFX1232-NEXT:    s_wait_kmcnt 0x0
5884; GFX1232-NEXT:    v_mad_co_u64_u32 v[3:4], null, s4, v2, 0
5885; GFX1232-NEXT:    v_readfirstlane_b32 s2, v0
5886; GFX1232-NEXT:    v_readfirstlane_b32 s3, v1
5887; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
5888; GFX1232-NEXT:    v_mad_co_u64_u32 v[4:5], null, s5, v2, v[4:5]
5889; GFX1232-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v3
5890; GFX1232-NEXT:    s_mov_b32 s2, -1
5891; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
5892; GFX1232-NEXT:    v_mov_b32_e32 v1, v4
5893; GFX1232-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
5894; GFX1232-NEXT:    s_mov_b32 s3, 0x31016000
5895; GFX1232-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
5896; GFX1232-NEXT:    s_endpgm
; IR under test: agent-scope, acq_rel 64-bit subtraction of %subitive from the
; value at %inout; %old is the value atomicrmw returns, which is then stored
; to %out.
5897entry:
5898  %old = atomicrmw sub ptr addrspace(1) %inout, i64 %subitive syncscope("agent") acq_rel
5899  store i64 %old, ptr addrspace(1) %out
5900  ret void
5901}
5902
5903define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(1) %inout) {
5904; GFX7LESS_ITERATIVE-LABEL: sub_i64_varying:
5905; GFX7LESS_ITERATIVE:       ; %bb.0: ; %entry
5906; GFX7LESS_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
5907; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
5908; GFX7LESS_ITERATIVE-NEXT:    s_mov_b64 s[6:7], 0
5909; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
5910; GFX7LESS_ITERATIVE-NEXT:  .LBB11_1: ; %ComputeLoop
5911; GFX7LESS_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
5912; GFX7LESS_ITERATIVE-NEXT:    s_ff1_i32_b64 s2, s[0:1]
5913; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, s2
5914; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s3, v3, s2
5915; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s2
5916; GFX7LESS_ITERATIVE-NEXT:    v_writelane_b32 v2, s7, m0
5917; GFX7LESS_ITERATIVE-NEXT:    v_writelane_b32 v1, s6, m0
5918; GFX7LESS_ITERATIVE-NEXT:    s_add_u32 s6, s6, s8
5919; GFX7LESS_ITERATIVE-NEXT:    s_addc_u32 s7, s7, s3
5920; GFX7LESS_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
5921; GFX7LESS_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
5922; GFX7LESS_ITERATIVE-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
5923; GFX7LESS_ITERATIVE-NEXT:    s_and_b64 vcc, exec, s[2:3]
5924; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_vccnz .LBB11_1
5925; GFX7LESS_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
5926; GFX7LESS_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
5927; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
5928; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
5929; GFX7LESS_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5930; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
5931; GFX7LESS_ITERATIVE-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5932; GFX7LESS_ITERATIVE-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
5933; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_execz .LBB11_4
5934; GFX7LESS_ITERATIVE-NEXT:  ; %bb.3:
5935; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s11, 0xf000
5936; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s10, -1
5937; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
5938; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s8, s2
5939; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s9, s3
5940; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s6
5941; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s7
5942; GFX7LESS_ITERATIVE-NEXT:    buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc
5943; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
5944; GFX7LESS_ITERATIVE-NEXT:    buffer_wbinvl1
5945; GFX7LESS_ITERATIVE-NEXT:  .LBB11_4:
5946; GFX7LESS_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[4:5]
5947; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
5948; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
5949; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s2, -1
5950; GFX7LESS_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v4
5951; GFX7LESS_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v3
5952; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt expcnt(0)
5953; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s4
5954; GFX7LESS_ITERATIVE-NEXT:    v_sub_i32_e32 v0, vcc, s5, v1
5955; GFX7LESS_ITERATIVE-NEXT:    v_subb_u32_e32 v1, vcc, v3, v2, vcc
5956; GFX7LESS_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5957; GFX7LESS_ITERATIVE-NEXT:    s_endpgm
5958;
5959; GFX8_ITERATIVE-LABEL: sub_i64_varying:
5960; GFX8_ITERATIVE:       ; %bb.0: ; %entry
5961; GFX8_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
5962; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
5963; GFX8_ITERATIVE-NEXT:    s_mov_b64 s[6:7], 0
5964; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
5965; GFX8_ITERATIVE-NEXT:  .LBB11_1: ; %ComputeLoop
5966; GFX8_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
5967; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s2, s[0:1]
5968; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s2
5969; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s2
5970; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s3, v3, s2
5971; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v1, s6, m0
5972; GFX8_ITERATIVE-NEXT:    s_add_u32 s6, s6, s8
5973; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v2, s7, m0
5974; GFX8_ITERATIVE-NEXT:    s_addc_u32 s7, s7, s3
5975; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
5976; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
5977; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
5978; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB11_1
5979; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
5980; GFX8_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5981; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5982; GFX8_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5983; GFX8_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5984; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
5985; GFX8_ITERATIVE-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5986; GFX8_ITERATIVE-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
5987; GFX8_ITERATIVE-NEXT:    s_cbranch_execz .LBB11_4
5988; GFX8_ITERATIVE-NEXT:  ; %bb.3:
5989; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s6
5990; GFX8_ITERATIVE-NEXT:    s_mov_b32 s11, 0xf000
5991; GFX8_ITERATIVE-NEXT:    s_mov_b32 s10, -1
5992; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
5993; GFX8_ITERATIVE-NEXT:    s_mov_b32 s8, s2
5994; GFX8_ITERATIVE-NEXT:    s_mov_b32 s9, s3
5995; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s7
5996; GFX8_ITERATIVE-NEXT:    buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc
5997; GFX8_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
5998; GFX8_ITERATIVE-NEXT:    buffer_wbinvl1_vol
5999; GFX8_ITERATIVE-NEXT:  .LBB11_4:
6000; GFX8_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[4:5]
6001; GFX8_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v4
6002; GFX8_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v3
6003; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s4
6004; GFX8_ITERATIVE-NEXT:    v_sub_u32_e32 v0, vcc, s5, v1
6005; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
6006; GFX8_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
6007; GFX8_ITERATIVE-NEXT:    s_mov_b32 s2, -1
6008; GFX8_ITERATIVE-NEXT:    v_subb_u32_e32 v1, vcc, v3, v2, vcc
6009; GFX8_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6010; GFX8_ITERATIVE-NEXT:    s_endpgm
6011;
6012; GFX9_ITERATIVE-LABEL: sub_i64_varying:
6013; GFX9_ITERATIVE:       ; %bb.0: ; %entry
6014; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
6015; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
6016; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[6:7], 0
6017; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
6018; GFX9_ITERATIVE-NEXT:  .LBB11_1: ; %ComputeLoop
6019; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
6020; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s2, s[0:1]
6021; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s2
6022; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s2
6023; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s3, v3, s2
6024; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s6, m0
6025; GFX9_ITERATIVE-NEXT:    s_add_u32 s6, s6, s8
6026; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v2, s7, m0
6027; GFX9_ITERATIVE-NEXT:    s_addc_u32 s7, s7, s3
6028; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
6029; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
6030; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
6031; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB11_1
6032; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
6033; GFX9_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
6034; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6035; GFX9_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6036; GFX9_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6037; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
6038; GFX9_ITERATIVE-NEXT:    s_and_saveexec_b64 s[4:5], vcc
6039; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
6040; GFX9_ITERATIVE-NEXT:    s_cbranch_execz .LBB11_4
6041; GFX9_ITERATIVE-NEXT:  ; %bb.3:
6042; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s6
6043; GFX9_ITERATIVE-NEXT:    s_mov_b32 s11, 0xf000
6044; GFX9_ITERATIVE-NEXT:    s_mov_b32 s10, -1
6045; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
6046; GFX9_ITERATIVE-NEXT:    s_mov_b32 s8, s2
6047; GFX9_ITERATIVE-NEXT:    s_mov_b32 s9, s3
6048; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s7
6049; GFX9_ITERATIVE-NEXT:    buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc
6050; GFX9_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
6051; GFX9_ITERATIVE-NEXT:    buffer_wbinvl1_vol
6052; GFX9_ITERATIVE-NEXT:  .LBB11_4:
6053; GFX9_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[4:5]
6054; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v4
6055; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v3
6056; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s4
6057; GFX9_ITERATIVE-NEXT:    v_sub_co_u32_e32 v0, vcc, s5, v1
6058; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
6059; GFX9_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
6060; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, -1
6061; GFX9_ITERATIVE-NEXT:    v_subb_co_u32_e32 v1, vcc, v3, v2, vcc
6062; GFX9_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6063; GFX9_ITERATIVE-NEXT:    s_endpgm
6064;
6065; GFX1064_ITERATIVE-LABEL: sub_i64_varying:
6066; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
6067; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
6068; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
6069; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[6:7], 0
6070; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
6071; GFX1064_ITERATIVE-NEXT:  .LBB11_1: ; %ComputeLoop
6072; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
6073; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s2, s[0:1]
6074; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
6075; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s8, v3, s2
6076; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s6, s2
6077; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v2, s7, s2
6078; GFX1064_ITERATIVE-NEXT:    s_add_u32 s6, s6, s3
6079; GFX1064_ITERATIVE-NEXT:    s_addc_u32 s7, s7, s8
6080; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
6081; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
6082; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
6083; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB11_1
6084; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
6085; GFX1064_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
6086; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6087; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
6088; GFX1064_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6089; GFX1064_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6090; GFX1064_ITERATIVE-NEXT:    s_and_saveexec_b64 s[4:5], vcc
6091; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
6092; GFX1064_ITERATIVE-NEXT:    s_cbranch_execz .LBB11_4
6093; GFX1064_ITERATIVE-NEXT:  ; %bb.3:
6094; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s6
6095; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s7
6096; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
6097; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s10, -1
6098; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
6099; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s8, s2
6100; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s9, s3
6101; GFX1064_ITERATIVE-NEXT:    buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc
6102; GFX1064_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
6103; GFX1064_ITERATIVE-NEXT:    buffer_gl1_inv
6104; GFX1064_ITERATIVE-NEXT:    buffer_gl0_inv
6105; GFX1064_ITERATIVE-NEXT:  .LBB11_4:
6106; GFX1064_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
6107; GFX1064_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[4:5]
6108; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
6109; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
6110; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v4
6111; GFX1064_ITERATIVE-NEXT:    v_sub_co_u32 v0, vcc, s2, v1
6112; GFX1064_ITERATIVE-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc
6113; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
6114; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, -1
6115; GFX1064_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6116; GFX1064_ITERATIVE-NEXT:    s_endpgm
6117;
6118; GFX1032_ITERATIVE-LABEL: sub_i64_varying:
6119; GFX1032_ITERATIVE:       ; %bb.0: ; %entry
6120; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
6121; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s0, exec_lo
6122; GFX1032_ITERATIVE-NEXT:    s_mov_b64 s[6:7], 0
6123; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
6124; GFX1032_ITERATIVE-NEXT:  .LBB11_1: ; %ComputeLoop
6125; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
6126; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s1, s0
6127; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s2, v0, s1
6128; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s3, v3, s1
6129; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s6, s1
6130; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v2, s7, s1
6131; GFX1032_ITERATIVE-NEXT:    s_add_u32 s6, s6, s2
6132; GFX1032_ITERATIVE-NEXT:    s_addc_u32 s7, s7, s3
6133; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s1, 1, s1
6134; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s0, s0, s1
6135; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
6136; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB11_1
6137; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
6138; GFX1032_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
6139; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6140; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
6141; GFX1032_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
6142; GFX1032_ITERATIVE-NEXT:    s_and_saveexec_b32 s4, vcc_lo
6143; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s4, exec_lo, s4
6144; GFX1032_ITERATIVE-NEXT:    s_cbranch_execz .LBB11_4
6145; GFX1032_ITERATIVE-NEXT:  ; %bb.3:
6146; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s6
6147; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s7
6148; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
6149; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s10, -1
6150; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
6151; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s8, s2
6152; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s9, s3
6153; GFX1032_ITERATIVE-NEXT:    buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc
6154; GFX1032_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
6155; GFX1032_ITERATIVE-NEXT:    buffer_gl1_inv
6156; GFX1032_ITERATIVE-NEXT:    buffer_gl0_inv
6157; GFX1032_ITERATIVE-NEXT:  .LBB11_4:
6158; GFX1032_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
6159; GFX1032_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s4
6160; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
6161; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
6162; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v4
6163; GFX1032_ITERATIVE-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v1
6164; GFX1032_ITERATIVE-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
6165; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
6166; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, -1
6167; GFX1032_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6168; GFX1032_ITERATIVE-NEXT:    s_endpgm
6169;
6170; GFX1164_ITERATIVE-LABEL: sub_i64_varying:
6171; GFX1164_ITERATIVE:       ; %bb.0: ; %entry
6172; GFX1164_ITERATIVE-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
6173; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
6174; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
6175; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[6:7], 0
6176; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
6177; GFX1164_ITERATIVE-NEXT:  .LBB11_1: ; %ComputeLoop
6178; GFX1164_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
6179; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s2, s[0:1]
6180; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
6181; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s3, v2, s2
6182; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s8, v3, s2
6183; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s2
6184; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v1, s7, s2
6185; GFX1164_ITERATIVE-NEXT:    s_add_u32 s6, s6, s3
6186; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
6187; GFX1164_ITERATIVE-NEXT:    s_addc_u32 s7, s7, s8
6188; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
6189; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
6190; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
6191; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
6192; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB11_1
6193; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
6194; GFX1164_ITERATIVE-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
6195; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
6196; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6197; GFX1164_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
6198; GFX1164_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
6199; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr2_vgpr3
6200; GFX1164_ITERATIVE-NEXT:    s_and_saveexec_b64 s[4:5], vcc
6201; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
6202; GFX1164_ITERATIVE-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
6203; GFX1164_ITERATIVE-NEXT:    s_cbranch_execz .LBB11_4
6204; GFX1164_ITERATIVE-NEXT:  ; %bb.3:
6205; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s6
6206; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s7
6207; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
6208; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s10, -1
6209; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
6210; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s8, s2
6211; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s9, s3
6212; GFX1164_ITERATIVE-NEXT:    buffer_atomic_sub_u64 v[2:3], off, s[8:11], 0 glc
6213; GFX1164_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
6214; GFX1164_ITERATIVE-NEXT:    buffer_gl1_inv
6215; GFX1164_ITERATIVE-NEXT:    buffer_gl0_inv
6216; GFX1164_ITERATIVE-NEXT:  .LBB11_4:
6217; GFX1164_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[4:5]
6218; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
6219; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v2
6220; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v3
6221; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
6222; GFX1164_ITERATIVE-NEXT:    v_sub_co_u32 v0, vcc, s2, v0
6223; GFX1164_ITERATIVE-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
6224; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
6225; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
6226; GFX1164_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
6227; GFX1164_ITERATIVE-NEXT:    s_endpgm
6228;
6229; GFX1132_ITERATIVE-LABEL: sub_i64_varying:
6230; GFX1132_ITERATIVE:       ; %bb.0: ; %entry
6231; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0
6232; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s0, exec_lo
6233; GFX1132_ITERATIVE-NEXT:    s_mov_b64 s[6:7], 0
6234; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
6235; GFX1132_ITERATIVE-NEXT:  .LBB11_1: ; %ComputeLoop
6236; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
6237; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s1, s0
6238; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
6239; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s2, v2, s1
6240; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s3, v3, s1
6241; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s1
6242; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s7, s1
6243; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
6244; GFX1132_ITERATIVE-NEXT:    s_add_u32 s6, s6, s2
6245; GFX1132_ITERATIVE-NEXT:    s_addc_u32 s7, s7, s3
6246; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s1, 1, s1
6247; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
6248; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s0, s0, s1
6249; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
6250; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB11_1
6251; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
6252; GFX1132_ITERATIVE-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
6253; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
6254; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
6255; GFX1132_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
6256; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr2_vgpr3
6257; GFX1132_ITERATIVE-NEXT:    s_and_saveexec_b32 s4, vcc_lo
6258; GFX1132_ITERATIVE-NEXT:    s_xor_b32 s4, exec_lo, s4
6259; GFX1132_ITERATIVE-NEXT:    s_cbranch_execz .LBB11_4
6260; GFX1132_ITERATIVE-NEXT:  ; %bb.3:
6261; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
6262; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
6263; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s10, -1
6264; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
6265; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s8, s2
6266; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s9, s3
6267; GFX1132_ITERATIVE-NEXT:    buffer_atomic_sub_u64 v[2:3], off, s[8:11], 0 glc
6268; GFX1132_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
6269; GFX1132_ITERATIVE-NEXT:    buffer_gl1_inv
6270; GFX1132_ITERATIVE-NEXT:    buffer_gl0_inv
6271; GFX1132_ITERATIVE-NEXT:  .LBB11_4:
6272; GFX1132_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s4
6273; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
6274; GFX1132_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v2
6275; GFX1132_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v3
6276; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
6277; GFX1132_ITERATIVE-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v0
6278; GFX1132_ITERATIVE-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
6279; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
6280; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
6281; GFX1132_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
6282; GFX1132_ITERATIVE-NEXT:    s_endpgm
6283;
6284; GFX1264_ITERATIVE-LABEL: sub_i64_varying:
6285; GFX1264_ITERATIVE:       ; %bb.0: ; %entry
6286; GFX1264_ITERATIVE-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
6287; GFX1264_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
6288; GFX1264_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
6289; GFX1264_ITERATIVE-NEXT:    s_mov_b64 s[6:7], 0
6290; GFX1264_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
6291; GFX1264_ITERATIVE-NEXT:  .LBB11_1: ; %ComputeLoop
6292; GFX1264_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
6293; GFX1264_ITERATIVE-NEXT:    s_ctz_i32_b64 s10, s[0:1]
6294; GFX1264_ITERATIVE-NEXT:    s_wait_alu 0xfffe
6295; GFX1264_ITERATIVE-NEXT:    v_readlane_b32 s3, v3, s10
6296; GFX1264_ITERATIVE-NEXT:    v_readlane_b32 s2, v2, s10
6297; GFX1264_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s10
6298; GFX1264_ITERATIVE-NEXT:    v_writelane_b32 v1, s7, s10
6299; GFX1264_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s10
6300; GFX1264_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[8:9]
6301; GFX1264_ITERATIVE-NEXT:    s_add_nc_u64 s[6:7], s[6:7], s[2:3]
6302; GFX1264_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
6303; GFX1264_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB11_1
6304; GFX1264_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
6305; GFX1264_ITERATIVE-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
6306; GFX1264_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
6307; GFX1264_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6308; GFX1264_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
6309; GFX1264_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
6310; GFX1264_ITERATIVE-NEXT:    ; implicit-def: $vgpr2_vgpr3
6311; GFX1264_ITERATIVE-NEXT:    s_and_saveexec_b64 s[4:5], vcc
6312; GFX1264_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
6313; GFX1264_ITERATIVE-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
6314; GFX1264_ITERATIVE-NEXT:    s_cbranch_execz .LBB11_4
6315; GFX1264_ITERATIVE-NEXT:  ; %bb.3:
6316; GFX1264_ITERATIVE-NEXT:    s_wait_alu 0xfffe
6317; GFX1264_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s6
6318; GFX1264_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s7
6319; GFX1264_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
6320; GFX1264_ITERATIVE-NEXT:    s_mov_b32 s10, -1
6321; GFX1264_ITERATIVE-NEXT:    s_wait_kmcnt 0x0
6322; GFX1264_ITERATIVE-NEXT:    s_mov_b32 s8, s2
6323; GFX1264_ITERATIVE-NEXT:    s_mov_b32 s9, s3
6324; GFX1264_ITERATIVE-NEXT:    buffer_atomic_sub_u64 v[2:3], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
6325; GFX1264_ITERATIVE-NEXT:    s_wait_loadcnt 0x0
6326; GFX1264_ITERATIVE-NEXT:    global_inv scope:SCOPE_DEV
6327; GFX1264_ITERATIVE-NEXT:  .LBB11_4:
6328; GFX1264_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[4:5]
6329; GFX1264_ITERATIVE-NEXT:    s_wait_kmcnt 0x0
6330; GFX1264_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v2
6331; GFX1264_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v3
6332; GFX1264_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
6333; GFX1264_ITERATIVE-NEXT:    v_sub_co_u32 v0, vcc, s2, v0
6334; GFX1264_ITERATIVE-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
6335; GFX1264_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
6336; GFX1264_ITERATIVE-NEXT:    s_mov_b32 s2, -1
6337; GFX1264_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
6338; GFX1264_ITERATIVE-NEXT:    s_endpgm
6339;
6340; GFX1232_ITERATIVE-LABEL: sub_i64_varying:
6341; GFX1232_ITERATIVE:       ; %bb.0: ; %entry
6342; GFX1232_ITERATIVE-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0
6343; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s0, exec_lo
6344; GFX1232_ITERATIVE-NEXT:    s_mov_b64 s[6:7], 0
6345; GFX1232_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
6346; GFX1232_ITERATIVE-NEXT:  .LBB11_1: ; %ComputeLoop
6347; GFX1232_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
6348; GFX1232_ITERATIVE-NEXT:    s_wait_alu 0xfffe
6349; GFX1232_ITERATIVE-NEXT:    s_ctz_i32_b32 s1, s0
6350; GFX1232_ITERATIVE-NEXT:    s_wait_alu 0xfffe
6351; GFX1232_ITERATIVE-NEXT:    v_readlane_b32 s3, v3, s1
6352; GFX1232_ITERATIVE-NEXT:    v_readlane_b32 s2, v2, s1
6353; GFX1232_ITERATIVE-NEXT:    s_lshl_b32 s8, 1, s1
6354; GFX1232_ITERATIVE-NEXT:    v_writelane_b32 v1, s7, s1
6355; GFX1232_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s1
6356; GFX1232_ITERATIVE-NEXT:    s_and_not1_b32 s0, s0, s8
6357; GFX1232_ITERATIVE-NEXT:    s_add_nc_u64 s[6:7], s[6:7], s[2:3]
6358; GFX1232_ITERATIVE-NEXT:    s_wait_alu 0xfffe
6359; GFX1232_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
6360; GFX1232_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB11_1
6361; GFX1232_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
6362; GFX1232_ITERATIVE-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
6363; GFX1232_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
6364; GFX1232_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
6365; GFX1232_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
6366; GFX1232_ITERATIVE-NEXT:    ; implicit-def: $vgpr2_vgpr3
6367; GFX1232_ITERATIVE-NEXT:    s_and_saveexec_b32 s4, vcc_lo
6368; GFX1232_ITERATIVE-NEXT:    s_xor_b32 s4, exec_lo, s4
6369; GFX1232_ITERATIVE-NEXT:    s_cbranch_execz .LBB11_4
6370; GFX1232_ITERATIVE-NEXT:  ; %bb.3:
6371; GFX1232_ITERATIVE-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
6372; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
6373; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s10, -1
6374; GFX1232_ITERATIVE-NEXT:    s_wait_kmcnt 0x0
6375; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s8, s2
6376; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s9, s3
6377; GFX1232_ITERATIVE-NEXT:    buffer_atomic_sub_u64 v[2:3], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
6378; GFX1232_ITERATIVE-NEXT:    s_wait_loadcnt 0x0
6379; GFX1232_ITERATIVE-NEXT:    global_inv scope:SCOPE_DEV
6380; GFX1232_ITERATIVE-NEXT:  .LBB11_4:
6381; GFX1232_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s4
6382; GFX1232_ITERATIVE-NEXT:    s_wait_kmcnt 0x0
6383; GFX1232_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v2
6384; GFX1232_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v3
6385; GFX1232_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
6386; GFX1232_ITERATIVE-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v0
6387; GFX1232_ITERATIVE-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
6388; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
6389; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s2, -1
6390; GFX1232_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
6391; GFX1232_ITERATIVE-NEXT:    s_endpgm
6392;
6393; GFX7LESS_DPP-LABEL: sub_i64_varying:
6394; GFX7LESS_DPP:       ; %bb.0: ; %entry
6395; GFX7LESS_DPP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
6396; GFX7LESS_DPP-NEXT:    s_mov_b32 s7, 0xf000
6397; GFX7LESS_DPP-NEXT:    s_mov_b32 s6, -1
6398; GFX7LESS_DPP-NEXT:    v_mov_b32_e32 v1, 0
6399; GFX7LESS_DPP-NEXT:    s_mov_b32 s10, s6
6400; GFX7LESS_DPP-NEXT:    s_mov_b32 s11, s7
6401; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
6402; GFX7LESS_DPP-NEXT:    s_mov_b32 s8, s2
6403; GFX7LESS_DPP-NEXT:    s_mov_b32 s9, s3
6404; GFX7LESS_DPP-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
6405; GFX7LESS_DPP-NEXT:    s_waitcnt vmcnt(0)
6406; GFX7LESS_DPP-NEXT:    buffer_wbinvl1
6407; GFX7LESS_DPP-NEXT:    s_mov_b32 s4, s0
6408; GFX7LESS_DPP-NEXT:    s_mov_b32 s5, s1
6409; GFX7LESS_DPP-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
6410; GFX7LESS_DPP-NEXT:    s_endpgm
6411;
6412; GFX8_DPP-LABEL: sub_i64_varying:
6413; GFX8_DPP:       ; %bb.0: ; %entry
6414; GFX8_DPP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
6415; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
6416; GFX8_DPP-NEXT:    v_mov_b32_e32 v1, 0
6417; GFX8_DPP-NEXT:    s_mov_b64 exec, s[4:5]
6418; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v6, exec_lo, 0
6419; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v6, exec_hi, v6
6420; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
6421; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s[4:5]
6422; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, 0
6423; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, 0, s[4:5]
6424; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
6425; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
6426; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
6427; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf
6428; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, 0
6429; GFX8_DPP-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
6430; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
6431; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
6432; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
6433; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf
6434; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, 0
6435; GFX8_DPP-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
6436; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
6437; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
6438; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
6439; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
6440; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, 0
6441; GFX8_DPP-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
6442; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
6443; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
6444; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
6445; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf
6446; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, 0
6447; GFX8_DPP-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
6448; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
6449; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
6450; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
6451; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
6452; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, 0
6453; GFX8_DPP-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
6454; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
6455; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
6456; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
6457; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
6458; GFX8_DPP-NEXT:    v_addc_u32_e32 v4, vcc, v2, v4, vcc
6459; GFX8_DPP-NEXT:    v_mov_b32_e32 v2, 0
6460; GFX8_DPP-NEXT:    v_readlane_b32 s7, v4, 63
6461; GFX8_DPP-NEXT:    v_readlane_b32 s6, v3, 63
6462; GFX8_DPP-NEXT:    v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
6463; GFX8_DPP-NEXT:    v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
6464; GFX8_DPP-NEXT:    s_mov_b64 exec, s[4:5]
6465; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
6466; GFX8_DPP-NEXT:    ; implicit-def: $vgpr6_vgpr7
6467; GFX8_DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
6468; GFX8_DPP-NEXT:    s_cbranch_execz .LBB11_2
6469; GFX8_DPP-NEXT:  ; %bb.1:
6470; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, s6
6471; GFX8_DPP-NEXT:    s_mov_b32 s11, 0xf000
6472; GFX8_DPP-NEXT:    s_mov_b32 s10, -1
6473; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
6474; GFX8_DPP-NEXT:    s_mov_b32 s8, s2
6475; GFX8_DPP-NEXT:    s_mov_b32 s9, s3
6476; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, s7
6477; GFX8_DPP-NEXT:    buffer_atomic_sub_x2 v[6:7], off, s[8:11], 0 glc
6478; GFX8_DPP-NEXT:    s_waitcnt vmcnt(0)
6479; GFX8_DPP-NEXT:    buffer_wbinvl1_vol
6480; GFX8_DPP-NEXT:  .LBB11_2:
6481; GFX8_DPP-NEXT:    s_or_b64 exec, exec, s[4:5]
6482; GFX8_DPP-NEXT:    v_readfirstlane_b32 s4, v7
6483; GFX8_DPP-NEXT:    v_readfirstlane_b32 s5, v6
6484; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v1
6485; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, v2
6486; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, s4
6487; GFX8_DPP-NEXT:    v_sub_u32_e32 v6, vcc, s5, v6
6488; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
6489; GFX8_DPP-NEXT:    s_mov_b32 s3, 0xf000
6490; GFX8_DPP-NEXT:    s_mov_b32 s2, -1
6491; GFX8_DPP-NEXT:    v_subb_u32_e32 v7, vcc, v0, v7, vcc
6492; GFX8_DPP-NEXT:    buffer_store_dwordx2 v[6:7], off, s[0:3], 0
6493; GFX8_DPP-NEXT:    s_endpgm
6494;
6495; GFX9_DPP-LABEL: sub_i64_varying:
6496; GFX9_DPP:       ; %bb.0: ; %entry
6497; GFX9_DPP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
6498; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
6499; GFX9_DPP-NEXT:    v_mov_b32_e32 v1, 0
6500; GFX9_DPP-NEXT:    s_mov_b64 exec, s[4:5]
6501; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v6, exec_lo, 0
6502; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v6, exec_hi, v6
6503; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
6504; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s[4:5]
6505; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, 0
6506; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, 0, s[4:5]
6507; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
6508; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
6509; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
6510; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf
6511; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, 0
6512; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v2, vcc, v2, v4, vcc
6513; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
6514; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
6515; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
6516; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf
6517; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, 0
6518; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v2, vcc, v2, v4, vcc
6519; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
6520; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
6521; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
6522; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
6523; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, 0
6524; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v2, vcc, v2, v4, vcc
6525; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
6526; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
6527; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
6528; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf
6529; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, 0
6530; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v2, vcc, v2, v4, vcc
6531; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
6532; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
6533; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
6534; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
6535; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, 0
6536; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v2, vcc, v2, v4, vcc
6537; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
6538; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
6539; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
6540; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
6541; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v4, vcc, v2, v4, vcc
6542; GFX9_DPP-NEXT:    v_mov_b32_e32 v2, 0
6543; GFX9_DPP-NEXT:    v_readlane_b32 s7, v4, 63
6544; GFX9_DPP-NEXT:    v_readlane_b32 s6, v3, 63
6545; GFX9_DPP-NEXT:    v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
6546; GFX9_DPP-NEXT:    v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
6547; GFX9_DPP-NEXT:    s_mov_b64 exec, s[4:5]
6548; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
6549; GFX9_DPP-NEXT:    ; implicit-def: $vgpr6_vgpr7
6550; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
6551; GFX9_DPP-NEXT:    s_cbranch_execz .LBB11_2
6552; GFX9_DPP-NEXT:  ; %bb.1:
6553; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, s6
6554; GFX9_DPP-NEXT:    s_mov_b32 s11, 0xf000
6555; GFX9_DPP-NEXT:    s_mov_b32 s10, -1
6556; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
6557; GFX9_DPP-NEXT:    s_mov_b32 s8, s2
6558; GFX9_DPP-NEXT:    s_mov_b32 s9, s3
6559; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, s7
6560; GFX9_DPP-NEXT:    buffer_atomic_sub_x2 v[6:7], off, s[8:11], 0 glc
6561; GFX9_DPP-NEXT:    s_waitcnt vmcnt(0)
6562; GFX9_DPP-NEXT:    buffer_wbinvl1_vol
6563; GFX9_DPP-NEXT:  .LBB11_2:
6564; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[4:5]
6565; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v7
6566; GFX9_DPP-NEXT:    v_readfirstlane_b32 s5, v6
6567; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v1
6568; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, v2
6569; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s4
6570; GFX9_DPP-NEXT:    v_sub_co_u32_e32 v6, vcc, s5, v6
6571; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
6572; GFX9_DPP-NEXT:    s_mov_b32 s3, 0xf000
6573; GFX9_DPP-NEXT:    s_mov_b32 s2, -1
6574; GFX9_DPP-NEXT:    v_subb_co_u32_e32 v7, vcc, v0, v7, vcc
6575; GFX9_DPP-NEXT:    buffer_store_dwordx2 v[6:7], off, s[0:3], 0
6576; GFX9_DPP-NEXT:    s_endpgm
6577;
6578; GFX1064_DPP-LABEL: sub_i64_varying:
6579; GFX1064_DPP:       ; %bb.0: ; %entry
6580; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
6581; GFX1064_DPP-NEXT:    v_mov_b32_e32 v1, 0
6582; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[0:1]
6583; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, 0, s[0:1]
6584; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, 0
6585; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, 0
6586; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, 0
6587; GFX1064_DPP-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
6588; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, 0
6589; GFX1064_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
6590; GFX1064_DPP-NEXT:    v_add_co_u32 v1, vcc, v2, v1
6591; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc, v3, v4, vcc
6592; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, 0
6593; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf
6594; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, 0
6595; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:2 row_mask:0xf bank_mask:0xf
6596; GFX1064_DPP-NEXT:    v_add_co_u32 v1, vcc, v1, v6
6597; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc
6598; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, 0
6599; GFX1064_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:4 row_mask:0xf bank_mask:0xf
6600; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, 0
6601; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf
6602; GFX1064_DPP-NEXT:    v_add_co_u32 v1, vcc, v1, v4
6603; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc, v2, v3, vcc
6604; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, 0
6605; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf
6606; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:8 row_mask:0xf bank_mask:0xf
6607; GFX1064_DPP-NEXT:    v_add_co_u32 v1, vcc, v1, v6
6608; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc
6609; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, 0
6610; GFX1064_DPP-NEXT:    v_permlanex16_b32 v4, v1, -1, -1
6611; GFX1064_DPP-NEXT:    v_permlanex16_b32 v6, v2, -1, -1
6612; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6613; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6614; GFX1064_DPP-NEXT:    v_add_co_u32 v1, vcc, v1, v3
6615; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc
6616; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, 0
6617; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 31
6618; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, 0
6619; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v2, 31
6620; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, s2
6621; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, s3
6622; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
6623; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
6624; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, 0
6625; GFX1064_DPP-NEXT:    v_add_co_u32 v1, vcc, v1, v3
6626; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc
6627; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
6628; GFX1064_DPP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
6629; GFX1064_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6630; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
6631; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6632; GFX1064_DPP-NEXT:    v_mov_b32_dpp v7, v2 row_shr:1 row_mask:0xf bank_mask:0xf
6633; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v2, 15
6634; GFX1064_DPP-NEXT:    v_readlane_b32 s7, v1, 15
6635; GFX1064_DPP-NEXT:    v_readlane_b32 s8, v2, 31
6636; GFX1064_DPP-NEXT:    v_readlane_b32 s9, v1, 31
6637; GFX1064_DPP-NEXT:    v_readlane_b32 s10, v1, 47
6638; GFX1064_DPP-NEXT:    v_writelane_b32 v7, s6, 16
6639; GFX1064_DPP-NEXT:    v_writelane_b32 v6, s7, 16
6640; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v1, 63
6641; GFX1064_DPP-NEXT:    v_readlane_b32 s11, v2, 47
6642; GFX1064_DPP-NEXT:    v_readlane_b32 s7, v2, 63
6643; GFX1064_DPP-NEXT:    v_writelane_b32 v7, s8, 32
6644; GFX1064_DPP-NEXT:    v_writelane_b32 v6, s9, 32
6645; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[4:5]
6646; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6647; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[8:9], -1
6648; GFX1064_DPP-NEXT:    s_mov_b64 s[4:5], s[6:7]
6649; GFX1064_DPP-NEXT:    v_writelane_b32 v7, s11, 48
6650; GFX1064_DPP-NEXT:    v_writelane_b32 v6, s10, 48
6651; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[8:9]
6652; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6653; GFX1064_DPP-NEXT:    s_mov_b32 s6, -1
6654; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr8_vgpr9
6655; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[8:9], vcc
6656; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB11_2
6657; GFX1064_DPP-NEXT:  ; %bb.1:
6658; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, s5
6659; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, s4
6660; GFX1064_DPP-NEXT:    s_mov_b32 s7, 0x31016000
6661; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
6662; GFX1064_DPP-NEXT:    s_mov_b32 s4, s2
6663; GFX1064_DPP-NEXT:    s_mov_b32 s5, s3
6664; GFX1064_DPP-NEXT:    buffer_atomic_sub_x2 v[8:9], off, s[4:7], 0 glc
6665; GFX1064_DPP-NEXT:    s_waitcnt vmcnt(0)
6666; GFX1064_DPP-NEXT:    buffer_gl1_inv
6667; GFX1064_DPP-NEXT:    buffer_gl0_inv
6668; GFX1064_DPP-NEXT:  .LBB11_2:
6669; GFX1064_DPP-NEXT:    s_waitcnt_depctr 0xffe3
6670; GFX1064_DPP-NEXT:    s_or_b64 exec, exec, s[8:9]
6671; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
6672; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s2, v8
6673; GFX1064_DPP-NEXT:    v_mov_b32_e32 v10, v6
6674; GFX1064_DPP-NEXT:    v_mov_b32_e32 v11, v7
6675; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s3, v9
6676; GFX1064_DPP-NEXT:    v_sub_co_u32 v8, vcc, s2, v10
6677; GFX1064_DPP-NEXT:    s_mov_b32 s2, s6
6678; GFX1064_DPP-NEXT:    v_sub_co_ci_u32_e32 v9, vcc, s3, v11, vcc
6679; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
6680; GFX1064_DPP-NEXT:    buffer_store_dwordx2 v[8:9], off, s[0:3], 0
6681; GFX1064_DPP-NEXT:    s_endpgm
6682;
6683; GFX1032_DPP-LABEL: sub_i64_varying:
6684; GFX1032_DPP:       ; %bb.0: ; %entry
6685; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
6686; GFX1032_DPP-NEXT:    v_mov_b32_e32 v1, 0
6687; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s0
6688; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, 0, s0
6689; GFX1032_DPP-NEXT:    v_mov_b32_e32 v4, 0
6690; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, 0
6691; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, 0
6692; GFX1032_DPP-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
6693; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, 0
6694; GFX1032_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
6695; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, 0
6696; GFX1032_DPP-NEXT:    v_add_co_u32 v1, vcc_lo, v2, v1
6697; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v3, v4, vcc_lo
6698; GFX1032_DPP-NEXT:    v_mov_b32_e32 v4, 0
6699; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf
6700; GFX1032_DPP-NEXT:    v_mov_b32_e32 v3, 0
6701; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:2 row_mask:0xf bank_mask:0xf
6702; GFX1032_DPP-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v6
6703; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo
6704; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, 0
6705; GFX1032_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:4 row_mask:0xf bank_mask:0xf
6706; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, 0
6707; GFX1032_DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf
6708; GFX1032_DPP-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v4
6709; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v3, vcc_lo
6710; GFX1032_DPP-NEXT:    v_mov_b32_e32 v3, 0
6711; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf
6712; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:8 row_mask:0xf bank_mask:0xf
6713; GFX1032_DPP-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v6
6714; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo
6715; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, 0
6716; GFX1032_DPP-NEXT:    v_permlanex16_b32 v4, v1, -1, -1
6717; GFX1032_DPP-NEXT:    v_permlanex16_b32 v6, v2, -1, -1
6718; GFX1032_DPP-NEXT:    v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6719; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6720; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
6721; GFX1032_DPP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
6722; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s6, -1
6723; GFX1032_DPP-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v3
6724; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo
6725; GFX1032_DPP-NEXT:    v_readlane_b32 s4, v1, 31
6726; GFX1032_DPP-NEXT:    v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6727; GFX1032_DPP-NEXT:    v_readlane_b32 s8, v2, 15
6728; GFX1032_DPP-NEXT:    v_readlane_b32 s5, v2, 31
6729; GFX1032_DPP-NEXT:    v_mov_b32_dpp v8, v2 row_shr:1 row_mask:0xf bank_mask:0xf
6730; GFX1032_DPP-NEXT:    v_readlane_b32 s7, v1, 15
6731; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s6
6732; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6733; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s6, -1
6734; GFX1032_DPP-NEXT:    v_writelane_b32 v8, s8, 16
6735; GFX1032_DPP-NEXT:    v_writelane_b32 v7, s7, 16
6736; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s6
6737; GFX1032_DPP-NEXT:    s_mov_b32 s6, -1
6738; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
6739; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
6740; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s8, vcc_lo
6741; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB11_2
6742; GFX1032_DPP-NEXT:  ; %bb.1:
6743; GFX1032_DPP-NEXT:    v_mov_b32_e32 v10, s5
6744; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, s4
6745; GFX1032_DPP-NEXT:    s_mov_b32 s7, 0x31016000
6746; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
6747; GFX1032_DPP-NEXT:    s_mov_b32 s4, s2
6748; GFX1032_DPP-NEXT:    s_mov_b32 s5, s3
6749; GFX1032_DPP-NEXT:    buffer_atomic_sub_x2 v[9:10], off, s[4:7], 0 glc
6750; GFX1032_DPP-NEXT:    s_waitcnt vmcnt(0)
6751; GFX1032_DPP-NEXT:    buffer_gl1_inv
6752; GFX1032_DPP-NEXT:    buffer_gl0_inv
6753; GFX1032_DPP-NEXT:  .LBB11_2:
6754; GFX1032_DPP-NEXT:    s_waitcnt_depctr 0xffe3
6755; GFX1032_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s8
6756; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
6757; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s2, v9
6758; GFX1032_DPP-NEXT:    v_mov_b32_e32 v11, v7
6759; GFX1032_DPP-NEXT:    v_mov_b32_e32 v12, v8
6760; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s3, v10
6761; GFX1032_DPP-NEXT:    v_sub_co_u32 v9, vcc_lo, s2, v11
6762; GFX1032_DPP-NEXT:    s_mov_b32 s2, s6
6763; GFX1032_DPP-NEXT:    v_sub_co_ci_u32_e32 v10, vcc_lo, s3, v12, vcc_lo
6764; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
6765; GFX1032_DPP-NEXT:    buffer_store_dwordx2 v[9:10], off, s[0:3], 0
6766; GFX1032_DPP-NEXT:    s_endpgm
6767;
6768; GFX1164_DPP-LABEL: sub_i64_varying:
6769; GFX1164_DPP:       ; %bb.0: ; %entry
6770; GFX1164_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
6771; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
6772; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
6773; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[0:1]
6774; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, 0
6775; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s[0:1]
6776; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, 0
6777; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
6778; GFX1164_DPP-NEXT:    v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6779; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
6780; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
6781; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc
6782; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
6783; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, 0
6784; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
6785; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf
6786; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc
6787; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
6788; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
6789; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, 0
6790; GFX1164_DPP-NEXT:    v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf
6791; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
6792; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc
6793; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
6794; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, 0
6795; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
6796; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf
6797; GFX1164_DPP-NEXT:    v_permlanex16_b32 v5, v2, -1, -1
6798; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
6799; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc
6800; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6801; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, 0
6802; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
6803; GFX1164_DPP-NEXT:    v_permlanex16_b32 v4, v1, -1, -1
6804; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6805; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
6806; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc
6807; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, 0
6808; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v1, 31
6809; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
6810; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, s2
6811; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v2, 31
6812; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
6813; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
6814; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
6815; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, 0
6816; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc
6817; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
6818; GFX1164_DPP-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
6819; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6820; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
6821; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf
6822; GFX1164_DPP-NEXT:    v_readlane_b32 s6, v2, 15
6823; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6824; GFX1164_DPP-NEXT:    v_readlane_b32 s7, v1, 15
6825; GFX1164_DPP-NEXT:    v_readlane_b32 s8, v2, 31
6826; GFX1164_DPP-NEXT:    v_readlane_b32 s9, v1, 31
6827; GFX1164_DPP-NEXT:    v_writelane_b32 v4, s6, 16
6828; GFX1164_DPP-NEXT:    v_readlane_b32 s6, v2, 63
6829; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s7, 16
6830; GFX1164_DPP-NEXT:    v_readlane_b32 s10, v2, 47
6831; GFX1164_DPP-NEXT:    v_readlane_b32 s11, v1, 47
6832; GFX1164_DPP-NEXT:    v_readlane_b32 s7, v1, 63
6833; GFX1164_DPP-NEXT:    v_writelane_b32 v4, s8, 32
6834; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s9, 32
6835; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[4:5]
6836; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
6837; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6838; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[8:9], -1
6839; GFX1164_DPP-NEXT:    s_mov_b64 s[4:5], s[6:7]
6840; GFX1164_DPP-NEXT:    v_writelane_b32 v4, s10, 48
6841; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s11, 48
6842; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[8:9]
6843; GFX1164_DPP-NEXT:    s_mov_b32 s6, -1
6844; GFX1164_DPP-NEXT:    s_mov_b64 s[8:9], exec
6845; GFX1164_DPP-NEXT:    ; implicit-def: $vgpr6_vgpr7
6846; GFX1164_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
6847; GFX1164_DPP-NEXT:    s_cbranch_execz .LBB11_2
6848; GFX1164_DPP-NEXT:  ; %bb.1:
6849; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, s5
6850; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, s4
6851; GFX1164_DPP-NEXT:    s_mov_b32 s7, 0x31016000
6852; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
6853; GFX1164_DPP-NEXT:    s_mov_b32 s4, s2
6854; GFX1164_DPP-NEXT:    s_mov_b32 s5, s3
6855; GFX1164_DPP-NEXT:    buffer_atomic_sub_u64 v[6:7], off, s[4:7], 0 glc
6856; GFX1164_DPP-NEXT:    s_waitcnt vmcnt(0)
6857; GFX1164_DPP-NEXT:    buffer_gl1_inv
6858; GFX1164_DPP-NEXT:    buffer_gl0_inv
6859; GFX1164_DPP-NEXT:  .LBB11_2:
6860; GFX1164_DPP-NEXT:    s_or_b64 exec, exec, s[8:9]
6861; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
6862; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s2, v6
6863; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, v4
6864; GFX1164_DPP-NEXT:    v_mov_b32_e32 v9, v5
6865; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s3, v7
6866; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
6867; GFX1164_DPP-NEXT:    v_sub_co_u32 v6, vcc, s2, v8
6868; GFX1164_DPP-NEXT:    s_mov_b32 s2, s6
6869; GFX1164_DPP-NEXT:    v_sub_co_ci_u32_e32 v7, vcc, s3, v9, vcc
6870; GFX1164_DPP-NEXT:    s_mov_b32 s3, 0x31016000
6871; GFX1164_DPP-NEXT:    buffer_store_b64 v[6:7], off, s[0:3], 0
6872; GFX1164_DPP-NEXT:    s_endpgm
6873;
6874; GFX1132_DPP-LABEL: sub_i64_varying:
6875; GFX1132_DPP:       ; %bb.0: ; %entry
6876; GFX1132_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
6877; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
6878; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
6879; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s0
6880; GFX1132_DPP-NEXT:    v_mov_b32_e32 v2, 0
6881; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s0
6882; GFX1132_DPP-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v7, 0
6883; GFX1132_DPP-NEXT:    v_mov_b32_e32 v6, 0
6884; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
6885; GFX1132_DPP-NEXT:    v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6886; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
6887; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
6888; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
6889; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
6890; GFX1132_DPP-NEXT:    v_mov_b32_e32 v2, 0
6891; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
6892; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf
6893; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
6894; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
6895; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
6896; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, 0
6897; GFX1132_DPP-NEXT:    v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf
6898; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
6899; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
6900; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v2, vcc_lo, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
6901; GFX1132_DPP-NEXT:    v_mov_b32_e32 v3, 0
6902; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
6903; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf
6904; GFX1132_DPP-NEXT:    v_permlanex16_b32 v5, v2, -1, -1
6905; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
6906; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
6907; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v2, vcc_lo, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6908; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
6909; GFX1132_DPP-NEXT:    v_permlanex16_b32 v4, v1, -1, -1
6910; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6911; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
6912; GFX1132_DPP-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
6913; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s6, -1
6914; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
6915; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
6916; GFX1132_DPP-NEXT:    v_readlane_b32 s4, v2, 31
6917; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
6918; GFX1132_DPP-NEXT:    v_readlane_b32 s7, v2, 15
6919; GFX1132_DPP-NEXT:    v_readlane_b32 s8, v1, 15
6920; GFX1132_DPP-NEXT:    v_readlane_b32 s5, v1, 31
6921; GFX1132_DPP-NEXT:    v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6922; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s6
6923; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
6924; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6925; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s6, -1
6926; GFX1132_DPP-NEXT:    v_writelane_b32 v6, s7, 16
6927; GFX1132_DPP-NEXT:    v_writelane_b32 v7, s8, 16
6928; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s6
6929; GFX1132_DPP-NEXT:    s_mov_b32 s6, -1
6930; GFX1132_DPP-NEXT:    s_mov_b32 s8, exec_lo
6931; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr8_vgpr9
6932; GFX1132_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
6933; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB11_2
6934; GFX1132_DPP-NEXT:  ; %bb.1:
6935; GFX1132_DPP-NEXT:    v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4
6936; GFX1132_DPP-NEXT:    s_mov_b32 s7, 0x31016000
6937; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
6938; GFX1132_DPP-NEXT:    s_mov_b32 s4, s2
6939; GFX1132_DPP-NEXT:    s_mov_b32 s5, s3
6940; GFX1132_DPP-NEXT:    buffer_atomic_sub_u64 v[8:9], off, s[4:7], 0 glc
6941; GFX1132_DPP-NEXT:    s_waitcnt vmcnt(0)
6942; GFX1132_DPP-NEXT:    buffer_gl1_inv
6943; GFX1132_DPP-NEXT:    buffer_gl0_inv
6944; GFX1132_DPP-NEXT:  .LBB11_2:
6945; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s8
6946; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
6947; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s2, v8
6948; GFX1132_DPP-NEXT:    v_mov_b32_e32 v10, v6
6949; GFX1132_DPP-NEXT:    v_mov_b32_e32 v11, v7
6950; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s3, v9
6951; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
6952; GFX1132_DPP-NEXT:    v_sub_co_u32 v8, vcc_lo, s2, v10
6953; GFX1132_DPP-NEXT:    s_mov_b32 s2, s6
6954; GFX1132_DPP-NEXT:    v_sub_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo
6955; GFX1132_DPP-NEXT:    s_mov_b32 s3, 0x31016000
6956; GFX1132_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[0:3], 0
6957; GFX1132_DPP-NEXT:    s_endpgm
6958;
6959; GFX1264_DPP-LABEL: sub_i64_varying:
6960; GFX1264_DPP:       ; %bb.0: ; %entry
6961; GFX1264_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
6962; GFX1264_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
6963; GFX1264_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
6964; GFX1264_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[0:1]
6965; GFX1264_DPP-NEXT:    v_mov_b32_e32 v2, 0
6966; GFX1264_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s[0:1]
6967; GFX1264_DPP-NEXT:    v_mov_b32_e32 v4, 0
6968; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
6969; GFX1264_DPP-NEXT:    v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6970; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
6971; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
6972; GFX1264_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc
6973; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
6974; GFX1264_DPP-NEXT:    v_mov_b32_e32 v2, 0
6975; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
6976; GFX1264_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf
6977; GFX1264_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc
6978; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
6979; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
6980; GFX1264_DPP-NEXT:    v_mov_b32_e32 v4, 0
6981; GFX1264_DPP-NEXT:    v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf
6982; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
6983; GFX1264_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc
6984; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
6985; GFX1264_DPP-NEXT:    v_mov_b32_e32 v3, 0
6986; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
6987; GFX1264_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf
6988; GFX1264_DPP-NEXT:    v_permlanex16_b32 v5, v2, -1, -1
6989; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
6990; GFX1264_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc
6991; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6992; GFX1264_DPP-NEXT:    v_mov_b32_e32 v5, 0
6993; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
6994; GFX1264_DPP-NEXT:    v_permlanex16_b32 v4, v1, -1, -1
6995; GFX1264_DPP-NEXT:    v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6996; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
6997; GFX1264_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc
6998; GFX1264_DPP-NEXT:    v_mov_b32_e32 v3, 0
6999; GFX1264_DPP-NEXT:    v_readlane_b32 s2, v1, 31
7000; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
7001; GFX1264_DPP-NEXT:    v_mov_b32_e32 v4, s2
7002; GFX1264_DPP-NEXT:    v_readlane_b32 s2, v2, 31
7003; GFX1264_DPP-NEXT:    v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
7004; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
7005; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
7006; GFX1264_DPP-NEXT:    v_mov_b32_e32 v4, 0
7007; GFX1264_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc
7008; GFX1264_DPP-NEXT:    s_mov_b64 exec, s[0:1]
7009; GFX1264_DPP-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
7010; GFX1264_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
7011; GFX1264_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
7012; GFX1264_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf
7013; GFX1264_DPP-NEXT:    v_readlane_b32 s6, v2, 15
7014; GFX1264_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
7015; GFX1264_DPP-NEXT:    v_readlane_b32 s7, v1, 15
7016; GFX1264_DPP-NEXT:    v_readlane_b32 s8, v2, 31
7017; GFX1264_DPP-NEXT:    v_readlane_b32 s9, v1, 31
7018; GFX1264_DPP-NEXT:    v_writelane_b32 v4, s6, 16
7019; GFX1264_DPP-NEXT:    v_readlane_b32 s6, v2, 63
7020; GFX1264_DPP-NEXT:    v_writelane_b32 v5, s7, 16
7021; GFX1264_DPP-NEXT:    v_readlane_b32 s10, v2, 47
7022; GFX1264_DPP-NEXT:    v_readlane_b32 s11, v1, 47
7023; GFX1264_DPP-NEXT:    v_readlane_b32 s7, v1, 63
7024; GFX1264_DPP-NEXT:    v_writelane_b32 v4, s8, 32
7025; GFX1264_DPP-NEXT:    v_writelane_b32 v5, s9, 32
7026; GFX1264_DPP-NEXT:    s_mov_b64 exec, s[4:5]
7027; GFX1264_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
7028; GFX1264_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
7029; GFX1264_DPP-NEXT:    s_or_saveexec_b64 s[8:9], -1
7030; GFX1264_DPP-NEXT:    s_mov_b64 s[4:5], s[6:7]
7031; GFX1264_DPP-NEXT:    v_writelane_b32 v4, s10, 48
7032; GFX1264_DPP-NEXT:    v_writelane_b32 v5, s11, 48
7033; GFX1264_DPP-NEXT:    s_wait_alu 0xfffe
7034; GFX1264_DPP-NEXT:    s_mov_b64 exec, s[8:9]
7035; GFX1264_DPP-NEXT:    s_mov_b32 s6, -1
7036; GFX1264_DPP-NEXT:    s_mov_b64 s[8:9], exec
7037; GFX1264_DPP-NEXT:    ; implicit-def: $vgpr6_vgpr7
7038; GFX1264_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
7039; GFX1264_DPP-NEXT:    s_cbranch_execz .LBB11_2
7040; GFX1264_DPP-NEXT:  ; %bb.1:
7041; GFX1264_DPP-NEXT:    v_mov_b32_e32 v7, s5
7042; GFX1264_DPP-NEXT:    v_mov_b32_e32 v6, s4
7043; GFX1264_DPP-NEXT:    s_mov_b32 s7, 0x31016000
7044; GFX1264_DPP-NEXT:    s_wait_kmcnt 0x0
7045; GFX1264_DPP-NEXT:    s_mov_b32 s4, s2
7046; GFX1264_DPP-NEXT:    s_mov_b32 s5, s3
7047; GFX1264_DPP-NEXT:    buffer_atomic_sub_u64 v[6:7], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
7048; GFX1264_DPP-NEXT:    s_wait_loadcnt 0x0
7049; GFX1264_DPP-NEXT:    global_inv scope:SCOPE_DEV
7050; GFX1264_DPP-NEXT:  .LBB11_2:
7051; GFX1264_DPP-NEXT:    s_wait_alu 0xfffe
7052; GFX1264_DPP-NEXT:    s_or_b64 exec, exec, s[8:9]
7053; GFX1264_DPP-NEXT:    s_wait_kmcnt 0x0
7054; GFX1264_DPP-NEXT:    v_readfirstlane_b32 s2, v6
7055; GFX1264_DPP-NEXT:    v_mov_b32_e32 v8, v4
7056; GFX1264_DPP-NEXT:    v_mov_b32_e32 v9, v5
7057; GFX1264_DPP-NEXT:    v_readfirstlane_b32 s3, v7
7058; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
7059; GFX1264_DPP-NEXT:    v_sub_co_u32 v6, vcc, s2, v8
7060; GFX1264_DPP-NEXT:    s_mov_b32 s2, s6
7061; GFX1264_DPP-NEXT:    v_sub_co_ci_u32_e32 v7, vcc, s3, v9, vcc
7062; GFX1264_DPP-NEXT:    s_mov_b32 s3, 0x31016000
7063; GFX1264_DPP-NEXT:    buffer_store_b64 v[6:7], off, s[0:3], null
7064; GFX1264_DPP-NEXT:    s_endpgm
7065;
7066; GFX1232_DPP-LABEL: sub_i64_varying:
7067; GFX1232_DPP:       ; %bb.0: ; %entry
7068; GFX1232_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
7069; GFX1232_DPP-NEXT:    s_or_saveexec_b32 s0, -1
7070; GFX1232_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
7071; GFX1232_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s0
7072; GFX1232_DPP-NEXT:    v_mov_b32_e32 v2, 0
7073; GFX1232_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s0
7074; GFX1232_DPP-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v7, 0
7075; GFX1232_DPP-NEXT:    v_mov_b32_e32 v6, 0
7076; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
7077; GFX1232_DPP-NEXT:    v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf
7078; GFX1232_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
7079; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
7080; GFX1232_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
7081; GFX1232_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
7082; GFX1232_DPP-NEXT:    v_mov_b32_e32 v2, 0
7083; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
7084; GFX1232_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf
7085; GFX1232_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
7086; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
7087; GFX1232_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
7088; GFX1232_DPP-NEXT:    v_mov_b32_e32 v4, 0
7089; GFX1232_DPP-NEXT:    v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf
7090; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
7091; GFX1232_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
7092; GFX1232_DPP-NEXT:    v_add_co_u32_e64_dpp v2, vcc_lo, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
7093; GFX1232_DPP-NEXT:    v_mov_b32_e32 v3, 0
7094; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
7095; GFX1232_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf
7096; GFX1232_DPP-NEXT:    v_permlanex16_b32 v5, v2, -1, -1
7097; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
7098; GFX1232_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
7099; GFX1232_DPP-NEXT:    v_add_co_u32_e64_dpp v2, vcc_lo, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
7100; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
7101; GFX1232_DPP-NEXT:    v_permlanex16_b32 v4, v1, -1, -1
7102; GFX1232_DPP-NEXT:    v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
7103; GFX1232_DPP-NEXT:    s_mov_b32 exec_lo, s0
7104; GFX1232_DPP-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
7105; GFX1232_DPP-NEXT:    s_or_saveexec_b32 s6, -1
7106; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
7107; GFX1232_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
7108; GFX1232_DPP-NEXT:    v_readlane_b32 s4, v2, 31
7109; GFX1232_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
7110; GFX1232_DPP-NEXT:    v_readlane_b32 s7, v2, 15
7111; GFX1232_DPP-NEXT:    v_readlane_b32 s8, v1, 15
7112; GFX1232_DPP-NEXT:    v_readlane_b32 s5, v1, 31
7113; GFX1232_DPP-NEXT:    v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf
7114; GFX1232_DPP-NEXT:    s_mov_b32 exec_lo, s6
7115; GFX1232_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
7116; GFX1232_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
7117; GFX1232_DPP-NEXT:    s_or_saveexec_b32 s6, -1
7118; GFX1232_DPP-NEXT:    v_writelane_b32 v6, s7, 16
7119; GFX1232_DPP-NEXT:    v_writelane_b32 v7, s8, 16
7120; GFX1232_DPP-NEXT:    s_wait_alu 0xfffe
7121; GFX1232_DPP-NEXT:    s_mov_b32 exec_lo, s6
7122; GFX1232_DPP-NEXT:    s_mov_b32 s6, -1
7123; GFX1232_DPP-NEXT:    s_mov_b32 s8, exec_lo
7124; GFX1232_DPP-NEXT:    ; implicit-def: $vgpr8_vgpr9
7125; GFX1232_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
7126; GFX1232_DPP-NEXT:    s_cbranch_execz .LBB11_2
7127; GFX1232_DPP-NEXT:  ; %bb.1:
7128; GFX1232_DPP-NEXT:    v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4
7129; GFX1232_DPP-NEXT:    s_mov_b32 s7, 0x31016000
7130; GFX1232_DPP-NEXT:    s_wait_kmcnt 0x0
7131; GFX1232_DPP-NEXT:    s_mov_b32 s4, s2
7132; GFX1232_DPP-NEXT:    s_mov_b32 s5, s3
7133; GFX1232_DPP-NEXT:    buffer_atomic_sub_u64 v[8:9], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
7134; GFX1232_DPP-NEXT:    s_wait_loadcnt 0x0
7135; GFX1232_DPP-NEXT:    global_inv scope:SCOPE_DEV
7136; GFX1232_DPP-NEXT:  .LBB11_2:
7137; GFX1232_DPP-NEXT:    s_wait_alu 0xfffe
7138; GFX1232_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s8
7139; GFX1232_DPP-NEXT:    s_wait_kmcnt 0x0
7140; GFX1232_DPP-NEXT:    v_readfirstlane_b32 s2, v8
7141; GFX1232_DPP-NEXT:    v_mov_b32_e32 v10, v6
7142; GFX1232_DPP-NEXT:    v_mov_b32_e32 v11, v7
7143; GFX1232_DPP-NEXT:    v_readfirstlane_b32 s3, v9
7144; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
7145; GFX1232_DPP-NEXT:    v_sub_co_u32 v8, vcc_lo, s2, v10
7146; GFX1232_DPP-NEXT:    s_mov_b32 s2, s6
7147; GFX1232_DPP-NEXT:    v_sub_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo
7148; GFX1232_DPP-NEXT:    s_mov_b32 s3, 0x31016000
7149; GFX1232_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[0:3], null
7150; GFX1232_DPP-NEXT:    s_endpgm
7151entry:
7152  %lane = call i32 @llvm.amdgcn.workitem.id.x()
7153  %zext = zext i32 %lane to i64
7154  %old = atomicrmw sub ptr addrspace(1) %inout, i64 %zext syncscope("agent") acq_rel
7155  store i64 %old, ptr addrspace(1) %out
7156  ret void
7157}
7158