xref: /llvm-project/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll (revision 5a3299a684d7d8c40f48d732e5b80a8bd29aa882)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn - -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_ITERATIVE %s
3; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_ITERATIVE %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_ITERATIVE %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_ITERATIVE %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_ITERATIVE %s
7; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_ITERATIVE %s
8; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_ITERATIVE %s
9; RUN: llc -mtriple=amdgcn - -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_DPP %s
10; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_DPP %s
11; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_DPP %s
12; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_DPP %s
13; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_DPP %s
14; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_DPP %s
15; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_DPP %s
16
; Declaration of the AMDGPU work-item id intrinsic for the x dimension; it
; takes no arguments and returns an i32.
17declare i32 @llvm.amdgcn.workitem.id.x()
18
; Globals placed in addrspace(3) with undef initializers. @local_var32 is the
; memory operand of the i32 atomicrmw tests that follow.
; NOTE(review): @local_var64 is presumably the operand of i64 variants further
; down the file - confirm against the rest of the test.
19@local_var32 = addrspace(3) global i32 undef, align 4
20@local_var64 = addrspace(3) global i64 undef, align 8
21
22; Show what the atomic optimization pass will do for local pointers.
23
; Kernel under test: performs an acq_rel `atomicrmw add` of the constant 5 on
; @local_var32 and stores the value returned by the atomic to %out.
;
; For every target, the autogenerated assertions below expect:
;   * a single ds_add_rtn_u32, placed in a block that is entered only where
;     the v_mbcnt result compares equal to 0;
;   * the add operand formed as s_bcnt1 of the saved exec mask times 5;
;   * after exec is restored, the stored value computed by v_mad_u32_u24 from
;     the v_mbcnt result, the constant 5, and v_readfirstlane of the value
;     returned by the DS atomic.
; Do not edit the assertion lines by hand; regenerate them with
; utils/update_llc_test_checks.py.
24define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
25; GFX7LESS-LABEL: add_i32_constant:
26; GFX7LESS:       ; %bb.0: ; %entry
27; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
28; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
29; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
30; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
31; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
32; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
33; GFX7LESS-NEXT:    s_cbranch_execz .LBB0_2
34; GFX7LESS-NEXT:  ; %bb.1:
35; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
36; GFX7LESS-NEXT:    s_mul_i32 s2, s2, 5
37; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
38; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
39; GFX7LESS-NEXT:    s_mov_b32 m0, -1
40; GFX7LESS-NEXT:    ds_add_rtn_u32 v1, v1, v2
41; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
42; GFX7LESS-NEXT:  .LBB0_2:
43; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
44; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
45; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
46; GFX7LESS-NEXT:    s_mov_b32 s2, -1
47; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
48; GFX7LESS-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
49; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
50; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
51; GFX7LESS-NEXT:    s_endpgm
52;
53; GFX8-LABEL: add_i32_constant:
54; GFX8:       ; %bb.0: ; %entry
55; GFX8-NEXT:    s_mov_b64 s[2:3], exec
56; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
57; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
58; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
59; GFX8-NEXT:    ; implicit-def: $vgpr1
60; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
61; GFX8-NEXT:    s_cbranch_execz .LBB0_2
62; GFX8-NEXT:  ; %bb.1:
63; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
64; GFX8-NEXT:    s_mul_i32 s2, s2, 5
65; GFX8-NEXT:    v_mov_b32_e32 v1, 0
66; GFX8-NEXT:    v_mov_b32_e32 v2, s2
67; GFX8-NEXT:    s_mov_b32 m0, -1
68; GFX8-NEXT:    ds_add_rtn_u32 v1, v1, v2
69; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
70; GFX8-NEXT:  .LBB0_2:
71; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
72; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
73; GFX8-NEXT:    v_readfirstlane_b32 s4, v1
74; GFX8-NEXT:    s_mov_b32 s3, 0xf000
75; GFX8-NEXT:    s_mov_b32 s2, -1
76; GFX8-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
77; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
78; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
79; GFX8-NEXT:    s_endpgm
80;
81; GFX9-LABEL: add_i32_constant:
82; GFX9:       ; %bb.0: ; %entry
83; GFX9-NEXT:    s_mov_b64 s[2:3], exec
84; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
85; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
86; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
87; GFX9-NEXT:    ; implicit-def: $vgpr1
88; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
89; GFX9-NEXT:    s_cbranch_execz .LBB0_2
90; GFX9-NEXT:  ; %bb.1:
91; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
92; GFX9-NEXT:    s_mul_i32 s2, s2, 5
93; GFX9-NEXT:    v_mov_b32_e32 v1, 0
94; GFX9-NEXT:    v_mov_b32_e32 v2, s2
95; GFX9-NEXT:    ds_add_rtn_u32 v1, v1, v2
96; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
97; GFX9-NEXT:  .LBB0_2:
98; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
99; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
100; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
101; GFX9-NEXT:    s_mov_b32 s3, 0xf000
102; GFX9-NEXT:    s_mov_b32 s2, -1
103; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
104; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
105; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
106; GFX9-NEXT:    s_endpgm
107;
108; GFX1064-LABEL: add_i32_constant:
109; GFX1064:       ; %bb.0: ; %entry
110; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
111; GFX1064-NEXT:    ; implicit-def: $vgpr1
112; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
113; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
114; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
115; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
116; GFX1064-NEXT:    s_cbranch_execz .LBB0_2
117; GFX1064-NEXT:  ; %bb.1:
118; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
119; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
120; GFX1064-NEXT:    s_mul_i32 s2, s2, 5
121; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
122; GFX1064-NEXT:    ds_add_rtn_u32 v1, v1, v2
123; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
124; GFX1064-NEXT:    buffer_gl0_inv
125; GFX1064-NEXT:  .LBB0_2:
126; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
127; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
128; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
129; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
130; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
131; GFX1064-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
132; GFX1064-NEXT:    s_mov_b32 s2, -1
133; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
134; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
135; GFX1064-NEXT:    s_endpgm
136;
137; GFX1032-LABEL: add_i32_constant:
138; GFX1032:       ; %bb.0: ; %entry
139; GFX1032-NEXT:    s_mov_b32 s1, exec_lo
140; GFX1032-NEXT:    ; implicit-def: $vgpr1
141; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s1, 0
142; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
143; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
144; GFX1032-NEXT:    s_cbranch_execz .LBB0_2
145; GFX1032-NEXT:  ; %bb.1:
146; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s1
147; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
148; GFX1032-NEXT:    s_mul_i32 s1, s1, 5
149; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
150; GFX1032-NEXT:    ds_add_rtn_u32 v1, v1, v2
151; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
152; GFX1032-NEXT:    buffer_gl0_inv
153; GFX1032-NEXT:  .LBB0_2:
154; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
155; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
156; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
157; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
158; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
159; GFX1032-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
160; GFX1032-NEXT:    s_mov_b32 s2, -1
161; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
162; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
163; GFX1032-NEXT:    s_endpgm
164;
165; GFX1164-LABEL: add_i32_constant:
166; GFX1164:       ; %bb.0: ; %entry
167; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
168; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
169; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
170; GFX1164-NEXT:    ; implicit-def: $vgpr1
171; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
172; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
173; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
174; GFX1164-NEXT:    s_cbranch_execz .LBB0_2
175; GFX1164-NEXT:  ; %bb.1:
176; GFX1164-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
177; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
178; GFX1164-NEXT:    s_mul_i32 s2, s2, 5
179; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
180; GFX1164-NEXT:    v_mov_b32_e32 v2, s2
181; GFX1164-NEXT:    ds_add_rtn_u32 v1, v1, v2
182; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
183; GFX1164-NEXT:    buffer_gl0_inv
184; GFX1164-NEXT:  .LBB0_2:
185; GFX1164-NEXT:    s_or_b64 exec, exec, s[0:1]
186; GFX1164-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
187; GFX1164-NEXT:    v_readfirstlane_b32 s2, v1
188; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
189; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
190; GFX1164-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
191; GFX1164-NEXT:    s_mov_b32 s2, -1
192; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
193; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
194; GFX1164-NEXT:    s_endpgm
195;
196; GFX1132-LABEL: add_i32_constant:
197; GFX1132:       ; %bb.0: ; %entry
198; GFX1132-NEXT:    s_mov_b32 s1, exec_lo
199; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
200; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s1, 0
201; GFX1132-NEXT:    ; implicit-def: $vgpr1
202; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
203; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
204; GFX1132-NEXT:    s_cbranch_execz .LBB0_2
205; GFX1132-NEXT:  ; %bb.1:
206; GFX1132-NEXT:    s_bcnt1_i32_b32 s1, s1
207; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
208; GFX1132-NEXT:    s_mul_i32 s1, s1, 5
209; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s1
210; GFX1132-NEXT:    ds_add_rtn_u32 v1, v1, v2
211; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
212; GFX1132-NEXT:    buffer_gl0_inv
213; GFX1132-NEXT:  .LBB0_2:
214; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s0
215; GFX1132-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
216; GFX1132-NEXT:    v_readfirstlane_b32 s2, v1
217; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
218; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
219; GFX1132-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
220; GFX1132-NEXT:    s_mov_b32 s2, -1
221; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
222; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
223; GFX1132-NEXT:    s_endpgm
224entry:
  ; %old receives the value yielded by the atomicrmw; it is then written to %out.
225  %old = atomicrmw add ptr addrspace(3) @local_var32, i32 5 acq_rel
226  store i32 %old, ptr addrspace(1) %out
227  ret void
228}
229
; Kernel under test: performs an acq_rel `atomicrmw add` of the kernel
; argument %additive on @local_var32 and stores the value returned by the
; atomic to %out.
;
; For every target, the autogenerated assertions below expect:
;   * a scalar dword load from the kernel-argument pointer in s[4:5]
;     (presumably %additive - confirm against the kernarg layout);
;   * a single ds_add_rtn_u32, placed in a block that is entered only where
;     the v_mbcnt result compares equal to 0;
;   * the add operand formed by s_mul_i32 of that loaded scalar with s_bcnt1
;     of the saved exec mask;
;   * after exec is restored, the stored value computed from the loaded
;     scalar, the v_mbcnt result and v_readfirstlane of the DS result, using
;     v_mul_lo_u32 followed by an add on the GFX7LESS/GFX8/GFX9 prefixes and a
;     single v_mad_u64_u32 on the GFX10 and GFX11 prefixes.
; Do not edit the assertion lines by hand; regenerate them with
; utils/update_llc_test_checks.py.
230define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) {
231; GFX7LESS-LABEL: add_i32_uniform:
232; GFX7LESS:       ; %bb.0: ; %entry
233; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
234; GFX7LESS-NEXT:    s_load_dword s6, s[4:5], 0xb
235; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
236; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
237; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
238; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
239; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
240; GFX7LESS-NEXT:    s_cbranch_execz .LBB1_2
241; GFX7LESS-NEXT:  ; %bb.1:
242; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
243; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
244; GFX7LESS-NEXT:    s_mul_i32 s2, s6, s2
245; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
246; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
247; GFX7LESS-NEXT:    s_mov_b32 m0, -1
248; GFX7LESS-NEXT:    ds_add_rtn_u32 v1, v1, v2
249; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
250; GFX7LESS-NEXT:  .LBB1_2:
251; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
252; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
253; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
254; GFX7LESS-NEXT:    s_mov_b32 s2, -1
255; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
256; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
257; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s6, v0
258; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
259; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
260; GFX7LESS-NEXT:    s_endpgm
261;
262; GFX8-LABEL: add_i32_uniform:
263; GFX8:       ; %bb.0: ; %entry
264; GFX8-NEXT:    s_load_dword s6, s[4:5], 0x2c
265; GFX8-NEXT:    s_mov_b64 s[2:3], exec
266; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
267; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
268; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
269; GFX8-NEXT:    ; implicit-def: $vgpr1
270; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
271; GFX8-NEXT:    s_cbranch_execz .LBB1_2
272; GFX8-NEXT:  ; %bb.1:
273; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
274; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
275; GFX8-NEXT:    s_mul_i32 s2, s6, s2
276; GFX8-NEXT:    v_mov_b32_e32 v1, 0
277; GFX8-NEXT:    v_mov_b32_e32 v2, s2
278; GFX8-NEXT:    s_mov_b32 m0, -1
279; GFX8-NEXT:    ds_add_rtn_u32 v1, v1, v2
280; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
281; GFX8-NEXT:  .LBB1_2:
282; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
283; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
284; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
285; GFX8-NEXT:    v_mul_lo_u32 v0, s6, v0
286; GFX8-NEXT:    v_readfirstlane_b32 s4, v1
287; GFX8-NEXT:    s_mov_b32 s3, 0xf000
288; GFX8-NEXT:    s_mov_b32 s2, -1
289; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
290; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
291; GFX8-NEXT:    s_endpgm
292;
293; GFX9-LABEL: add_i32_uniform:
294; GFX9:       ; %bb.0: ; %entry
295; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
296; GFX9-NEXT:    s_mov_b64 s[2:3], exec
297; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
298; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
299; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
300; GFX9-NEXT:    ; implicit-def: $vgpr1
301; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
302; GFX9-NEXT:    s_cbranch_execz .LBB1_2
303; GFX9-NEXT:  ; %bb.1:
304; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
305; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
306; GFX9-NEXT:    s_mul_i32 s2, s6, s2
307; GFX9-NEXT:    v_mov_b32_e32 v1, 0
308; GFX9-NEXT:    v_mov_b32_e32 v2, s2
309; GFX9-NEXT:    ds_add_rtn_u32 v1, v1, v2
310; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
311; GFX9-NEXT:  .LBB1_2:
312; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
313; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
314; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
315; GFX9-NEXT:    v_mul_lo_u32 v0, s6, v0
316; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
317; GFX9-NEXT:    s_mov_b32 s3, 0xf000
318; GFX9-NEXT:    s_mov_b32 s2, -1
319; GFX9-NEXT:    v_add_u32_e32 v0, s4, v0
320; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
321; GFX9-NEXT:    s_endpgm
322;
323; GFX1064-LABEL: add_i32_uniform:
324; GFX1064:       ; %bb.0: ; %entry
325; GFX1064-NEXT:    s_load_dword s6, s[4:5], 0x2c
326; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
327; GFX1064-NEXT:    ; implicit-def: $vgpr1
328; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
329; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
330; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
331; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
332; GFX1064-NEXT:    s_cbranch_execz .LBB1_2
333; GFX1064-NEXT:  ; %bb.1:
334; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
335; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
336; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
337; GFX1064-NEXT:    s_mul_i32 s2, s6, s2
338; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
339; GFX1064-NEXT:    ds_add_rtn_u32 v1, v1, v2
340; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
341; GFX1064-NEXT:    buffer_gl0_inv
342; GFX1064-NEXT:  .LBB1_2:
343; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
344; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
345; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
346; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
347; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
348; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3]
349; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
350; GFX1064-NEXT:    s_mov_b32 s2, -1
351; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
352; GFX1064-NEXT:    s_endpgm
353;
354; GFX1032-LABEL: add_i32_uniform:
355; GFX1032:       ; %bb.0: ; %entry
356; GFX1032-NEXT:    s_load_dword s0, s[4:5], 0x2c
357; GFX1032-NEXT:    s_mov_b32 s2, exec_lo
358; GFX1032-NEXT:    ; implicit-def: $vgpr1
359; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
360; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
361; GFX1032-NEXT:    s_and_saveexec_b32 s1, vcc_lo
362; GFX1032-NEXT:    s_cbranch_execz .LBB1_2
363; GFX1032-NEXT:  ; %bb.1:
364; GFX1032-NEXT:    s_bcnt1_i32_b32 s2, s2
365; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
366; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
367; GFX1032-NEXT:    s_mul_i32 s2, s0, s2
368; GFX1032-NEXT:    v_mov_b32_e32 v2, s2
369; GFX1032-NEXT:    ds_add_rtn_u32 v1, v1, v2
370; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
371; GFX1032-NEXT:    buffer_gl0_inv
372; GFX1032-NEXT:  .LBB1_2:
373; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
374; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s1
375; GFX1032-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x24
376; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
377; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
378; GFX1032-NEXT:    s_mov_b32 s10, -1
379; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
380; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s0, s0, v0, s[2:3]
381; GFX1032-NEXT:    buffer_store_dword v0, off, s[8:11], 0
382; GFX1032-NEXT:    s_endpgm
383;
384; GFX1164-LABEL: add_i32_uniform:
385; GFX1164:       ; %bb.0: ; %entry
386; GFX1164-NEXT:    s_load_b32 s6, s[4:5], 0x2c
387; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
388; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
389; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
390; GFX1164-NEXT:    ; implicit-def: $vgpr1
391; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
392; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
393; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
394; GFX1164-NEXT:    s_cbranch_execz .LBB1_2
395; GFX1164-NEXT:  ; %bb.1:
396; GFX1164-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
397; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
398; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
399; GFX1164-NEXT:    s_mul_i32 s2, s6, s2
400; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
401; GFX1164-NEXT:    v_mov_b32_e32 v2, s2
402; GFX1164-NEXT:    ds_add_rtn_u32 v1, v1, v2
403; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
404; GFX1164-NEXT:    buffer_gl0_inv
405; GFX1164-NEXT:  .LBB1_2:
406; GFX1164-NEXT:    s_or_b64 exec, exec, s[0:1]
407; GFX1164-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
408; GFX1164-NEXT:    v_readfirstlane_b32 s2, v1
409; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
410; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
411; GFX1164-NEXT:    v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3]
412; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
413; GFX1164-NEXT:    s_mov_b32 s2, -1
414; GFX1164-NEXT:    buffer_store_b32 v1, off, s[0:3], 0
415; GFX1164-NEXT:    s_endpgm
416;
417; GFX1132-LABEL: add_i32_uniform:
418; GFX1132:       ; %bb.0: ; %entry
419; GFX1132-NEXT:    s_load_b32 s0, s[4:5], 0x2c
420; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
421; GFX1132-NEXT:    s_mov_b32 s1, exec_lo
422; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
423; GFX1132-NEXT:    ; implicit-def: $vgpr1
424; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
425; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
426; GFX1132-NEXT:    s_cbranch_execz .LBB1_2
427; GFX1132-NEXT:  ; %bb.1:
428; GFX1132-NEXT:    s_bcnt1_i32_b32 s2, s2
429; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
430; GFX1132-NEXT:    s_mul_i32 s2, s0, s2
431; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
432; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2
433; GFX1132-NEXT:    ds_add_rtn_u32 v1, v1, v2
434; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
435; GFX1132-NEXT:    buffer_gl0_inv
436; GFX1132-NEXT:  .LBB1_2:
437; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s1
438; GFX1132-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
439; GFX1132-NEXT:    v_readfirstlane_b32 s2, v1
440; GFX1132-NEXT:    s_mov_b32 s7, 0x31016000
441; GFX1132-NEXT:    s_mov_b32 s6, -1
442; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
443; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
444; GFX1132-NEXT:    v_mad_u64_u32 v[1:2], null, s0, v0, s[2:3]
445; GFX1132-NEXT:    buffer_store_b32 v1, off, s[4:7], 0
446; GFX1132-NEXT:    s_endpgm
447entry:
  ; %old receives the value yielded by the atomicrmw; it is then written to %out.
448  %old = atomicrmw add ptr addrspace(3) @local_var32, i32 %additive acq_rel
449  store i32 %old, ptr addrspace(1) %out
450  ret void
451}
452
453define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
454; GFX7LESS_ITERATIVE-LABEL: add_i32_varying:
455; GFX7LESS_ITERATIVE:       ; %bb.0: ; %entry
456; GFX7LESS_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
457; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s2, 0
458; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
459; GFX7LESS_ITERATIVE-NEXT:  .LBB2_1: ; %ComputeLoop
460; GFX7LESS_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
461; GFX7LESS_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
462; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, s3
463; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
464; GFX7LESS_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
465; GFX7LESS_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
466; GFX7LESS_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
467; GFX7LESS_ITERATIVE-NEXT:    v_cmp_ne_u64_e64 s[6:7], s[0:1], 0
468; GFX7LESS_ITERATIVE-NEXT:    s_and_b64 vcc, exec, s[6:7]
469; GFX7LESS_ITERATIVE-NEXT:    s_add_i32 s2, s2, s8
470; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_vccnz .LBB2_1
471; GFX7LESS_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
472; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
473; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
474; GFX7LESS_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
475; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
476; GFX7LESS_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
477; GFX7LESS_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
478; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_execz .LBB2_4
479; GFX7LESS_ITERATIVE-NEXT:  ; %bb.3:
480; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
481; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
482; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, -1
483; GFX7LESS_ITERATIVE-NEXT:    ds_add_rtn_u32 v0, v0, v2
484; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
485; GFX7LESS_ITERATIVE-NEXT:  .LBB2_4:
486; GFX7LESS_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
487; GFX7LESS_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
488; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
489; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s2, -1
490; GFX7LESS_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v0
491; GFX7LESS_ITERATIVE-NEXT:    v_add_i32_e32 v0, vcc, s4, v1
492; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
493; GFX7LESS_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
494; GFX7LESS_ITERATIVE-NEXT:    s_endpgm
495;
496; GFX8_ITERATIVE-LABEL: add_i32_varying:
497; GFX8_ITERATIVE:       ; %bb.0: ; %entry
498; GFX8_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
499; GFX8_ITERATIVE-NEXT:    s_mov_b32 s2, 0
500; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
501; GFX8_ITERATIVE-NEXT:  .LBB2_1: ; %ComputeLoop
502; GFX8_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
503; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
504; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s3
505; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
506; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
507; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
508; GFX8_ITERATIVE-NEXT:    s_add_i32 s2, s2, s8
509; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
510; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
511; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB2_1
512; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
513; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
514; GFX8_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
515; GFX8_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
516; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
517; GFX8_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
518; GFX8_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
519; GFX8_ITERATIVE-NEXT:    s_cbranch_execz .LBB2_4
520; GFX8_ITERATIVE-NEXT:  ; %bb.3:
521; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
522; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
523; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, -1
524; GFX8_ITERATIVE-NEXT:    ds_add_rtn_u32 v0, v0, v2
525; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
526; GFX8_ITERATIVE-NEXT:  .LBB2_4:
527; GFX8_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
528; GFX8_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
529; GFX8_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v0
530; GFX8_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
531; GFX8_ITERATIVE-NEXT:    s_mov_b32 s2, -1
532; GFX8_ITERATIVE-NEXT:    v_add_u32_e32 v0, vcc, s4, v1
533; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
534; GFX8_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
535; GFX8_ITERATIVE-NEXT:    s_endpgm
536;
537; GFX9_ITERATIVE-LABEL: add_i32_varying:
538; GFX9_ITERATIVE:       ; %bb.0: ; %entry
539; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
540; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, 0
541; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
542; GFX9_ITERATIVE-NEXT:  .LBB2_1: ; %ComputeLoop
543; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
544; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
545; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s3
546; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
547; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
548; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
549; GFX9_ITERATIVE-NEXT:    s_add_i32 s2, s2, s8
550; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
551; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
552; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB2_1
553; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
554; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
555; GFX9_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
556; GFX9_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
557; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
558; GFX9_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
559; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
560; GFX9_ITERATIVE-NEXT:    s_cbranch_execz .LBB2_4
561; GFX9_ITERATIVE-NEXT:  ; %bb.3:
562; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
563; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
564; GFX9_ITERATIVE-NEXT:    ds_add_rtn_u32 v0, v0, v2
565; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
566; GFX9_ITERATIVE-NEXT:  .LBB2_4:
567; GFX9_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
568; GFX9_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
569; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v0
570; GFX9_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
571; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, -1
572; GFX9_ITERATIVE-NEXT:    v_add_u32_e32 v0, s4, v1
573; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
574; GFX9_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
575; GFX9_ITERATIVE-NEXT:    s_endpgm
576;
577; GFX1064_ITERATIVE-LABEL: add_i32_varying:
578; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
579; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
580; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, 0
581; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
582; GFX1064_ITERATIVE-NEXT:  .LBB2_1: ; %ComputeLoop
583; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
584; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
585; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
586; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
587; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, s3
588; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
589; GFX1064_ITERATIVE-NEXT:    s_add_i32 s2, s2, s8
590; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
591; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB2_1
592; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
593; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
594; GFX1064_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
595; GFX1064_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
596; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
597; GFX1064_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
598; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
599; GFX1064_ITERATIVE-NEXT:    s_cbranch_execz .LBB2_4
600; GFX1064_ITERATIVE-NEXT:  ; %bb.3:
601; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
602; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
603; GFX1064_ITERATIVE-NEXT:    ds_add_rtn_u32 v0, v0, v2
604; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
605; GFX1064_ITERATIVE-NEXT:    buffer_gl0_inv
606; GFX1064_ITERATIVE-NEXT:  .LBB2_4:
607; GFX1064_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
608; GFX1064_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
609; GFX1064_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
610; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v0
611; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
612; GFX1064_ITERATIVE-NEXT:    v_add_nc_u32_e32 v0, s2, v1
613; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, -1
614; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
615; GFX1064_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
616; GFX1064_ITERATIVE-NEXT:    s_endpgm
617;
618; GFX1032_ITERATIVE-LABEL: add_i32_varying:
619; GFX1032_ITERATIVE:       ; %bb.0: ; %entry
620; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s1, exec_lo
621; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s0, 0
622; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
623; GFX1032_ITERATIVE-NEXT:  .LBB2_1: ; %ComputeLoop
624; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
625; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s2, s1
626; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
627; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
628; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s2
629; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s6
630; GFX1032_ITERATIVE-NEXT:    s_add_i32 s0, s0, s3
631; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
632; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB2_1
633; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
634; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
635; GFX1032_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
636; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
637; GFX1032_ITERATIVE-NEXT:    s_and_saveexec_b32 s1, vcc_lo
638; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s1, exec_lo, s1
639; GFX1032_ITERATIVE-NEXT:    s_cbranch_execz .LBB2_4
640; GFX1032_ITERATIVE-NEXT:  ; %bb.3:
641; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
642; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s0
643; GFX1032_ITERATIVE-NEXT:    ds_add_rtn_u32 v0, v0, v2
644; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
645; GFX1032_ITERATIVE-NEXT:    buffer_gl0_inv
646; GFX1032_ITERATIVE-NEXT:  .LBB2_4:
647; GFX1032_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
648; GFX1032_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s1
649; GFX1032_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
650; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v0
651; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
652; GFX1032_ITERATIVE-NEXT:    v_add_nc_u32_e32 v0, s2, v1
653; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, -1
654; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
655; GFX1032_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
656; GFX1032_ITERATIVE-NEXT:    s_endpgm
657;
658; GFX1164_ITERATIVE-LABEL: add_i32_varying:
659; GFX1164_ITERATIVE:       ; %bb.0: ; %entry
660; GFX1164_ITERATIVE-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
661; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
662; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, 0
663; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
664; GFX1164_ITERATIVE-NEXT:  .LBB2_1: ; %ComputeLoop
665; GFX1164_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
666; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s3, s[0:1]
667; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
668; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s8, v1, s3
669; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
670; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s2, s3
671; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
672; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2)
673; GFX1164_ITERATIVE-NEXT:    s_add_i32 s2, s2, s8
674; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
675; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB2_1
676; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
677; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
678; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
679; GFX1164_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
680; GFX1164_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
681; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
682; GFX1164_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
683; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
684; GFX1164_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
685; GFX1164_ITERATIVE-NEXT:    s_cbranch_execz .LBB2_4
686; GFX1164_ITERATIVE-NEXT:  ; %bb.3:
687; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v1, 0
688; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
689; GFX1164_ITERATIVE-NEXT:    ds_add_rtn_u32 v1, v1, v2
690; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
691; GFX1164_ITERATIVE-NEXT:    buffer_gl0_inv
692; GFX1164_ITERATIVE-NEXT:  .LBB2_4:
693; GFX1164_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
694; GFX1164_ITERATIVE-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
695; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v1
696; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
697; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
698; GFX1164_ITERATIVE-NEXT:    v_add_nc_u32_e32 v0, s2, v0
699; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
700; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
701; GFX1164_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
702; GFX1164_ITERATIVE-NEXT:    s_endpgm
703;
704; GFX1132_ITERATIVE-LABEL: add_i32_varying:
705; GFX1132_ITERATIVE:       ; %bb.0: ; %entry
706; GFX1132_ITERATIVE-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
707; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s1, exec_lo
708; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s0, 0
709; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
710; GFX1132_ITERATIVE-NEXT:  .LBB2_1: ; %ComputeLoop
711; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
712; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s2, s1
713; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
714; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s3, v1, s2
715; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
716; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s2
717; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s6
718; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2)
719; GFX1132_ITERATIVE-NEXT:    s_add_i32 s0, s0, s3
720; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
721; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB2_1
722; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
723; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
724; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
725; GFX1132_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
726; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
727; GFX1132_ITERATIVE-NEXT:    s_and_saveexec_b32 s1, vcc_lo
728; GFX1132_ITERATIVE-NEXT:    s_xor_b32 s1, exec_lo, s1
729; GFX1132_ITERATIVE-NEXT:    s_cbranch_execz .LBB2_4
730; GFX1132_ITERATIVE-NEXT:  ; %bb.3:
731; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
732; GFX1132_ITERATIVE-NEXT:    ds_add_rtn_u32 v1, v1, v2
733; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
734; GFX1132_ITERATIVE-NEXT:    buffer_gl0_inv
735; GFX1132_ITERATIVE-NEXT:  .LBB2_4:
736; GFX1132_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s1
737; GFX1132_ITERATIVE-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
738; GFX1132_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v1
739; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
740; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
741; GFX1132_ITERATIVE-NEXT:    v_add_nc_u32_e32 v0, s2, v0
742; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
743; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
744; GFX1132_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
745; GFX1132_ITERATIVE-NEXT:    s_endpgm
746;
747; GFX7LESS_DPP-LABEL: add_i32_varying:
748; GFX7LESS_DPP:       ; %bb.0: ; %entry
749; GFX7LESS_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
750; GFX7LESS_DPP-NEXT:    v_mov_b32_e32 v1, 0
751; GFX7LESS_DPP-NEXT:    s_mov_b32 m0, -1
752; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
753; GFX7LESS_DPP-NEXT:    ds_add_rtn_u32 v0, v1, v0
754; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
755; GFX7LESS_DPP-NEXT:    s_mov_b32 s3, 0xf000
756; GFX7LESS_DPP-NEXT:    s_mov_b32 s2, -1
757; GFX7LESS_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
758; GFX7LESS_DPP-NEXT:    s_endpgm
759;
760; GFX8_DPP-LABEL: add_i32_varying:
761; GFX8_DPP:       ; %bb.0: ; %entry
762; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, 0
763; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
764; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
765; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
766; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
767; GFX8_DPP-NEXT:    v_mov_b32_e32 v2, 0
768; GFX8_DPP-NEXT:    s_nop 0
769; GFX8_DPP-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
770; GFX8_DPP-NEXT:    s_nop 1
771; GFX8_DPP-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
772; GFX8_DPP-NEXT:    s_nop 1
773; GFX8_DPP-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
774; GFX8_DPP-NEXT:    s_nop 1
775; GFX8_DPP-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
776; GFX8_DPP-NEXT:    s_nop 1
777; GFX8_DPP-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
778; GFX8_DPP-NEXT:    s_nop 1
779; GFX8_DPP-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
780; GFX8_DPP-NEXT:    v_readlane_b32 s2, v1, 63
781; GFX8_DPP-NEXT:    s_nop 0
782; GFX8_DPP-NEXT:    v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
783; GFX8_DPP-NEXT:    s_mov_b64 exec, s[0:1]
784; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
785; GFX8_DPP-NEXT:    ; implicit-def: $vgpr0
786; GFX8_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
787; GFX8_DPP-NEXT:    s_cbranch_execz .LBB2_2
788; GFX8_DPP-NEXT:  ; %bb.1:
789; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, s2
790; GFX8_DPP-NEXT:    s_mov_b32 m0, -1
791; GFX8_DPP-NEXT:    ds_add_rtn_u32 v0, v3, v0
792; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
793; GFX8_DPP-NEXT:  .LBB2_2:
794; GFX8_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
795; GFX8_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
796; GFX8_DPP-NEXT:    v_readfirstlane_b32 s4, v0
797; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, v2
798; GFX8_DPP-NEXT:    s_mov_b32 s3, 0xf000
799; GFX8_DPP-NEXT:    s_mov_b32 s2, -1
800; GFX8_DPP-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
801; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
802; GFX8_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
803; GFX8_DPP-NEXT:    s_endpgm
804;
805; GFX9_DPP-LABEL: add_i32_varying:
806; GFX9_DPP:       ; %bb.0: ; %entry
807; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
808; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
809; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
810; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
811; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
812; GFX9_DPP-NEXT:    v_mov_b32_e32 v2, 0
813; GFX9_DPP-NEXT:    s_nop 0
814; GFX9_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
815; GFX9_DPP-NEXT:    s_nop 1
816; GFX9_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
817; GFX9_DPP-NEXT:    s_nop 1
818; GFX9_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
819; GFX9_DPP-NEXT:    s_nop 1
820; GFX9_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
821; GFX9_DPP-NEXT:    s_nop 1
822; GFX9_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
823; GFX9_DPP-NEXT:    s_nop 1
824; GFX9_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
825; GFX9_DPP-NEXT:    v_readlane_b32 s2, v1, 63
826; GFX9_DPP-NEXT:    s_nop 0
827; GFX9_DPP-NEXT:    v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
828; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
829; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
830; GFX9_DPP-NEXT:    ; implicit-def: $vgpr0
831; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
832; GFX9_DPP-NEXT:    s_cbranch_execz .LBB2_2
833; GFX9_DPP-NEXT:  ; %bb.1:
834; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s2
835; GFX9_DPP-NEXT:    ds_add_rtn_u32 v0, v3, v0
836; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
837; GFX9_DPP-NEXT:  .LBB2_2:
838; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
839; GFX9_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
840; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v0
841; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, v2
842; GFX9_DPP-NEXT:    s_mov_b32 s3, 0xf000
843; GFX9_DPP-NEXT:    s_mov_b32 s2, -1
844; GFX9_DPP-NEXT:    v_add_u32_e32 v0, s4, v0
845; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
846; GFX9_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
847; GFX9_DPP-NEXT:    s_endpgm
848;
849; GFX1064_DPP-LABEL: add_i32_varying:
850; GFX1064_DPP:       ; %bb.0: ; %entry
851; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
852; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
853; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, 0
854; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
855; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
856; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
857; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
858; GFX1064_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
859; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
860; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 31
861; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, s2
862; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
863; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
864; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 15
865; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v1, 31
866; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s2, 16
867; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
868; GFX1064_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
869; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
870; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 47
871; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v1, 63
872; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s3, 32
873; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
874; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
875; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, 0
876; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
877; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s2, 48
878; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
879; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
880; GFX1064_DPP-NEXT:    s_mov_b32 s2, -1
881; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr0
882; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
883; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB2_2
884; GFX1064_DPP-NEXT:  ; %bb.1:
885; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, s6
886; GFX1064_DPP-NEXT:    s_mov_b32 s3, s6
887; GFX1064_DPP-NEXT:    ds_add_rtn_u32 v0, v4, v0
888; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
889; GFX1064_DPP-NEXT:    buffer_gl0_inv
890; GFX1064_DPP-NEXT:  .LBB2_2:
891; GFX1064_DPP-NEXT:    s_waitcnt_depctr 0xffe3
892; GFX1064_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
893; GFX1064_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
894; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s3, v0
895; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, v3
896; GFX1064_DPP-NEXT:    v_add_nc_u32_e32 v0, s3, v0
897; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
898; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
899; GFX1064_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
900; GFX1064_DPP-NEXT:    s_endpgm
901;
902; GFX1032_DPP-LABEL: add_i32_varying:
903; GFX1032_DPP:       ; %bb.0: ; %entry
904; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
905; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
906; GFX1032_DPP-NEXT:    v_mov_b32_e32 v3, 0
907; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
908; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
909; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
910; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
911; GFX1032_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
912; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
913; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v1, 15
914; GFX1032_DPP-NEXT:    v_readlane_b32 s2, v1, 31
915; GFX1032_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
916; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
917; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
918; GFX1032_DPP-NEXT:    v_mov_b32_e32 v4, 0
919; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
920; GFX1032_DPP-NEXT:    v_writelane_b32 v3, s1, 16
921; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
922; GFX1032_DPP-NEXT:    s_mov_b32 s0, s2
923; GFX1032_DPP-NEXT:    s_mov_b32 s2, -1
924; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
925; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr0
926; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
927; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB2_2
928; GFX1032_DPP-NEXT:  ; %bb.1:
929; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, s0
930; GFX1032_DPP-NEXT:    ds_add_rtn_u32 v0, v4, v0
931; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
932; GFX1032_DPP-NEXT:    buffer_gl0_inv
933; GFX1032_DPP-NEXT:  .LBB2_2:
934; GFX1032_DPP-NEXT:    s_waitcnt_depctr 0xffe3
935; GFX1032_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s1
936; GFX1032_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
937; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s3, v0
938; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, v3
939; GFX1032_DPP-NEXT:    v_add_nc_u32_e32 v0, s3, v0
940; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
941; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
942; GFX1032_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
943; GFX1032_DPP-NEXT:    s_endpgm
944;
945; GFX1164_DPP-LABEL: add_i32_varying:
946; GFX1164_DPP:       ; %bb.0: ; %entry
947; GFX1164_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
948; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
949; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
950; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
951; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, 0
952; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
953; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
954; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
955; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
956; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
957; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
958; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
959; GFX1164_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
960; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
961; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
962; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v1, 31
963; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, s2
964; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
965; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
966; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
967; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v1, 15
968; GFX1164_DPP-NEXT:    v_readlane_b32 s3, v1, 31
969; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
970; GFX1164_DPP-NEXT:    v_writelane_b32 v3, s2, 16
971; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
972; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
973; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
974; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v1, 47
975; GFX1164_DPP-NEXT:    v_readlane_b32 s6, v1, 63
976; GFX1164_DPP-NEXT:    v_writelane_b32 v3, s3, 32
977; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
978; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
979; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
980; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, 0
981; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
982; GFX1164_DPP-NEXT:    v_writelane_b32 v3, s2, 48
983; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
984; GFX1164_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
985; GFX1164_DPP-NEXT:    s_mov_b32 s2, -1
986; GFX1164_DPP-NEXT:    ; implicit-def: $vgpr0
987; GFX1164_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
988; GFX1164_DPP-NEXT:    s_cbranch_execz .LBB2_2
989; GFX1164_DPP-NEXT:  ; %bb.1:
990; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, s6
991; GFX1164_DPP-NEXT:    s_mov_b32 s3, s6
992; GFX1164_DPP-NEXT:    ds_add_rtn_u32 v0, v4, v0
993; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
994; GFX1164_DPP-NEXT:    buffer_gl0_inv
995; GFX1164_DPP-NEXT:  .LBB2_2:
996; GFX1164_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
997; GFX1164_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
998; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s3, v0
999; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, v3
1000; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1001; GFX1164_DPP-NEXT:    v_add_nc_u32_e32 v0, s3, v0
1002; GFX1164_DPP-NEXT:    s_mov_b32 s3, 0x31016000
1003; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
1004; GFX1164_DPP-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
1005; GFX1164_DPP-NEXT:    s_endpgm
1006;
1007; GFX1132_DPP-LABEL: add_i32_varying:
1008; GFX1132_DPP:       ; %bb.0: ; %entry
1009; GFX1132_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1010; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
1011; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
1012; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
1013; GFX1132_DPP-NEXT:    v_mov_b32_e32 v3, 0
1014; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
1015; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1016; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1017; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1018; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1019; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1020; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1021; GFX1132_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
1022; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1023; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
1024; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v1, 15
1025; GFX1132_DPP-NEXT:    v_readlane_b32 s2, v1, 31
1026; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1027; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
1028; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1029; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, 0
1030; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
1031; GFX1132_DPP-NEXT:    v_writelane_b32 v3, s1, 16
1032; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
1033; GFX1132_DPP-NEXT:    s_mov_b32 s0, s2
1034; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
1035; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1036; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr0
1037; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
1038; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB2_2
1039; GFX1132_DPP-NEXT:  ; %bb.1:
1040; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, s0
1041; GFX1132_DPP-NEXT:    ds_add_rtn_u32 v0, v4, v0
1042; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
1043; GFX1132_DPP-NEXT:    buffer_gl0_inv
1044; GFX1132_DPP-NEXT:  .LBB2_2:
1045; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s1
1046; GFX1132_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1047; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s3, v0
1048; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, v3
1049; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1050; GFX1132_DPP-NEXT:    v_add_nc_u32_e32 v0, s3, v0
1051; GFX1132_DPP-NEXT:    s_mov_b32 s3, 0x31016000
1052; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
1053; GFX1132_DPP-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
1054; GFX1132_DPP-NEXT:    s_endpgm
1055entry:
1056  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1057  %old = atomicrmw add ptr addrspace(3) @local_var32, i32 %lane acq_rel
1058  store i32 %old, ptr addrspace(1) %out
1059  ret void
1060}
1061
1062define amdgpu_kernel void @add_i32_varying_nouse() {
1063; GFX7LESS_ITERATIVE-LABEL: add_i32_varying_nouse:
1064; GFX7LESS_ITERATIVE:       ; %bb.0: ; %entry
1065; GFX7LESS_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
1066; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s2, 0
1067; GFX7LESS_ITERATIVE-NEXT:  .LBB3_1: ; %ComputeLoop
1068; GFX7LESS_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
1069; GFX7LESS_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
1070; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
1071; GFX7LESS_ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s3
1072; GFX7LESS_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
1073; GFX7LESS_ITERATIVE-NEXT:    v_cmp_ne_u64_e64 s[4:5], s[0:1], 0
1074; GFX7LESS_ITERATIVE-NEXT:    s_and_b64 vcc, exec, s[4:5]
1075; GFX7LESS_ITERATIVE-NEXT:    s_add_i32 s2, s2, s6
1076; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_vccnz .LBB3_1
1077; GFX7LESS_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
1078; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
1079; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
1080; GFX7LESS_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1081; GFX7LESS_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1082; GFX7LESS_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
1083; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_execz .LBB3_4
1084; GFX7LESS_ITERATIVE-NEXT:  ; %bb.3:
1085; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
1086; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v1, s2
1087; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, -1
1088; GFX7LESS_ITERATIVE-NEXT:    ds_add_u32 v0, v1
1089; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
1090; GFX7LESS_ITERATIVE-NEXT:  .LBB3_4:
1091; GFX7LESS_ITERATIVE-NEXT:    s_endpgm
1092;
1093; GFX8_ITERATIVE-LABEL: add_i32_varying_nouse:
1094; GFX8_ITERATIVE:       ; %bb.0: ; %entry
1095; GFX8_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
1096; GFX8_ITERATIVE-NEXT:    s_mov_b32 s2, 0
1097; GFX8_ITERATIVE-NEXT:  .LBB3_1: ; %ComputeLoop
1098; GFX8_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
1099; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
1100; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
1101; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s3
1102; GFX8_ITERATIVE-NEXT:    s_add_i32 s2, s2, s6
1103; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
1104; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
1105; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB3_1
1106; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
1107; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1108; GFX8_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1109; GFX8_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1110; GFX8_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1111; GFX8_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
1112; GFX8_ITERATIVE-NEXT:    s_cbranch_execz .LBB3_4
1113; GFX8_ITERATIVE-NEXT:  ; %bb.3:
1114; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
1115; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v1, s2
1116; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, -1
1117; GFX8_ITERATIVE-NEXT:    ds_add_u32 v0, v1
1118; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
1119; GFX8_ITERATIVE-NEXT:  .LBB3_4:
1120; GFX8_ITERATIVE-NEXT:    s_endpgm
1121;
1122; GFX9_ITERATIVE-LABEL: add_i32_varying_nouse:
1123; GFX9_ITERATIVE:       ; %bb.0: ; %entry
1124; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
1125; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, 0
1126; GFX9_ITERATIVE-NEXT:  .LBB3_1: ; %ComputeLoop
1127; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
1128; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
1129; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
1130; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s3
1131; GFX9_ITERATIVE-NEXT:    s_add_i32 s2, s2, s6
1132; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
1133; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
1134; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB3_1
1135; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
1136; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1137; GFX9_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1138; GFX9_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1139; GFX9_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1140; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
1141; GFX9_ITERATIVE-NEXT:    s_cbranch_execz .LBB3_4
1142; GFX9_ITERATIVE-NEXT:  ; %bb.3:
1143; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
1144; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v1, s2
1145; GFX9_ITERATIVE-NEXT:    ds_add_u32 v0, v1
1146; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
1147; GFX9_ITERATIVE-NEXT:  .LBB3_4:
1148; GFX9_ITERATIVE-NEXT:    s_endpgm
1149;
1150; GFX1064_ITERATIVE-LABEL: add_i32_varying_nouse:
1151; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
1152; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
1153; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, 0
1154; GFX1064_ITERATIVE-NEXT:  .LBB3_1: ; %ComputeLoop
1155; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
1156; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
1157; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
1158; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s3
1159; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
1160; GFX1064_ITERATIVE-NEXT:    s_add_i32 s2, s2, s6
1161; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
1162; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB3_1
1163; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
1164; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1165; GFX1064_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1166; GFX1064_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1167; GFX1064_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1168; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
1169; GFX1064_ITERATIVE-NEXT:    s_cbranch_execz .LBB3_4
1170; GFX1064_ITERATIVE-NEXT:  ; %bb.3:
1171; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
1172; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v1, s2
1173; GFX1064_ITERATIVE-NEXT:    ds_add_u32 v0, v1
1174; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
1175; GFX1064_ITERATIVE-NEXT:    buffer_gl0_inv
1176; GFX1064_ITERATIVE-NEXT:  .LBB3_4:
1177; GFX1064_ITERATIVE-NEXT:    s_endpgm
1178;
1179; GFX1032_ITERATIVE-LABEL: add_i32_varying_nouse:
1180; GFX1032_ITERATIVE:       ; %bb.0: ; %entry
1181; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s1, exec_lo
1182; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s0, 0
1183; GFX1032_ITERATIVE-NEXT:  .LBB3_1: ; %ComputeLoop
1184; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
1185; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s2, s1
1186; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
1187; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s2, 1, s2
1188; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s2
1189; GFX1032_ITERATIVE-NEXT:    s_add_i32 s0, s0, s3
1190; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
1191; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB3_1
1192; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
1193; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1194; GFX1032_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1195; GFX1032_ITERATIVE-NEXT:    s_and_saveexec_b32 s1, vcc_lo
1196; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s1, exec_lo, s1
1197; GFX1032_ITERATIVE-NEXT:    s_cbranch_execz .LBB3_4
1198; GFX1032_ITERATIVE-NEXT:  ; %bb.3:
1199; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
1200; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v1, s0
1201; GFX1032_ITERATIVE-NEXT:    ds_add_u32 v0, v1
1202; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
1203; GFX1032_ITERATIVE-NEXT:    buffer_gl0_inv
1204; GFX1032_ITERATIVE-NEXT:  .LBB3_4:
1205; GFX1032_ITERATIVE-NEXT:    s_endpgm
1206;
1207; GFX1164_ITERATIVE-LABEL: add_i32_varying_nouse:
1208; GFX1164_ITERATIVE:       ; %bb.0: ; %entry
1209; GFX1164_ITERATIVE-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1210; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
1211; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, 0
1212; GFX1164_ITERATIVE-NEXT:  .LBB3_1: ; %ComputeLoop
1213; GFX1164_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
1214; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s3, s[0:1]
1215; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
1216; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
1217; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s3
1218; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1219; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[4:5]
1220; GFX1164_ITERATIVE-NEXT:    s_add_i32 s2, s2, s6
1221; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
1222; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB3_1
1223; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
1224; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1225; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
1226; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1227; GFX1164_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1228; GFX1164_ITERATIVE-NEXT:    v_cmpx_eq_u32_e32 0, v0
1229; GFX1164_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
1230; GFX1164_ITERATIVE-NEXT:    s_cbranch_execz .LBB3_4
1231; GFX1164_ITERATIVE-NEXT:  ; %bb.3:
1232; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
1233; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v1, s2
1234; GFX1164_ITERATIVE-NEXT:    ds_add_u32 v0, v1
1235; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
1236; GFX1164_ITERATIVE-NEXT:    buffer_gl0_inv
1237; GFX1164_ITERATIVE-NEXT:  .LBB3_4:
1238; GFX1164_ITERATIVE-NEXT:    s_endpgm
1239;
1240; GFX1132_ITERATIVE-LABEL: add_i32_varying_nouse:
1241; GFX1132_ITERATIVE:       ; %bb.0: ; %entry
1242; GFX1132_ITERATIVE-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1243; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s1, exec_lo
1244; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s0, 0
1245; GFX1132_ITERATIVE-NEXT:  .LBB3_1: ; %ComputeLoop
1246; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
1247; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s2, s1
1248; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
1249; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
1250; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s2, 1, s2
1251; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1252; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s2
1253; GFX1132_ITERATIVE-NEXT:    s_add_i32 s0, s0, s3
1254; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
1255; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB3_1
1256; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
1257; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1258; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s1, exec_lo
1259; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1260; GFX1132_ITERATIVE-NEXT:    v_cmpx_eq_u32_e32 0, v0
1261; GFX1132_ITERATIVE-NEXT:    s_xor_b32 s1, exec_lo, s1
1262; GFX1132_ITERATIVE-NEXT:    s_cbranch_execz .LBB3_4
1263; GFX1132_ITERATIVE-NEXT:  ; %bb.3:
1264; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
1265; GFX1132_ITERATIVE-NEXT:    ds_add_u32 v0, v1
1266; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
1267; GFX1132_ITERATIVE-NEXT:    buffer_gl0_inv
1268; GFX1132_ITERATIVE-NEXT:  .LBB3_4:
1269; GFX1132_ITERATIVE-NEXT:    s_endpgm
1270;
1271; GFX7LESS_DPP-LABEL: add_i32_varying_nouse:
1272; GFX7LESS_DPP:       ; %bb.0: ; %entry
1273; GFX7LESS_DPP-NEXT:    v_mov_b32_e32 v1, 0
1274; GFX7LESS_DPP-NEXT:    s_mov_b32 m0, -1
1275; GFX7LESS_DPP-NEXT:    ds_add_u32 v1, v0
1276; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
1277; GFX7LESS_DPP-NEXT:    s_endpgm
1278;
1279; GFX8_DPP-LABEL: add_i32_varying_nouse:
1280; GFX8_DPP:       ; %bb.0: ; %entry
1281; GFX8_DPP-NEXT:    v_mov_b32_e32 v2, 0
1282; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
1283; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
1284; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
1285; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
1286; GFX8_DPP-NEXT:    s_nop 1
1287; GFX8_DPP-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1288; GFX8_DPP-NEXT:    s_nop 1
1289; GFX8_DPP-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1290; GFX8_DPP-NEXT:    s_nop 1
1291; GFX8_DPP-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1292; GFX8_DPP-NEXT:    s_nop 1
1293; GFX8_DPP-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1294; GFX8_DPP-NEXT:    s_nop 1
1295; GFX8_DPP-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
1296; GFX8_DPP-NEXT:    s_nop 1
1297; GFX8_DPP-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
1298; GFX8_DPP-NEXT:    v_readlane_b32 s2, v1, 63
1299; GFX8_DPP-NEXT:    s_mov_b64 exec, s[0:1]
1300; GFX8_DPP-NEXT:    s_mov_b32 s0, s2
1301; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
1302; GFX8_DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1303; GFX8_DPP-NEXT:    s_cbranch_execz .LBB3_2
1304; GFX8_DPP-NEXT:  ; %bb.1:
1305; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, s0
1306; GFX8_DPP-NEXT:    s_mov_b32 m0, -1
1307; GFX8_DPP-NEXT:    ds_add_u32 v2, v0
1308; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
1309; GFX8_DPP-NEXT:  .LBB3_2:
1310; GFX8_DPP-NEXT:    s_endpgm
1311;
1312; GFX9_DPP-LABEL: add_i32_varying_nouse:
1313; GFX9_DPP:       ; %bb.0: ; %entry
1314; GFX9_DPP-NEXT:    v_mov_b32_e32 v2, 0
1315; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
1316; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
1317; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
1318; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
1319; GFX9_DPP-NEXT:    s_nop 1
1320; GFX9_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1321; GFX9_DPP-NEXT:    s_nop 1
1322; GFX9_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1323; GFX9_DPP-NEXT:    s_nop 1
1324; GFX9_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1325; GFX9_DPP-NEXT:    s_nop 1
1326; GFX9_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1327; GFX9_DPP-NEXT:    s_nop 1
1328; GFX9_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
1329; GFX9_DPP-NEXT:    s_nop 1
1330; GFX9_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
1331; GFX9_DPP-NEXT:    v_readlane_b32 s2, v1, 63
1332; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
1333; GFX9_DPP-NEXT:    s_mov_b32 s0, s2
1334; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
1335; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1336; GFX9_DPP-NEXT:    s_cbranch_execz .LBB3_2
1337; GFX9_DPP-NEXT:  ; %bb.1:
1338; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s0
1339; GFX9_DPP-NEXT:    ds_add_u32 v2, v0
1340; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
1341; GFX9_DPP-NEXT:  .LBB3_2:
1342; GFX9_DPP-NEXT:    s_endpgm
1343;
1344; GFX1064_DPP-LABEL: add_i32_varying_nouse:
1345; GFX1064_DPP:       ; %bb.0: ; %entry
1346; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
1347; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
1348; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1349; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1350; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1351; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1352; GFX1064_DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
1353; GFX1064_DPP-NEXT:    v_add_nc_u32_e32 v1, v1, v2
1354; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
1355; GFX1064_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1356; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
1357; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 0
1358; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v1, 32
1359; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
1360; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v0
1361; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, 0
1362; GFX1064_DPP-NEXT:    s_add_i32 s0, s2, s3
1363; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
1364; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1365; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB3_2
1366; GFX1064_DPP-NEXT:  ; %bb.1:
1367; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, s0
1368; GFX1064_DPP-NEXT:    ds_add_u32 v0, v3
1369; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
1370; GFX1064_DPP-NEXT:    buffer_gl0_inv
1371; GFX1064_DPP-NEXT:  .LBB3_2:
1372; GFX1064_DPP-NEXT:    s_endpgm
1373;
1374; GFX1032_DPP-LABEL: add_i32_varying_nouse:
1375; GFX1032_DPP:       ; %bb.0: ; %entry
1376; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
1377; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
1378; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1379; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1380; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1381; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1382; GFX1032_DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
1383; GFX1032_DPP-NEXT:    v_add_nc_u32_e32 v1, v1, v2
1384; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
1385; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
1386; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, 0
1387; GFX1032_DPP-NEXT:    v_mov_b32_e32 v3, v1
1388; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
1389; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
1390; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB3_2
1391; GFX1032_DPP-NEXT:  ; %bb.1:
1392; GFX1032_DPP-NEXT:    ds_add_u32 v0, v3
1393; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
1394; GFX1032_DPP-NEXT:    buffer_gl0_inv
1395; GFX1032_DPP-NEXT:  .LBB3_2:
1396; GFX1032_DPP-NEXT:    s_endpgm
1397;
1398; GFX1164_DPP-LABEL: add_i32_varying_nouse:
1399; GFX1164_DPP:       ; %bb.0: ; %entry
1400; GFX1164_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1401; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
1402; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
1403; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
1404; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1405; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1406; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1407; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1408; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1409; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1410; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1411; GFX1164_DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
1412; GFX1164_DPP-NEXT:    v_add_nc_u32_e32 v1, v1, v2
1413; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
1414; GFX1164_DPP-NEXT:    v_permlane64_b32 v2, v1
1415; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
1416; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1417; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
1418; GFX1164_DPP-NEXT:    s_waitcnt_depctr 0xfffe
1419; GFX1164_DPP-NEXT:    v_add_nc_u32_e32 v1, v1, v2
1420; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
1421; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
1422; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
1423; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, 0
1424; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, v1
1425; GFX1164_DPP-NEXT:    s_mov_b64 s[0:1], exec
1426; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3)
1427; GFX1164_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
1428; GFX1164_DPP-NEXT:    s_cbranch_execz .LBB3_2
1429; GFX1164_DPP-NEXT:  ; %bb.1:
1430; GFX1164_DPP-NEXT:    ds_add_u32 v0, v3
1431; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
1432; GFX1164_DPP-NEXT:    buffer_gl0_inv
1433; GFX1164_DPP-NEXT:  .LBB3_2:
1434; GFX1164_DPP-NEXT:    s_endpgm
1435;
1436; GFX1132_DPP-LABEL: add_i32_varying_nouse:
1437; GFX1132_DPP:       ; %bb.0: ; %entry
1438; GFX1132_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1439; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
1440; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
1441; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
1442; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1443; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1444; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1445; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1446; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1447; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1448; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1449; GFX1132_DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
1450; GFX1132_DPP-NEXT:    v_add_nc_u32_e32 v1, v1, v2
1451; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
1452; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
1453; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
1454; GFX1132_DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1
1455; GFX1132_DPP-NEXT:    s_mov_b32 s0, exec_lo
1456; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1457; GFX1132_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
1458; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB3_2
1459; GFX1132_DPP-NEXT:  ; %bb.1:
1460; GFX1132_DPP-NEXT:    ds_add_u32 v0, v3
1461; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
1462; GFX1132_DPP-NEXT:    buffer_gl0_inv
1463; GFX1132_DPP-NEXT:  .LBB3_2:
1464; GFX1132_DPP-NEXT:    s_endpgm
1465entry:
1466  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1467  %old = atomicrmw add ptr addrspace(3) @local_var32, i32 %lane acq_rel
1468  ret void
1469}
1470
; add_i64_constant: atomically adds the constant i64 5 to @local_var64 in
; addrspace(3) with acq_rel ordering and stores the returned old value through
; %out (addrspace(1)).
;
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
; Only the per-target prefixes appear for this function -- there are no
; _ITERATIVE or _DPP specific blocks.
;
; Shape the assertions pin for every target:
;   * v_mbcnt_* over a copy of the exec mask yields a per-lane value (v2), and
;     only lanes where it compares equal to 0 enter the conditional block,
;   * inside that block s_bcnt1 of the copied exec mask is multiplied by 5 and
;     the product is the data operand of a single ds_add_rtn_u64,
;   * after the s_or that rejoins exec, v_readfirstlane reads the returned
;     register pair and v2 * 5 is added to it before the 64-bit buffer store.
; NOTE(review): presumably this is the effect of the atomic optimizer selected
; on the RUN lines -- confirm against the pass before relying on it.
1471define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
1472; GFX7LESS-LABEL: add_i64_constant:
1473; GFX7LESS:       ; %bb.0: ; %entry
1474; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
1475; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1476; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s3, v0
1477; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1478; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
1479; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1480; GFX7LESS-NEXT:    s_cbranch_execz .LBB4_2
1481; GFX7LESS-NEXT:  ; %bb.1:
1482; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1483; GFX7LESS-NEXT:    s_mul_i32 s2, s2, 5
1484; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1485; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s2
1486; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1487; GFX7LESS-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1488; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1489; GFX7LESS-NEXT:  .LBB4_2:
1490; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
1491; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1492; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1493; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1494; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
1495; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v0
1496; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
1497; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
1498; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
1499; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s5, v0
1500; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
1501; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1502; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1503; GFX7LESS-NEXT:    s_endpgm
1504;
1505; GFX8-LABEL: add_i64_constant:
1506; GFX8:       ; %bb.0: ; %entry
1507; GFX8-NEXT:    s_mov_b64 s[2:3], exec
1508; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1509; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s3, v0
1510; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1511; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
1512; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1513; GFX8-NEXT:    s_cbranch_execz .LBB4_2
1514; GFX8-NEXT:  ; %bb.1:
1515; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1516; GFX8-NEXT:    s_mul_i32 s2, s2, 5
1517; GFX8-NEXT:    v_mov_b32_e32 v0, s2
1518; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1519; GFX8-NEXT:    s_mov_b32 m0, -1
1520; GFX8-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1521; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1522; GFX8-NEXT:  .LBB4_2:
1523; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
1524; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
1525; GFX8-NEXT:    v_readfirstlane_b32 s3, v0
1526; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1527; GFX8-NEXT:    v_mov_b32_e32 v0, s3
1528; GFX8-NEXT:    v_mov_b32_e32 v1, s2
1529; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
1530; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1531; GFX8-NEXT:    s_mov_b32 s2, -1
1532; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1533; GFX8-NEXT:    s_nop 1
1534; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1535; GFX8-NEXT:    s_endpgm
1536;
1537; GFX9-LABEL: add_i64_constant:
1538; GFX9:       ; %bb.0: ; %entry
1539; GFX9-NEXT:    s_mov_b64 s[2:3], exec
1540; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1541; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s3, v0
1542; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1543; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
1544; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1545; GFX9-NEXT:    s_cbranch_execz .LBB4_2
1546; GFX9-NEXT:  ; %bb.1:
1547; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1548; GFX9-NEXT:    s_mul_i32 s2, s2, 5
1549; GFX9-NEXT:    v_mov_b32_e32 v0, s2
1550; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1551; GFX9-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1552; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1553; GFX9-NEXT:  .LBB4_2:
1554; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
1555; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
1556; GFX9-NEXT:    v_readfirstlane_b32 s3, v0
1557; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1558; GFX9-NEXT:    v_mov_b32_e32 v0, s3
1559; GFX9-NEXT:    v_mov_b32_e32 v1, s2
1560; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
1561; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1562; GFX9-NEXT:    s_mov_b32 s2, -1
1563; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1564; GFX9-NEXT:    s_nop 1
1565; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1566; GFX9-NEXT:    s_endpgm
1567;
1568; GFX1064-LABEL: add_i64_constant:
1569; GFX1064:       ; %bb.0: ; %entry
1570; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
1571; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1572; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s3, v0
1573; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
1574; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1575; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1576; GFX1064-NEXT:    s_cbranch_execz .LBB4_2
1577; GFX1064-NEXT:  ; %bb.1:
1578; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1579; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
1580; GFX1064-NEXT:    s_mul_i32 s2, s2, 5
1581; GFX1064-NEXT:    v_mov_b32_e32 v0, s2
1582; GFX1064-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1583; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1584; GFX1064-NEXT:    buffer_gl0_inv
1585; GFX1064-NEXT:  .LBB4_2:
1586; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1587; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
1588; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1589; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
1590; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
1591; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3]
1592; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1593; GFX1064-NEXT:    s_mov_b32 s2, -1
1594; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1595; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1596; GFX1064-NEXT:    s_endpgm
1597;
1598; GFX1032-LABEL: add_i64_constant:
1599; GFX1032:       ; %bb.0: ; %entry
1600; GFX1032-NEXT:    s_mov_b32 s1, exec_lo
1601; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
1602; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s1, 0
1603; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
1604; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
1605; GFX1032-NEXT:    s_cbranch_execz .LBB4_2
1606; GFX1032-NEXT:  ; %bb.1:
1607; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s1
1608; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1609; GFX1032-NEXT:    s_mul_i32 s1, s1, 5
1610; GFX1032-NEXT:    v_mov_b32_e32 v0, s1
1611; GFX1032-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1612; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1613; GFX1032-NEXT:    buffer_gl0_inv
1614; GFX1032-NEXT:  .LBB4_2:
1615; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1616; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1617; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1618; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
1619; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
1620; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3]
1621; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1622; GFX1032-NEXT:    s_mov_b32 s2, -1
1623; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1624; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1625; GFX1032-NEXT:    s_endpgm
1626;
1627; GFX1164-LABEL: add_i64_constant:
1628; GFX1164:       ; %bb.0: ; %entry
1629; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
1630; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
1631; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1632; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1633; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, s3, v0
1634; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
1635; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
1636; GFX1164-NEXT:    s_cbranch_execz .LBB4_2
1637; GFX1164-NEXT:  ; %bb.1:
1638; GFX1164-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1639; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
1640; GFX1164-NEXT:    s_mul_i32 s2, s2, 5
1641; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1642; GFX1164-NEXT:    v_mov_b32_e32 v0, s2
1643; GFX1164-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1644; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1645; GFX1164-NEXT:    buffer_gl0_inv
1646; GFX1164-NEXT:  .LBB4_2:
1647; GFX1164-NEXT:    s_or_b64 exec, exec, s[0:1]
1648; GFX1164-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1649; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
1650; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
1651; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1652; GFX1164-NEXT:    v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
1653; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
1654; GFX1164-NEXT:    s_mov_b32 s2, -1
1655; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1656; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1657; GFX1164-NEXT:    s_endpgm
1658;
1659; GFX1132-LABEL: add_i64_constant:
1660; GFX1132:       ; %bb.0: ; %entry
1661; GFX1132-NEXT:    s_mov_b32 s1, exec_lo
1662; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
1663; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, s1, 0
1664; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
1665; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1666; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
1667; GFX1132-NEXT:    s_cbranch_execz .LBB4_2
1668; GFX1132-NEXT:  ; %bb.1:
1669; GFX1132-NEXT:    s_bcnt1_i32_b32 s1, s1
1670; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
1671; GFX1132-NEXT:    s_mul_i32 s1, s1, 5
1672; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1673; GFX1132-NEXT:    v_mov_b32_e32 v0, s1
1674; GFX1132-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
1675; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1676; GFX1132-NEXT:    buffer_gl0_inv
1677; GFX1132-NEXT:  .LBB4_2:
1678; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1679; GFX1132-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1680; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
1681; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
1682; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1683; GFX1132-NEXT:    v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
1684; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
1685; GFX1132-NEXT:    s_mov_b32 s2, -1
1686; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1687; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1688; GFX1132-NEXT:    s_endpgm
; IR under test: the atomicrmw result (%old) is consumed by the store, so the
; returning form of the atomic is required here.
1689entry:
1690  %old = atomicrmw add ptr addrspace(3) @local_var64, i64 5 acq_rel
1691  store i64 %old, ptr addrspace(1) %out
1692  ret void
1693}
1694
1695define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) {
1696; GFX7LESS-LABEL: add_i64_uniform:
1697; GFX7LESS:       ; %bb.0: ; %entry
1698; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
1699; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1700; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
1701; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s7, v0
1702; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1703; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
1704; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1705; GFX7LESS-NEXT:    s_cbranch_execz .LBB5_2
1706; GFX7LESS-NEXT:  ; %bb.1:
1707; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1708; GFX7LESS-NEXT:    v_mov_b32_e32 v3, 0
1709; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1710; GFX7LESS-NEXT:    s_mul_i32 s7, s3, s6
1711; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
1712; GFX7LESS-NEXT:    v_mul_hi_u32 v0, s2, v0
1713; GFX7LESS-NEXT:    s_mul_i32 s6, s2, s6
1714; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, s7, v0
1715; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
1716; GFX7LESS-NEXT:    s_mov_b32 m0, -1
1717; GFX7LESS-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1718; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1719; GFX7LESS-NEXT:  .LBB5_2:
1720; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
1721; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
1722; GFX7LESS-NEXT:    s_mov_b32 s6, -1
1723; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1724; GFX7LESS-NEXT:    s_mov_b32 s4, s0
1725; GFX7LESS-NEXT:    s_mov_b32 s5, s1
1726; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
1727; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v0
1728; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s3, v2
1729; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s2, v2
1730; GFX7LESS-NEXT:    v_mul_lo_u32 v2, s2, v2
1731; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
1732; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s0
1733; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s1, v2
1734; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
1735; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1736; GFX7LESS-NEXT:    s_endpgm
1737;
1738; GFX8-LABEL: add_i64_uniform:
1739; GFX8:       ; %bb.0: ; %entry
1740; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1741; GFX8-NEXT:    s_mov_b64 s[6:7], exec
1742; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1743; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
1744; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1745; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
1746; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1747; GFX8-NEXT:    s_cbranch_execz .LBB5_2
1748; GFX8-NEXT:  ; %bb.1:
1749; GFX8-NEXT:    s_bcnt1_i32_b64 s8, s[6:7]
1750; GFX8-NEXT:    v_mov_b32_e32 v0, s8
1751; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1752; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0
1753; GFX8-NEXT:    s_mul_i32 s6, s3, s8
1754; GFX8-NEXT:    v_mov_b32_e32 v3, 0
1755; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s6, v1
1756; GFX8-NEXT:    s_mov_b32 m0, -1
1757; GFX8-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1758; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1759; GFX8-NEXT:  .LBB5_2:
1760; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1761; GFX8-NEXT:    v_readfirstlane_b32 s4, v1
1762; GFX8-NEXT:    v_readfirstlane_b32 s5, v0
1763; GFX8-NEXT:    v_mov_b32_e32 v0, s5
1764; GFX8-NEXT:    v_mov_b32_e32 v1, s4
1765; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1766; GFX8-NEXT:    v_mul_lo_u32 v3, s3, v2
1767; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1]
1768; GFX8-NEXT:    s_mov_b32 s7, 0xf000
1769; GFX8-NEXT:    s_mov_b32 s6, -1
1770; GFX8-NEXT:    s_mov_b32 s4, s0
1771; GFX8-NEXT:    s_mov_b32 s5, s1
1772; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
1773; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1774; GFX8-NEXT:    s_endpgm
1775;
1776; GFX9-LABEL: add_i64_uniform:
1777; GFX9:       ; %bb.0: ; %entry
1778; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1779; GFX9-NEXT:    s_mov_b64 s[6:7], exec
1780; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1781; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
1782; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1783; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
1784; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1785; GFX9-NEXT:    s_cbranch_execz .LBB5_2
1786; GFX9-NEXT:  ; %bb.1:
1787; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1788; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1789; GFX9-NEXT:    s_mul_i32 s7, s3, s6
1790; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
1791; GFX9-NEXT:    s_add_i32 s8, s8, s7
1792; GFX9-NEXT:    s_mul_i32 s6, s2, s6
1793; GFX9-NEXT:    v_mov_b32_e32 v0, s6
1794; GFX9-NEXT:    v_mov_b32_e32 v1, s8
1795; GFX9-NEXT:    v_mov_b32_e32 v3, 0
1796; GFX9-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1797; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1798; GFX9-NEXT:  .LBB5_2:
1799; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1800; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
1801; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
1802; GFX9-NEXT:    v_mov_b32_e32 v0, s5
1803; GFX9-NEXT:    v_mov_b32_e32 v1, s4
1804; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1805; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s2, v2, v[0:1]
1806; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1807; GFX9-NEXT:    s_mov_b32 s6, -1
1808; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2]
1809; GFX9-NEXT:    s_mov_b32 s4, s0
1810; GFX9-NEXT:    s_mov_b32 s5, s1
1811; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1812; GFX9-NEXT:    s_endpgm
1813;
1814; GFX1064-LABEL: add_i64_uniform:
1815; GFX1064:       ; %bb.0: ; %entry
1816; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1817; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
1818; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1819; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
1820; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
1821; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1822; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1823; GFX1064-NEXT:    s_cbranch_execz .LBB5_2
1824; GFX1064-NEXT:  ; %bb.1:
1825; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1826; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
1827; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1828; GFX1064-NEXT:    s_mul_i32 s7, s3, s6
1829; GFX1064-NEXT:    s_mul_hi_u32 s8, s2, s6
1830; GFX1064-NEXT:    s_mul_i32 s6, s2, s6
1831; GFX1064-NEXT:    s_add_i32 s8, s8, s7
1832; GFX1064-NEXT:    v_mov_b32_e32 v0, s6
1833; GFX1064-NEXT:    v_mov_b32_e32 v1, s8
1834; GFX1064-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1835; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1836; GFX1064-NEXT:    buffer_gl0_inv
1837; GFX1064-NEXT:  .LBB5_2:
1838; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1839; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1840; GFX1064-NEXT:    v_readfirstlane_b32 s5, v1
1841; GFX1064-NEXT:    v_readfirstlane_b32 s4, v0
1842; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1843; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s2, v2, s[4:5]
1844; GFX1064-NEXT:    v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2]
1845; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1846; GFX1064-NEXT:    s_mov_b32 s2, -1
1847; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1848; GFX1064-NEXT:    s_endpgm
1849;
1850; GFX1032-LABEL: add_i64_uniform:
1851; GFX1032:       ; %bb.0: ; %entry
1852; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1853; GFX1032-NEXT:    s_mov_b32 s6, exec_lo
1854; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
1855; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s6, 0
1856; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
1857; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1858; GFX1032-NEXT:    s_cbranch_execz .LBB5_2
1859; GFX1032-NEXT:  ; %bb.1:
1860; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s6
1861; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
1862; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1863; GFX1032-NEXT:    s_mul_i32 s6, s3, s5
1864; GFX1032-NEXT:    s_mul_hi_u32 s7, s2, s5
1865; GFX1032-NEXT:    s_mul_i32 s5, s2, s5
1866; GFX1032-NEXT:    s_add_i32 s7, s7, s6
1867; GFX1032-NEXT:    v_mov_b32_e32 v0, s5
1868; GFX1032-NEXT:    v_mov_b32_e32 v1, s7
1869; GFX1032-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1870; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1871; GFX1032-NEXT:    buffer_gl0_inv
1872; GFX1032-NEXT:  .LBB5_2:
1873; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1874; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1875; GFX1032-NEXT:    v_readfirstlane_b32 s5, v1
1876; GFX1032-NEXT:    v_readfirstlane_b32 s4, v0
1877; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1878; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s2, s2, v2, s[4:5]
1879; GFX1032-NEXT:    v_mad_u64_u32 v[1:2], s2, s3, v2, v[1:2]
1880; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1881; GFX1032-NEXT:    s_mov_b32 s2, -1
1882; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1883; GFX1032-NEXT:    s_endpgm
1884;
1885; GFX1164-LABEL: add_i64_uniform:
1886; GFX1164:       ; %bb.0: ; %entry
1887; GFX1164-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1888; GFX1164-NEXT:    s_mov_b64 s[6:7], exec
1889; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
1890; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1891; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1892; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
1893; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
1894; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
1895; GFX1164-NEXT:    s_cbranch_execz .LBB5_2
1896; GFX1164-NEXT:  ; %bb.1:
1897; GFX1164-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1898; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
1899; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1900; GFX1164-NEXT:    s_mul_i32 s7, s3, s6
1901; GFX1164-NEXT:    s_mul_hi_u32 s8, s2, s6
1902; GFX1164-NEXT:    s_mul_i32 s6, s2, s6
1903; GFX1164-NEXT:    s_add_i32 s8, s8, s7
1904; GFX1164-NEXT:    v_mov_b32_e32 v0, s6
1905; GFX1164-NEXT:    v_mov_b32_e32 v1, s8
1906; GFX1164-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1907; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1908; GFX1164-NEXT:    buffer_gl0_inv
1909; GFX1164-NEXT:  .LBB5_2:
1910; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
1911; GFX1164-NEXT:    v_readfirstlane_b32 s5, v1
1912; GFX1164-NEXT:    v_readfirstlane_b32 s4, v0
1913; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1914; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
1915; GFX1164-NEXT:    v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5]
1916; GFX1164-NEXT:    s_mov_b32 s2, -1
1917; GFX1164-NEXT:    v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2]
1918; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
1919; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1920; GFX1164-NEXT:    v_mov_b32_e32 v1, v3
1921; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1922; GFX1164-NEXT:    s_endpgm
1923;
1924; GFX1132-LABEL: add_i64_uniform:
1925; GFX1132:       ; %bb.0: ; %entry
1926; GFX1132-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1927; GFX1132-NEXT:    s_mov_b32 s6, exec_lo
1928; GFX1132-NEXT:    s_mov_b32 s4, exec_lo
1929; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, s6, 0
1930; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
1931; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1932; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
1933; GFX1132-NEXT:    s_cbranch_execz .LBB5_2
1934; GFX1132-NEXT:  ; %bb.1:
1935; GFX1132-NEXT:    s_bcnt1_i32_b32 s5, s6
1936; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
1937; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1938; GFX1132-NEXT:    s_mul_i32 s6, s3, s5
1939; GFX1132-NEXT:    s_mul_hi_u32 s7, s2, s5
1940; GFX1132-NEXT:    s_mul_i32 s5, s2, s5
1941; GFX1132-NEXT:    s_add_i32 s7, s7, s6
1942; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1943; GFX1132-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s7
1944; GFX1132-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
1945; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1946; GFX1132-NEXT:    buffer_gl0_inv
1947; GFX1132-NEXT:  .LBB5_2:
1948; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1949; GFX1132-NEXT:    v_readfirstlane_b32 s5, v1
1950; GFX1132-NEXT:    v_readfirstlane_b32 s4, v0
1951; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1952; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
1953; GFX1132-NEXT:    v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5]
1954; GFX1132-NEXT:    s_mov_b32 s2, -1
1955; GFX1132-NEXT:    v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2]
1956; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
1957; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1958; GFX1132-NEXT:    v_mov_b32_e32 v1, v3
1959; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1960; GFX1132-NEXT:    s_endpgm
1961entry:
1962  %old = atomicrmw add ptr addrspace(3) @local_var64, i64 %additive acq_rel
1963  store i64 %old, ptr addrspace(1) %out
1964  ret void
1965}
1966
1967define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
1968; GFX7LESS_ITERATIVE-LABEL: add_i64_varying:
1969; GFX7LESS_ITERATIVE:       ; %bb.0: ; %entry
1970; GFX7LESS_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
1971; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
1972; GFX7LESS_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
1973; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
1974; GFX7LESS_ITERATIVE-NEXT:  .LBB6_1: ; %ComputeLoop
1975; GFX7LESS_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
1976; GFX7LESS_ITERATIVE-NEXT:    s_ff1_i32_b64 s6, s[2:3]
1977; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, s6
1978; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s6
1979; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s6
1980; GFX7LESS_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
1981; GFX7LESS_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
1982; GFX7LESS_ITERATIVE-NEXT:    s_add_u32 s0, s0, s8
1983; GFX7LESS_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s7
1984; GFX7LESS_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s6
1985; GFX7LESS_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
1986; GFX7LESS_ITERATIVE-NEXT:    v_cmp_ne_u64_e64 s[6:7], s[2:3], 0
1987; GFX7LESS_ITERATIVE-NEXT:    s_and_b64 vcc, exec, s[6:7]
1988; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_vccnz .LBB6_1
1989; GFX7LESS_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
1990; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
1991; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
1992; GFX7LESS_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1993; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
1994; GFX7LESS_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1995; GFX7LESS_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
1996; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_execz .LBB6_4
1997; GFX7LESS_ITERATIVE-NEXT:  ; %bb.3:
1998; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
1999; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
2000; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
2001; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, -1
2002; GFX7LESS_ITERATIVE-NEXT:    ds_add_rtn_u64 v[3:4], v0, v[3:4]
2003; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
2004; GFX7LESS_ITERATIVE-NEXT:  .LBB6_4:
2005; GFX7LESS_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
2006; GFX7LESS_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
2007; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
2008; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s2, -1
2009; GFX7LESS_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v4
2010; GFX7LESS_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v3
2011; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s4
2012; GFX7LESS_ITERATIVE-NEXT:    v_add_i32_e32 v0, vcc, s5, v1
2013; GFX7LESS_ITERATIVE-NEXT:    v_addc_u32_e32 v1, vcc, v3, v2, vcc
2014; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
2015; GFX7LESS_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2016; GFX7LESS_ITERATIVE-NEXT:    s_endpgm
2017;
2018; GFX8_ITERATIVE-LABEL: add_i64_varying:
2019; GFX8_ITERATIVE:       ; %bb.0: ; %entry
2020; GFX8_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
2021; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
2022; GFX8_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
2023; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
2024; GFX8_ITERATIVE-NEXT:  .LBB6_1: ; %ComputeLoop
2025; GFX8_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
2026; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s6, s[2:3]
2027; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s6
2028; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s6
2029; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s6
2030; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
2031; GFX8_ITERATIVE-NEXT:    s_add_u32 s0, s0, s8
2032; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
2033; GFX8_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s7
2034; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s6
2035; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
2036; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
2037; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB6_1
2038; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
2039; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2040; GFX8_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2041; GFX8_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2042; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
2043; GFX8_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2044; GFX8_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
2045; GFX8_ITERATIVE-NEXT:    s_cbranch_execz .LBB6_4
2046; GFX8_ITERATIVE-NEXT:  ; %bb.3:
2047; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
2048; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
2049; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
2050; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, -1
2051; GFX8_ITERATIVE-NEXT:    ds_add_rtn_u64 v[3:4], v0, v[3:4]
2052; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
2053; GFX8_ITERATIVE-NEXT:  .LBB6_4:
2054; GFX8_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
2055; GFX8_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2056; GFX8_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v4
2057; GFX8_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v3
2058; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s4
2059; GFX8_ITERATIVE-NEXT:    v_add_u32_e32 v0, vcc, s5, v1
2060; GFX8_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
2061; GFX8_ITERATIVE-NEXT:    s_mov_b32 s2, -1
2062; GFX8_ITERATIVE-NEXT:    v_addc_u32_e32 v1, vcc, v3, v2, vcc
2063; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
2064; GFX8_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2065; GFX8_ITERATIVE-NEXT:    s_endpgm
2066;
2067; GFX9_ITERATIVE-LABEL: add_i64_varying:
2068; GFX9_ITERATIVE:       ; %bb.0: ; %entry
2069; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
2070; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
2071; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
2072; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
2073; GFX9_ITERATIVE-NEXT:  .LBB6_1: ; %ComputeLoop
2074; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
2075; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s6, s[2:3]
2076; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s6
2077; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s6
2078; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s6
2079; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
2080; GFX9_ITERATIVE-NEXT:    s_add_u32 s0, s0, s8
2081; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
2082; GFX9_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s7
2083; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s6
2084; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
2085; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
2086; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB6_1
2087; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
2088; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2089; GFX9_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2090; GFX9_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2091; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
2092; GFX9_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2093; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
2094; GFX9_ITERATIVE-NEXT:    s_cbranch_execz .LBB6_4
2095; GFX9_ITERATIVE-NEXT:  ; %bb.3:
2096; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
2097; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
2098; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
2099; GFX9_ITERATIVE-NEXT:    ds_add_rtn_u64 v[3:4], v0, v[3:4]
2100; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
2101; GFX9_ITERATIVE-NEXT:  .LBB6_4:
2102; GFX9_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
2103; GFX9_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2104; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v4
2105; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v3
2106; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s4
2107; GFX9_ITERATIVE-NEXT:    v_add_co_u32_e32 v0, vcc, s5, v1
2108; GFX9_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
2109; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, -1
2110; GFX9_ITERATIVE-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v2, vcc
2111; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
2112; GFX9_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2113; GFX9_ITERATIVE-NEXT:    s_endpgm
2114;
2115; GFX1064_ITERATIVE-LABEL: add_i64_varying:
2116; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
2117; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
2118; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
2119; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
2120; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
2121; GFX1064_ITERATIVE-NEXT:  .LBB6_1: ; %ComputeLoop
2122; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
2123; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s6, s[2:3]
2124; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s7, v0, s6
2125; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s8, v3, s6
2126; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s6
2127; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s6
2128; GFX1064_ITERATIVE-NEXT:    s_add_u32 s0, s0, s7
2129; GFX1064_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s8
2130; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s6
2131; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
2132; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
2133; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB6_1
2134; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
2135; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2136; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
2137; GFX1064_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2138; GFX1064_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2139; GFX1064_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2140; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
2141; GFX1064_ITERATIVE-NEXT:    s_cbranch_execz .LBB6_4
2142; GFX1064_ITERATIVE-NEXT:  ; %bb.3:
2143; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
2144; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
2145; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
2146; GFX1064_ITERATIVE-NEXT:    ds_add_rtn_u64 v[3:4], v0, v[3:4]
2147; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
2148; GFX1064_ITERATIVE-NEXT:    buffer_gl0_inv
2149; GFX1064_ITERATIVE-NEXT:  .LBB6_4:
2150; GFX1064_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
2151; GFX1064_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
2152; GFX1064_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2153; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
2154; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v4
2155; GFX1064_ITERATIVE-NEXT:    v_add_co_u32 v0, vcc, s2, v1
2156; GFX1064_ITERATIVE-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc
2157; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
2158; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, -1
2159; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
2160; GFX1064_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2161; GFX1064_ITERATIVE-NEXT:    s_endpgm
2162;
2163; GFX1032_ITERATIVE-LABEL: add_i64_varying:
2164; GFX1032_ITERATIVE:       ; %bb.0: ; %entry
2165; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
2166; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, exec_lo
2167; GFX1032_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
2168; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
2169; GFX1032_ITERATIVE-NEXT:  .LBB6_1: ; %ComputeLoop
2170; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
2171; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s3, s2
2172; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
2173; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
2174; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s3
2175; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s3
2176; GFX1032_ITERATIVE-NEXT:    s_add_u32 s0, s0, s6
2177; GFX1032_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s7
2178; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
2179; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s2, s2, s3
2180; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
2181; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB6_1
2182; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
2183; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2184; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
2185; GFX1032_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2186; GFX1032_ITERATIVE-NEXT:    s_and_saveexec_b32 s2, vcc_lo
2187; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s2, exec_lo, s2
2188; GFX1032_ITERATIVE-NEXT:    s_cbranch_execz .LBB6_4
2189; GFX1032_ITERATIVE-NEXT:  ; %bb.3:
2190; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
2191; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
2192; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
2193; GFX1032_ITERATIVE-NEXT:    ds_add_rtn_u64 v[3:4], v0, v[3:4]
2194; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
2195; GFX1032_ITERATIVE-NEXT:    buffer_gl0_inv
2196; GFX1032_ITERATIVE-NEXT:  .LBB6_4:
2197; GFX1032_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
2198; GFX1032_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s2
2199; GFX1032_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2200; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
2201; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v4
2202; GFX1032_ITERATIVE-NEXT:    v_add_co_u32 v0, vcc_lo, s2, v1
2203; GFX1032_ITERATIVE-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
2204; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
2205; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, -1
2206; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
2207; GFX1032_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2208; GFX1032_ITERATIVE-NEXT:    s_endpgm
2209;
2210; GFX1164_ITERATIVE-LABEL: add_i64_varying:
2211; GFX1164_ITERATIVE:       ; %bb.0: ; %entry
2212; GFX1164_ITERATIVE-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
2213; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
2214; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
2215; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
2216; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
2217; GFX1164_ITERATIVE-NEXT:  .LBB6_1: ; %ComputeLoop
2218; GFX1164_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
2219; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s6, s[2:3]
2220; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
2221; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s7, v2, s6
2222; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s8, v3, s6
2223; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s6
2224; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s6
2225; GFX1164_ITERATIVE-NEXT:    s_add_u32 s0, s0, s7
2226; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
2227; GFX1164_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s8
2228; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s6
2229; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[6:7]
2230; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2231; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
2232; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB6_1
2233; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
2234; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
2235; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2236; GFX1164_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
2237; GFX1164_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2238; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr2_vgpr3
2239; GFX1164_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2240; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2241; GFX1164_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
2242; GFX1164_ITERATIVE-NEXT:    s_cbranch_execz .LBB6_4
2243; GFX1164_ITERATIVE-NEXT:  ; %bb.3:
2244; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s1
2245; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v4, 0
2246; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s0
2247; GFX1164_ITERATIVE-NEXT:    ds_add_rtn_u64 v[2:3], v4, v[2:3]
2248; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
2249; GFX1164_ITERATIVE-NEXT:    buffer_gl0_inv
2250; GFX1164_ITERATIVE-NEXT:  .LBB6_4:
2251; GFX1164_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
2252; GFX1164_ITERATIVE-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2253; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v2
2254; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v3
2255; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2256; GFX1164_ITERATIVE-NEXT:    v_add_co_u32 v0, vcc, s2, v0
2257; GFX1164_ITERATIVE-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
2258; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
2259; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
2260; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
2261; GFX1164_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
2262; GFX1164_ITERATIVE-NEXT:    s_endpgm
2263;
2264; GFX1132_ITERATIVE-LABEL: add_i64_varying:
2265; GFX1132_ITERATIVE:       ; %bb.0: ; %entry
2266; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0
2267; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, exec_lo
2268; GFX1132_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
2269; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
2270; GFX1132_ITERATIVE-NEXT:  .LBB6_1: ; %ComputeLoop
2271; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
2272; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s3, s2
2273; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
2274; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
2275; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
2276; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
2277; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s3
2278; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
2279; GFX1132_ITERATIVE-NEXT:    s_add_u32 s0, s0, s6
2280; GFX1132_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s7
2281; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
2282; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
2283; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s2, s2, s3
2284; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
2285; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB6_1
2286; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
2287; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
2288; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
2289; GFX1132_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
2290; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr2_vgpr3
2291; GFX1132_ITERATIVE-NEXT:    s_and_saveexec_b32 s2, vcc_lo
2292; GFX1132_ITERATIVE-NEXT:    s_xor_b32 s2, exec_lo, s2
2293; GFX1132_ITERATIVE-NEXT:    s_cbranch_execz .LBB6_4
2294; GFX1132_ITERATIVE-NEXT:  ; %bb.3:
2295; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1
2296; GFX1132_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s0
2297; GFX1132_ITERATIVE-NEXT:    ds_add_rtn_u64 v[2:3], v4, v[2:3]
2298; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
2299; GFX1132_ITERATIVE-NEXT:    buffer_gl0_inv
2300; GFX1132_ITERATIVE-NEXT:  .LBB6_4:
2301; GFX1132_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s2
2302; GFX1132_ITERATIVE-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2303; GFX1132_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v2
2304; GFX1132_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v3
2305; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2306; GFX1132_ITERATIVE-NEXT:    v_add_co_u32 v0, vcc_lo, s2, v0
2307; GFX1132_ITERATIVE-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
2308; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
2309; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
2310; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
2311; GFX1132_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
2312; GFX1132_ITERATIVE-NEXT:    s_endpgm
2313;
2314; GFX7LESS_DPP-LABEL: add_i64_varying:
2315; GFX7LESS_DPP:       ; %bb.0: ; %entry
2316; GFX7LESS_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
2317; GFX7LESS_DPP-NEXT:    v_mov_b32_e32 v1, 0
2318; GFX7LESS_DPP-NEXT:    s_mov_b32 m0, -1
2319; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
2320; GFX7LESS_DPP-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
2321; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
2322; GFX7LESS_DPP-NEXT:    s_mov_b32 s3, 0xf000
2323; GFX7LESS_DPP-NEXT:    s_mov_b32 s2, -1
2324; GFX7LESS_DPP-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2325; GFX7LESS_DPP-NEXT:    s_endpgm
2326;
2327; GFX8_DPP-LABEL: add_i64_varying:
2328; GFX8_DPP:       ; %bb.0: ; %entry
2329; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v5, exec_lo, 0
2330; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, 0
2331; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v5, exec_hi, v5
2332; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
2333; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[0:1]
2334; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
2335; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[0:1]
2336; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, 0
2337; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2338; GFX8_DPP-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
2339; GFX8_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2340; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
2341; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
2342; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, 0
2343; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf
2344; GFX8_DPP-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
2345; GFX8_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:2 row_mask:0xf bank_mask:0xf
2346; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
2347; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
2348; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, 0
2349; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
2350; GFX8_DPP-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
2351; GFX8_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf
2352; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
2353; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
2354; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, 0
2355; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf
2356; GFX8_DPP-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
2357; GFX8_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:8 row_mask:0xf bank_mask:0xf
2358; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
2359; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
2360; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, 0
2361; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2362; GFX8_DPP-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
2363; GFX8_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
2364; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
2365; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
2366; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, 0
2367; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2368; GFX8_DPP-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
2369; GFX8_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
2370; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
2371; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
2372; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, 0
2373; GFX8_DPP-NEXT:    v_readlane_b32 s3, v1, 63
2374; GFX8_DPP-NEXT:    v_readlane_b32 s2, v2, 63
2375; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
2376; GFX8_DPP-NEXT:    v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2377; GFX8_DPP-NEXT:    s_mov_b64 exec, s[0:1]
2378; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
2379; GFX8_DPP-NEXT:    ; implicit-def: $vgpr5_vgpr6
2380; GFX8_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
2381; GFX8_DPP-NEXT:    s_cbranch_execz .LBB6_2
2382; GFX8_DPP-NEXT:  ; %bb.1:
2383; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, s3
2384; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, s2
2385; GFX8_DPP-NEXT:    s_mov_b32 m0, -1
2386; GFX8_DPP-NEXT:    ds_add_rtn_u64 v[5:6], v7, v[5:6]
2387; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
2388; GFX8_DPP-NEXT:  .LBB6_2:
2389; GFX8_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
2390; GFX8_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2391; GFX8_DPP-NEXT:    v_readfirstlane_b32 s4, v6
2392; GFX8_DPP-NEXT:    v_readfirstlane_b32 s5, v5
2393; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v3
2394; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v4
2395; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, s4
2396; GFX8_DPP-NEXT:    v_add_u32_e32 v5, vcc, s5, v5
2397; GFX8_DPP-NEXT:    s_mov_b32 s3, 0xf000
2398; GFX8_DPP-NEXT:    s_mov_b32 s2, -1
2399; GFX8_DPP-NEXT:    v_addc_u32_e32 v6, vcc, v0, v6, vcc
2400; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
2401; GFX8_DPP-NEXT:    buffer_store_dwordx2 v[5:6], off, s[0:3], 0
2402; GFX8_DPP-NEXT:    s_endpgm
2403;
2404; GFX9_DPP-LABEL: add_i64_varying:
2405; GFX9_DPP:       ; %bb.0: ; %entry
2406; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v5, exec_lo, 0
2407; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, 0
2408; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v5, exec_hi, v5
2409; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
2410; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[0:1]
2411; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
2412; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[0:1]
2413; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
2414; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2415; GFX9_DPP-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
2416; GFX9_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2417; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
2418; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
2419; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
2420; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf
2421; GFX9_DPP-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
2422; GFX9_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:2 row_mask:0xf bank_mask:0xf
2423; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
2424; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
2425; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
2426; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
2427; GFX9_DPP-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
2428; GFX9_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf
2429; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
2430; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
2431; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
2432; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf
2433; GFX9_DPP-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
2434; GFX9_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:8 row_mask:0xf bank_mask:0xf
2435; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
2436; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
2437; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
2438; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2439; GFX9_DPP-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
2440; GFX9_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
2441; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
2442; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
2443; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
2444; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2445; GFX9_DPP-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
2446; GFX9_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
2447; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
2448; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
2449; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
2450; GFX9_DPP-NEXT:    v_readlane_b32 s3, v1, 63
2451; GFX9_DPP-NEXT:    v_readlane_b32 s2, v2, 63
2452; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
2453; GFX9_DPP-NEXT:    v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2454; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
2455; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
2456; GFX9_DPP-NEXT:    ; implicit-def: $vgpr5_vgpr6
2457; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
2458; GFX9_DPP-NEXT:    s_cbranch_execz .LBB6_2
2459; GFX9_DPP-NEXT:  ; %bb.1:
2460; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, s3
2461; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, s2
2462; GFX9_DPP-NEXT:    ds_add_rtn_u64 v[5:6], v7, v[5:6]
2463; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
2464; GFX9_DPP-NEXT:  .LBB6_2:
2465; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
2466; GFX9_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2467; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v6
2468; GFX9_DPP-NEXT:    v_readfirstlane_b32 s5, v5
2469; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v3
2470; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v4
2471; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s4
2472; GFX9_DPP-NEXT:    v_add_co_u32_e32 v5, vcc, s5, v5
2473; GFX9_DPP-NEXT:    s_mov_b32 s3, 0xf000
2474; GFX9_DPP-NEXT:    s_mov_b32 s2, -1
2475; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v6, vcc, v0, v6, vcc
2476; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
2477; GFX9_DPP-NEXT:    buffer_store_dwordx2 v[5:6], off, s[0:3], 0
2478; GFX9_DPP-NEXT:    s_endpgm
2479;
2480; GFX1064_DPP-LABEL: add_i64_varying:
2481; GFX1064_DPP:       ; %bb.0: ; %entry
2482; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
2483; GFX1064_DPP-NEXT:    v_mov_b32_e32 v1, 0
2484; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[0:1]
2485; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, 0, s[0:1]
2486; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, 0
2487; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, 0
2488; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, 0
2489; GFX1064_DPP-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2490; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, 0
2491; GFX1064_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
2492; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, 0
2493; GFX1064_DPP-NEXT:    v_add_co_u32 v1, vcc, v2, v1
2494; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc, v3, v4, vcc
2495; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, 0
2496; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf
2497; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, 0
2498; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:2 row_mask:0xf bank_mask:0xf
2499; GFX1064_DPP-NEXT:    v_add_co_u32 v1, vcc, v1, v6
2500; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc
2501; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, 0
2502; GFX1064_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:4 row_mask:0xf bank_mask:0xf
2503; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, 0
2504; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf
2505; GFX1064_DPP-NEXT:    v_add_co_u32 v1, vcc, v1, v4
2506; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc, v2, v3, vcc
2507; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, 0
2508; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf
2509; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:8 row_mask:0xf bank_mask:0xf
2510; GFX1064_DPP-NEXT:    v_add_co_u32 v1, vcc, v1, v6
2511; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc
2512; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, 0
2513; GFX1064_DPP-NEXT:    v_permlanex16_b32 v4, v1, -1, -1
2514; GFX1064_DPP-NEXT:    v_permlanex16_b32 v6, v2, -1, -1
2515; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2516; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2517; GFX1064_DPP-NEXT:    v_add_co_u32 v1, vcc, v1, v3
2518; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc
2519; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, 0
2520; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 31
2521; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, 0
2522; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v2, 31
2523; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, s2
2524; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, s3
2525; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2526; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2527; GFX1064_DPP-NEXT:    v_add_co_u32 v1, vcc, v1, v3
2528; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc
2529; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
2530; GFX1064_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2531; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
2532; GFX1064_DPP-NEXT:    v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2533; GFX1064_DPP-NEXT:    v_mov_b32_dpp v8, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2534; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v2, 15
2535; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v1, 15
2536; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v2, 31
2537; GFX1064_DPP-NEXT:    v_readlane_b32 s7, v1, 31
2538; GFX1064_DPP-NEXT:    v_readlane_b32 s8, v1, 47
2539; GFX1064_DPP-NEXT:    v_writelane_b32 v8, s2, 16
2540; GFX1064_DPP-NEXT:    v_writelane_b32 v7, s3, 16
2541; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 63
2542; GFX1064_DPP-NEXT:    v_readlane_b32 s9, v2, 47
2543; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v2, 63
2544; GFX1064_DPP-NEXT:    v_writelane_b32 v8, s6, 32
2545; GFX1064_DPP-NEXT:    v_writelane_b32 v7, s7, 32
2546; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
2547; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v9, exec_hi, v0
2548; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, 0
2549; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
2550; GFX1064_DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
2551; GFX1064_DPP-NEXT:    v_writelane_b32 v8, s9, 48
2552; GFX1064_DPP-NEXT:    v_writelane_b32 v7, s8, 48
2553; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[6:7]
2554; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
2555; GFX1064_DPP-NEXT:    s_mov_b32 s2, -1
2556; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
2557; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[6:7], vcc
2558; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB6_2
2559; GFX1064_DPP-NEXT:  ; %bb.1:
2560; GFX1064_DPP-NEXT:    v_mov_b32_e32 v10, s1
2561; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, s0
2562; GFX1064_DPP-NEXT:    ds_add_rtn_u64 v[9:10], v0, v[9:10]
2563; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
2564; GFX1064_DPP-NEXT:    buffer_gl0_inv
2565; GFX1064_DPP-NEXT:  .LBB6_2:
2566; GFX1064_DPP-NEXT:    s_waitcnt_depctr 0xffe3
2567; GFX1064_DPP-NEXT:    s_or_b64 exec, exec, s[6:7]
2568; GFX1064_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2569; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s3, v9
2570; GFX1064_DPP-NEXT:    v_mov_b32_e32 v11, v7
2571; GFX1064_DPP-NEXT:    v_mov_b32_e32 v12, v8
2572; GFX1064_DPP-NEXT:    s_mov_b32 null, 0
2573; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s4, v10
2574; GFX1064_DPP-NEXT:    v_add_co_u32 v9, vcc, s3, v11
2575; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
2576; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v10, vcc, s4, v12, vcc
2577; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
2578; GFX1064_DPP-NEXT:    buffer_store_dwordx2 v[9:10], off, s[0:3], 0
2579; GFX1064_DPP-NEXT:    s_endpgm
2580;
2581; GFX1032_DPP-LABEL: add_i64_varying:
2582; GFX1032_DPP:       ; %bb.0: ; %entry
2583; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s2, -1
2584; GFX1032_DPP-NEXT:    v_mov_b32_e32 v1, 0
2585; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s2
2586; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, 0, s2
2587; GFX1032_DPP-NEXT:    v_mov_b32_e32 v4, 0
2588; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, 0
2589; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, 0
2590; GFX1032_DPP-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2591; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, 0
2592; GFX1032_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
2593; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, 0
2594; GFX1032_DPP-NEXT:    v_add_co_u32 v1, vcc_lo, v2, v1
2595; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v3, v4, vcc_lo
2596; GFX1032_DPP-NEXT:    v_mov_b32_e32 v4, 0
2597; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf
2598; GFX1032_DPP-NEXT:    v_mov_b32_e32 v3, 0
2599; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:2 row_mask:0xf bank_mask:0xf
2600; GFX1032_DPP-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v6
2601; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo
2602; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, 0
2603; GFX1032_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:4 row_mask:0xf bank_mask:0xf
2604; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, 0
2605; GFX1032_DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf
2606; GFX1032_DPP-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v4
2607; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v3, vcc_lo
2608; GFX1032_DPP-NEXT:    v_mov_b32_e32 v3, 0
2609; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf
2610; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:8 row_mask:0xf bank_mask:0xf
2611; GFX1032_DPP-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v6
2612; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo
2613; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, 0
2614; GFX1032_DPP-NEXT:    v_permlanex16_b32 v4, v1, -1, -1
2615; GFX1032_DPP-NEXT:    v_permlanex16_b32 v6, v2, -1, -1
2616; GFX1032_DPP-NEXT:    v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2617; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2618; GFX1032_DPP-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v3
2619; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo
2620; GFX1032_DPP-NEXT:    v_readlane_b32 s3, v1, 15
2621; GFX1032_DPP-NEXT:    v_readlane_b32 s0, v1, 31
2622; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v2, 31
2623; GFX1032_DPP-NEXT:    v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2624; GFX1032_DPP-NEXT:    v_mov_b32_dpp v8, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2625; GFX1032_DPP-NEXT:    v_readlane_b32 s6, v2, 15
2626; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s2
2627; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v9, exec_lo, 0
2628; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, 0
2629; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s2, -1
2630; GFX1032_DPP-NEXT:    v_writelane_b32 v8, s6, 16
2631; GFX1032_DPP-NEXT:    v_writelane_b32 v7, s3, 16
2632; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s2
2633; GFX1032_DPP-NEXT:    s_mov_b32 s2, -1
2634; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v9
2635; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
2636; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s3, vcc_lo
2637; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB6_2
2638; GFX1032_DPP-NEXT:  ; %bb.1:
2639; GFX1032_DPP-NEXT:    v_mov_b32_e32 v10, s1
2640; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, s0
2641; GFX1032_DPP-NEXT:    ds_add_rtn_u64 v[9:10], v0, v[9:10]
2642; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
2643; GFX1032_DPP-NEXT:    buffer_gl0_inv
2644; GFX1032_DPP-NEXT:  .LBB6_2:
2645; GFX1032_DPP-NEXT:    s_waitcnt_depctr 0xffe3
2646; GFX1032_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s3
2647; GFX1032_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2648; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s3, v9
2649; GFX1032_DPP-NEXT:    v_mov_b32_e32 v11, v7
2650; GFX1032_DPP-NEXT:    v_mov_b32_e32 v12, v8
2651; GFX1032_DPP-NEXT:    s_mov_b32 null, 0
2652; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s4, v10
2653; GFX1032_DPP-NEXT:    v_add_co_u32 v9, vcc_lo, s3, v11
2654; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
2655; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, s4, v12, vcc_lo
2656; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
2657; GFX1032_DPP-NEXT:    buffer_store_dwordx2 v[9:10], off, s[0:3], 0
2658; GFX1032_DPP-NEXT:    s_endpgm
2659;
2660; GFX1164_DPP-LABEL: add_i64_varying:
2661; GFX1164_DPP:       ; %bb.0: ; %entry
2662; GFX1164_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2663; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
2664; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
2665; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[0:1]
2666; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, 0
2667; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s[0:1]
2668; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, 0
2669; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, 0
2670; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
2671; GFX1164_DPP-NEXT:    v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2672; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2673; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2674; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc
2675; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2676; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, 0
2677; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
2678; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf
2679; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc
2680; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
2681; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2682; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, 0
2683; GFX1164_DPP-NEXT:    v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf
2684; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
2685; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc
2686; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2687; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, 0
2688; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
2689; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf
2690; GFX1164_DPP-NEXT:    v_permlanex16_b32 v5, v2, -1, -1
2691; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2692; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc
2693; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2694; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, 0
2695; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
2696; GFX1164_DPP-NEXT:    v_permlanex16_b32 v4, v1, -1, -1
2697; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2698; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2699; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc
2700; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, 0
2701; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v1, 31
2702; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2703; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, s2
2704; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v2, 31
2705; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2706; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2707; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2708; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc
2709; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
2710; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2711; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2712; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
2713; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2714; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v2, 15
2715; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2716; GFX1164_DPP-NEXT:    v_readlane_b32 s3, v1, 15
2717; GFX1164_DPP-NEXT:    v_readlane_b32 s6, v2, 31
2718; GFX1164_DPP-NEXT:    v_readlane_b32 s7, v1, 31
2719; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s2, 16
2720; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v2, 63
2721; GFX1164_DPP-NEXT:    v_writelane_b32 v6, s3, 16
2722; GFX1164_DPP-NEXT:    v_readlane_b32 s8, v2, 47
2723; GFX1164_DPP-NEXT:    v_readlane_b32 s9, v1, 47
2724; GFX1164_DPP-NEXT:    v_readlane_b32 s3, v1, 63
2725; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s6, 32
2726; GFX1164_DPP-NEXT:    v_writelane_b32 v6, s7, 32
2727; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
2728; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2729; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v0
2730; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, 0
2731; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
2732; GFX1164_DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
2733; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s8, 48
2734; GFX1164_DPP-NEXT:    v_writelane_b32 v6, s9, 48
2735; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[6:7]
2736; GFX1164_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
2737; GFX1164_DPP-NEXT:    s_mov_b32 s2, -1
2738; GFX1164_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
2739; GFX1164_DPP-NEXT:    s_and_saveexec_b64 s[6:7], vcc
2740; GFX1164_DPP-NEXT:    s_cbranch_execz .LBB6_2
2741; GFX1164_DPP-NEXT:  ; %bb.1:
2742; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, s1
2743; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, s0
2744; GFX1164_DPP-NEXT:    ds_add_rtn_u64 v[7:8], v0, v[7:8]
2745; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
2746; GFX1164_DPP-NEXT:    buffer_gl0_inv
2747; GFX1164_DPP-NEXT:  .LBB6_2:
2748; GFX1164_DPP-NEXT:    s_or_b64 exec, exec, s[6:7]
2749; GFX1164_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2750; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s3, v7
2751; GFX1164_DPP-NEXT:    v_mov_b32_e32 v9, v5
2752; GFX1164_DPP-NEXT:    v_mov_b32_e32 v10, v6
2753; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s4, v8
2754; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2755; GFX1164_DPP-NEXT:    v_add_co_u32 v7, vcc, s3, v9
2756; GFX1164_DPP-NEXT:    s_mov_b32 s3, 0x31016000
2757; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v8, vcc, s4, v10, vcc
2758; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
2759; GFX1164_DPP-NEXT:    buffer_store_b64 v[7:8], off, s[0:3], 0
2760; GFX1164_DPP-NEXT:    s_endpgm
2761;
2762; GFX1132_DPP-LABEL: add_i64_varying:
2763; GFX1132_DPP:       ; %bb.0: ; %entry
2764; GFX1132_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2765; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s2, -1
2766; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
2767; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s2
2768; GFX1132_DPP-NEXT:    v_mov_b32_e32 v2, 0
2769; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s2
2770; GFX1132_DPP-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v7, 0
2771; GFX1132_DPP-NEXT:    v_mov_b32_e32 v6, 0
2772; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
2773; GFX1132_DPP-NEXT:    v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2774; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2775; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2776; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
2777; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2778; GFX1132_DPP-NEXT:    v_mov_b32_e32 v2, 0
2779; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
2780; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf
2781; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
2782; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
2783; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2784; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, 0
2785; GFX1132_DPP-NEXT:    v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf
2786; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
2787; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
2788; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v2, vcc_lo, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2789; GFX1132_DPP-NEXT:    v_mov_b32_e32 v3, 0
2790; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
2791; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf
2792; GFX1132_DPP-NEXT:    v_permlanex16_b32 v5, v2, -1, -1
2793; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2794; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
2795; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v2, vcc_lo, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2796; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2797; GFX1132_DPP-NEXT:    v_permlanex16_b32 v4, v1, -1, -1
2798; GFX1132_DPP-NEXT:    v_readlane_b32 s3, v2, 15
2799; GFX1132_DPP-NEXT:    v_readlane_b32 s0, v2, 31
2800; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2801; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
2802; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2803; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
2804; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
2805; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v1, 31
2806; GFX1132_DPP-NEXT:    v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2807; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v1, 15
2808; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
2809; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v8, exec_lo, 0
2810; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
2811; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s2, -1
2812; GFX1132_DPP-NEXT:    v_writelane_b32 v6, s3, 16
2813; GFX1132_DPP-NEXT:    v_writelane_b32 v7, s6, 16
2814; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
2815; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
2816; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v8
2817; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr8_vgpr9
2818; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s3, vcc_lo
2819; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB6_2
2820; GFX1132_DPP-NEXT:  ; %bb.1:
2821; GFX1132_DPP-NEXT:    v_dual_mov_b32 v9, s1 :: v_dual_mov_b32 v8, s0
2822; GFX1132_DPP-NEXT:    ds_add_rtn_u64 v[8:9], v0, v[8:9]
2823; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
2824; GFX1132_DPP-NEXT:    buffer_gl0_inv
2825; GFX1132_DPP-NEXT:  .LBB6_2:
2826; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s3
2827; GFX1132_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2828; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s3, v8
2829; GFX1132_DPP-NEXT:    v_mov_b32_e32 v10, v6
2830; GFX1132_DPP-NEXT:    v_mov_b32_e32 v11, v7
2831; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s4, v9
2832; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2833; GFX1132_DPP-NEXT:    v_add_co_u32 v8, vcc_lo, s3, v10
2834; GFX1132_DPP-NEXT:    s_mov_b32 s3, 0x31016000
2835; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, s4, v11, vcc_lo
2836; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
2837; GFX1132_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[0:3], 0
2838; GFX1132_DPP-NEXT:    s_endpgm
2839entry:
2840  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2841  %zext = zext i32 %lane to i64
2842  %old = atomicrmw add ptr addrspace(3) @local_var64, i64 %zext acq_rel
2843  store i64 %old, ptr addrspace(1) %out
2844  ret void
2845}
2846
2847define amdgpu_kernel void @add_i64_varying_nouse() {
2848; GFX7LESS_ITERATIVE-LABEL: add_i64_varying_nouse:
2849; GFX7LESS_ITERATIVE:       ; %bb.0: ; %entry
2850; GFX7LESS_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
2851; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v1, 0
2852; GFX7LESS_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
2853; GFX7LESS_ITERATIVE-NEXT:  .LBB7_1: ; %ComputeLoop
2854; GFX7LESS_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
2855; GFX7LESS_ITERATIVE-NEXT:    s_ff1_i32_b64 s4, s[2:3]
2856; GFX7LESS_ITERATIVE-NEXT:    s_nop 0
2857; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s5, v1, s4
2858; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s4
2859; GFX7LESS_ITERATIVE-NEXT:    s_add_u32 s0, s0, s6
2860; GFX7LESS_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s5
2861; GFX7LESS_ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s4
2862; GFX7LESS_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[4:5]
2863; GFX7LESS_ITERATIVE-NEXT:    v_cmp_ne_u64_e64 s[4:5], s[2:3], 0
2864; GFX7LESS_ITERATIVE-NEXT:    s_and_b64 vcc, exec, s[4:5]
2865; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_vccnz .LBB7_1
2866; GFX7LESS_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
2867; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
2868; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
2869; GFX7LESS_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2870; GFX7LESS_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2871; GFX7LESS_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
2872; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_execz .LBB7_4
2873; GFX7LESS_ITERATIVE-NEXT:  ; %bb.3:
2874; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v2, 0
2875; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s0
2876; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v1, s1
2877; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, -1
2878; GFX7LESS_ITERATIVE-NEXT:    ds_add_u64 v2, v[0:1]
2879; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
2880; GFX7LESS_ITERATIVE-NEXT:  .LBB7_4:
2881; GFX7LESS_ITERATIVE-NEXT:    s_endpgm
2882;
2883; GFX8_ITERATIVE-LABEL: add_i64_varying_nouse:
2884; GFX8_ITERATIVE:       ; %bb.0: ; %entry
2885; GFX8_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
2886; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v1, 0
2887; GFX8_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
2888; GFX8_ITERATIVE-NEXT:  .LBB7_1: ; %ComputeLoop
2889; GFX8_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
2890; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s4, s[2:3]
2891; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s4
2892; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s5, v1, s4
2893; GFX8_ITERATIVE-NEXT:    s_add_u32 s0, s0, s6
2894; GFX8_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s5
2895; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s4
2896; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[4:5]
2897; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
2898; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB7_1
2899; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
2900; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2901; GFX8_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2902; GFX8_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2903; GFX8_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2904; GFX8_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
2905; GFX8_ITERATIVE-NEXT:    s_cbranch_execz .LBB7_4
2906; GFX8_ITERATIVE-NEXT:  ; %bb.3:
2907; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s0
2908; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v2, 0
2909; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v1, s1
2910; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, -1
2911; GFX8_ITERATIVE-NEXT:    ds_add_u64 v2, v[0:1]
2912; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
2913; GFX8_ITERATIVE-NEXT:  .LBB7_4:
2914; GFX8_ITERATIVE-NEXT:    s_endpgm
2915;
2916; GFX9_ITERATIVE-LABEL: add_i64_varying_nouse:
2917; GFX9_ITERATIVE:       ; %bb.0: ; %entry
2918; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
2919; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v1, 0
2920; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
2921; GFX9_ITERATIVE-NEXT:  .LBB7_1: ; %ComputeLoop
2922; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
2923; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s4, s[2:3]
2924; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s4
2925; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s5, v1, s4
2926; GFX9_ITERATIVE-NEXT:    s_add_u32 s0, s0, s6
2927; GFX9_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s5
2928; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s4
2929; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[4:5]
2930; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
2931; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB7_1
2932; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
2933; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2934; GFX9_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2935; GFX9_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2936; GFX9_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2937; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
2938; GFX9_ITERATIVE-NEXT:    s_cbranch_execz .LBB7_4
2939; GFX9_ITERATIVE-NEXT:  ; %bb.3:
2940; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s0
2941; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v2, 0
2942; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v1, s1
2943; GFX9_ITERATIVE-NEXT:    ds_add_u64 v2, v[0:1]
2944; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
2945; GFX9_ITERATIVE-NEXT:  .LBB7_4:
2946; GFX9_ITERATIVE-NEXT:    s_endpgm
2947;
2948; GFX1064_ITERATIVE-LABEL: add_i64_varying_nouse:
2949; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
2950; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v1, 0
2951; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
2952; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
2953; GFX1064_ITERATIVE-NEXT:  .LBB7_1: ; %ComputeLoop
2954; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
2955; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s4, s[2:3]
2956; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s5, v0, s4
2957; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s6, v1, s4
2958; GFX1064_ITERATIVE-NEXT:    s_add_u32 s0, s0, s5
2959; GFX1064_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s6
2960; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s4
2961; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[4:5]
2962; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
2963; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB7_1
2964; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
2965; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2966; GFX1064_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2967; GFX1064_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2968; GFX1064_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2969; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
2970; GFX1064_ITERATIVE-NEXT:    s_cbranch_execz .LBB7_4
2971; GFX1064_ITERATIVE-NEXT:  ; %bb.3:
2972; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s0
2973; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v2, 0
2974; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v1, s1
2975; GFX1064_ITERATIVE-NEXT:    ds_add_u64 v2, v[0:1]
2976; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
2977; GFX1064_ITERATIVE-NEXT:    buffer_gl0_inv
2978; GFX1064_ITERATIVE-NEXT:  .LBB7_4:
2979; GFX1064_ITERATIVE-NEXT:    s_endpgm
2980;
2981; GFX1032_ITERATIVE-LABEL: add_i64_varying_nouse:
2982; GFX1032_ITERATIVE:       ; %bb.0: ; %entry
2983; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v1, 0
2984; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, exec_lo
2985; GFX1032_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
2986; GFX1032_ITERATIVE-NEXT:  .LBB7_1: ; %ComputeLoop
2987; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
2988; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s3, s2
2989; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s4, v0, s3
2990; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s5, v1, s3
2991; GFX1032_ITERATIVE-NEXT:    s_add_u32 s0, s0, s4
2992; GFX1032_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s5
2993; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
2994; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s2, s2, s3
2995; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
2996; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB7_1
2997; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
2998; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2999; GFX1032_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3000; GFX1032_ITERATIVE-NEXT:    s_and_saveexec_b32 s2, vcc_lo
3001; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s2, exec_lo, s2
3002; GFX1032_ITERATIVE-NEXT:    s_cbranch_execz .LBB7_4
3003; GFX1032_ITERATIVE-NEXT:  ; %bb.3:
3004; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s0
3005; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v2, 0
3006; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v1, s1
3007; GFX1032_ITERATIVE-NEXT:    ds_add_u64 v2, v[0:1]
3008; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
3009; GFX1032_ITERATIVE-NEXT:    buffer_gl0_inv
3010; GFX1032_ITERATIVE-NEXT:  .LBB7_4:
3011; GFX1032_ITERATIVE-NEXT:    s_endpgm
3012;
3013; GFX1164_ITERATIVE-LABEL: add_i64_varying_nouse:
3014; GFX1164_ITERATIVE:       ; %bb.0: ; %entry
3015; GFX1164_ITERATIVE-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3016; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v1, 0
3017; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
3018; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
3019; GFX1164_ITERATIVE-NEXT:  .LBB7_1: ; %ComputeLoop
3020; GFX1164_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
3021; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s4, s[2:3]
3022; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3023; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s5, v0, s4
3024; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s6, v1, s4
3025; GFX1164_ITERATIVE-NEXT:    s_add_u32 s0, s0, s5
3026; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
3027; GFX1164_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s6
3028; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s4
3029; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[4:5]
3030; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
3031; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
3032; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB7_1
3033; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
3034; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3035; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
3036; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3037; GFX1164_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3038; GFX1164_ITERATIVE-NEXT:    v_cmpx_eq_u32_e32 0, v0
3039; GFX1164_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
3040; GFX1164_ITERATIVE-NEXT:    s_cbranch_execz .LBB7_4
3041; GFX1164_ITERATIVE-NEXT:  ; %bb.3:
3042; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s0
3043; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v2, 0
3044; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v1, s1
3045; GFX1164_ITERATIVE-NEXT:    ds_add_u64 v2, v[0:1]
3046; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
3047; GFX1164_ITERATIVE-NEXT:    buffer_gl0_inv
3048; GFX1164_ITERATIVE-NEXT:  .LBB7_4:
3049; GFX1164_ITERATIVE-NEXT:    s_endpgm
3050;
3051; GFX1132_ITERATIVE-LABEL: add_i64_varying_nouse:
3052; GFX1132_ITERATIVE:       ; %bb.0: ; %entry
3053; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
3054; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, exec_lo
3055; GFX1132_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
3056; GFX1132_ITERATIVE-NEXT:  .LBB7_1: ; %ComputeLoop
3057; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
3058; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s3, s2
3059; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
3060; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s4, v0, s3
3061; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s5, v1, s3
3062; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
3063; GFX1132_ITERATIVE-NEXT:    s_add_u32 s0, s0, s4
3064; GFX1132_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s5
3065; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
3066; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
3067; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s2, s2, s3
3068; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
3069; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB7_1
3070; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
3071; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3072; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, exec_lo
3073; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3074; GFX1132_ITERATIVE-NEXT:    v_cmpx_eq_u32_e32 0, v0
3075; GFX1132_ITERATIVE-NEXT:    s_xor_b32 s2, exec_lo, s2
3076; GFX1132_ITERATIVE-NEXT:    s_cbranch_execz .LBB7_4
3077; GFX1132_ITERATIVE-NEXT:  ; %bb.3:
3078; GFX1132_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s0
3079; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
3080; GFX1132_ITERATIVE-NEXT:    ds_add_u64 v2, v[0:1]
3081; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
3082; GFX1132_ITERATIVE-NEXT:    buffer_gl0_inv
3083; GFX1132_ITERATIVE-NEXT:  .LBB7_4:
3084; GFX1132_ITERATIVE-NEXT:    s_endpgm
3085;
3086; GFX7LESS_DPP-LABEL: add_i64_varying_nouse:
3087; GFX7LESS_DPP:       ; %bb.0: ; %entry
3088; GFX7LESS_DPP-NEXT:    v_mov_b32_e32 v1, 0
3089; GFX7LESS_DPP-NEXT:    s_mov_b32 m0, -1
3090; GFX7LESS_DPP-NEXT:    ds_add_u64 v1, v[0:1]
3091; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
3092; GFX7LESS_DPP-NEXT:    s_endpgm
3093;
3094; GFX8_DPP-LABEL: add_i64_varying_nouse:
3095; GFX8_DPP:       ; %bb.0: ; %entry
3096; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, 0
3097; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v6, exec_lo, 0
3098; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v6, exec_hi, v6
3099; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
3100; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[0:1]
3101; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
3102; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[0:1]
3103; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, 0
3104; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3105; GFX8_DPP-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
3106; GFX8_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3107; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
3108; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
3109; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, 0
3110; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3111; GFX8_DPP-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
3112; GFX8_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:2 row_mask:0xf bank_mask:0xf
3113; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
3114; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
3115; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, 0
3116; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3117; GFX8_DPP-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
3118; GFX8_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf
3119; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
3120; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
3121; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, 0
3122; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3123; GFX8_DPP-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
3124; GFX8_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:8 row_mask:0xf bank_mask:0xf
3125; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
3126; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
3127; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, 0
3128; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3129; GFX8_DPP-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
3130; GFX8_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
3131; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
3132; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
3133; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, 0
3134; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3135; GFX8_DPP-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
3136; GFX8_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
3137; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
3138; GFX8_DPP-NEXT:    v_readlane_b32 s3, v1, 63
3139; GFX8_DPP-NEXT:    v_readlane_b32 s2, v2, 63
3140; GFX8_DPP-NEXT:    s_mov_b64 exec, s[0:1]
3141; GFX8_DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
3142; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
3143; GFX8_DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3144; GFX8_DPP-NEXT:    s_cbranch_execz .LBB7_2
3145; GFX8_DPP-NEXT:  ; %bb.1:
3146; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, s1
3147; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, s0
3148; GFX8_DPP-NEXT:    s_mov_b32 m0, -1
3149; GFX8_DPP-NEXT:    ds_add_u64 v5, v[6:7]
3150; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
3151; GFX8_DPP-NEXT:  .LBB7_2:
3152; GFX8_DPP-NEXT:    s_endpgm
3153;
3154; GFX9_DPP-LABEL: add_i64_varying_nouse:
3155; GFX9_DPP:       ; %bb.0: ; %entry
3156; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, 0
3157; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v6, exec_lo, 0
3158; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v6, exec_hi, v6
3159; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
3160; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[0:1]
3161; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
3162; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[0:1]
3163; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
3164; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf
3165; GFX9_DPP-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
3166; GFX9_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
3167; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
3168; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
3169; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
3170; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf
3171; GFX9_DPP-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
3172; GFX9_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:2 row_mask:0xf bank_mask:0xf
3173; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
3174; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
3175; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
3176; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
3177; GFX9_DPP-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
3178; GFX9_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf
3179; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
3180; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
3181; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
3182; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf
3183; GFX9_DPP-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
3184; GFX9_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:8 row_mask:0xf bank_mask:0xf
3185; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
3186; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
3187; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
3188; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
3189; GFX9_DPP-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
3190; GFX9_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
3191; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
3192; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
3193; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
3194; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
3195; GFX9_DPP-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
3196; GFX9_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
3197; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
3198; GFX9_DPP-NEXT:    v_readlane_b32 s3, v1, 63
3199; GFX9_DPP-NEXT:    v_readlane_b32 s2, v2, 63
3200; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
3201; GFX9_DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
3202; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
3203; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3204; GFX9_DPP-NEXT:    s_cbranch_execz .LBB7_2
3205; GFX9_DPP-NEXT:  ; %bb.1:
3206; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, s1
3207; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, s0
3208; GFX9_DPP-NEXT:    ds_add_u64 v5, v[6:7]
3209; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
3210; GFX9_DPP-NEXT:  .LBB7_2:
3211; GFX9_DPP-NEXT:    s_endpgm
3212;
3213; GFX1064_DPP-LABEL: add_i64_varying_nouse:
3214; GFX1064_DPP:       ; %bb.0: ; %entry
3215; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
3216; GFX1064_DPP-NEXT:    v_mov_b32_e32 v1, 0
3217; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[0:1]
3218; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, 0, s[0:1]
3219; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, 0
3220; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, 0
3221; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, 0
3222; GFX1064_DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
3223; GFX1064_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
3224; GFX1064_DPP-NEXT:    v_add_co_u32 v1, vcc, v2, v1
3225; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc, v3, v4, vcc
3226; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, 0
3227; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
3228; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, 0
3229; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_xmask:2 row_mask:0xf bank_mask:0xf
3230; GFX1064_DPP-NEXT:    v_add_co_u32 v1, vcc, v1, v6
3231; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc
3232; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, 0
3233; GFX1064_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
3234; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, 0
3235; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v2 row_xmask:4 row_mask:0xf bank_mask:0xf
3236; GFX1064_DPP-NEXT:    v_add_co_u32 v1, vcc, v1, v4
3237; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc, v2, v3, vcc
3238; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
3239; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_xmask:8 row_mask:0xf bank_mask:0xf
3240; GFX1064_DPP-NEXT:    v_add_co_u32 v1, vcc, v1, v6
3241; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc
3242; GFX1064_DPP-NEXT:    v_permlanex16_b32 v3, v1, 0, 0
3243; GFX1064_DPP-NEXT:    v_permlanex16_b32 v4, v2, 0, 0
3244; GFX1064_DPP-NEXT:    v_add_co_u32 v1, vcc, v1, v3
3245; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc, v2, v4, vcc
3246; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
3247; GFX1064_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3248; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
3249; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v2, 0
3250; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v1, 0
3251; GFX1064_DPP-NEXT:    v_readlane_b32 s4, v1, 32
3252; GFX1064_DPP-NEXT:    v_readlane_b32 s5, v2, 32
3253; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
3254; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v0
3255; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, 0
3256; GFX1064_DPP-NEXT:    s_add_u32 s0, s3, s4
3257; GFX1064_DPP-NEXT:    s_addc_u32 s1, s2, s5
3258; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
3259; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
3260; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB7_2
3261; GFX1064_DPP-NEXT:  ; %bb.1:
3262; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, s1
3263; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, s0
3264; GFX1064_DPP-NEXT:    ds_add_u64 v0, v[7:8]
3265; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
3266; GFX1064_DPP-NEXT:    buffer_gl0_inv
3267; GFX1064_DPP-NEXT:  .LBB7_2:
3268; GFX1064_DPP-NEXT:    s_endpgm
3269;
3270; GFX1032_DPP-LABEL: add_i64_varying_nouse:
3271; GFX1032_DPP:       ; %bb.0: ; %entry
3272; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
3273; GFX1032_DPP-NEXT:    v_mov_b32_e32 v1, 0
3274; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s0
3275; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, 0, s0
3276; GFX1032_DPP-NEXT:    v_mov_b32_e32 v4, 0
3277; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, 0
3278; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, 0
3279; GFX1032_DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
3280; GFX1032_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
3281; GFX1032_DPP-NEXT:    v_add_co_u32 v1, vcc_lo, v2, v1
3282; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v3, v4, vcc_lo
3283; GFX1032_DPP-NEXT:    v_mov_b32_e32 v4, 0
3284; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
3285; GFX1032_DPP-NEXT:    v_mov_b32_e32 v3, 0
3286; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_xmask:2 row_mask:0xf bank_mask:0xf
3287; GFX1032_DPP-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v6
3288; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo
3289; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, 0
3290; GFX1032_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
3291; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, 0
3292; GFX1032_DPP-NEXT:    v_mov_b32_dpp v3, v2 row_xmask:4 row_mask:0xf bank_mask:0xf
3293; GFX1032_DPP-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v4
3294; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v3, vcc_lo
3295; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
3296; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_xmask:8 row_mask:0xf bank_mask:0xf
3297; GFX1032_DPP-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v6
3298; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo
3299; GFX1032_DPP-NEXT:    v_permlanex16_b32 v3, v1, 0, 0
3300; GFX1032_DPP-NEXT:    v_permlanex16_b32 v4, v2, 0, 0
3301; GFX1032_DPP-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v3
3302; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo
3303; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
3304; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, v1
3305; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v9, exec_lo, 0
3306; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, 0
3307; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, v2
3308; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v9
3309; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
3310; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB7_2
3311; GFX1032_DPP-NEXT:  ; %bb.1:
3312; GFX1032_DPP-NEXT:    ds_add_u64 v0, v[7:8]
3313; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
3314; GFX1032_DPP-NEXT:    buffer_gl0_inv
3315; GFX1032_DPP-NEXT:  .LBB7_2:
3316; GFX1032_DPP-NEXT:    s_endpgm
3317;
3318; GFX1164_DPP-LABEL: add_i64_varying_nouse:
3319; GFX1164_DPP:       ; %bb.0: ; %entry
3320; GFX1164_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3321; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
3322; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
3323; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[0:1]
3324; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, 0
3325; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s[0:1]
3326; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, 0
3327; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
3328; GFX1164_DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:1 row_mask:0xf bank_mask:0xf
3329; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3330; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
3331; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc
3332; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3333; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, 0
3334; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
3335; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
3336; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc
3337; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
3338; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3339; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, 0
3340; GFX1164_DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
3341; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
3342; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc
3343; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3344; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
3345; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
3346; GFX1164_DPP-NEXT:    v_permlanex16_b32 v3, v2, 0, 0
3347; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
3348; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc
3349; GFX1164_DPP-NEXT:    v_add_co_u32 v2, vcc, v2, v3
3350; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
3351; GFX1164_DPP-NEXT:    v_permlanex16_b32 v4, v1, 0, 0
3352; GFX1164_DPP-NEXT:    v_permlane64_b32 v3, v2
3353; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
3354; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc
3355; GFX1164_DPP-NEXT:    v_permlane64_b32 v4, v1
3356; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
3357; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
3358; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3359; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
3360; GFX1164_DPP-NEXT:    v_add_co_u32 v2, vcc, v2, v3
3361; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc, v1, v4, vcc
3362; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
3363; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v2
3364; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v0
3365; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, 0
3366; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v3
3367; GFX1164_DPP-NEXT:    s_mov_b64 s[0:1], exec
3368; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3)
3369; GFX1164_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v7
3370; GFX1164_DPP-NEXT:    s_cbranch_execz .LBB7_2
3371; GFX1164_DPP-NEXT:  ; %bb.1:
3372; GFX1164_DPP-NEXT:    ds_add_u64 v0, v[5:6]
3373; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
3374; GFX1164_DPP-NEXT:    buffer_gl0_inv
3375; GFX1164_DPP-NEXT:  .LBB7_2:
3376; GFX1164_DPP-NEXT:    s_endpgm
3377;
3378; GFX1132_DPP-LABEL: add_i64_varying_nouse:
3379; GFX1132_DPP:       ; %bb.0: ; %entry
3380; GFX1132_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3381; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
3382; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
3383; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s0
3384; GFX1132_DPP-NEXT:    v_mov_b32_e32 v2, 0
3385; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s0
3386; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, 0
3387; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
3388; GFX1132_DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:1 row_mask:0xf bank_mask:0xf
3389; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
3390; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
3391; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
3392; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
3393; GFX1132_DPP-NEXT:    v_mov_b32_e32 v2, 0
3394; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
3395; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
3396; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
3397; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
3398; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
3399; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, 0
3400; GFX1132_DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
3401; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
3402; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
3403; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v2, vcc_lo, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
3404; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
3405; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
3406; GFX1132_DPP-NEXT:    v_permlanex16_b32 v3, v2, 0, 0
3407; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
3408; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
3409; GFX1132_DPP-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v3
3410; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
3411; GFX1132_DPP-NEXT:    v_permlanex16_b32 v4, v1, 0, 0
3412; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v1, v4, vcc_lo
3413; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
3414; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
3415; GFX1132_DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, v2
3416; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
3417; GFX1132_DPP-NEXT:    v_mov_b32_e32 v6, v3
3418; GFX1132_DPP-NEXT:    s_mov_b32 s0, exec_lo
3419; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
3420; GFX1132_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v7
3421; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB7_2
3422; GFX1132_DPP-NEXT:  ; %bb.1:
3423; GFX1132_DPP-NEXT:    ds_add_u64 v0, v[5:6]
3424; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
3425; GFX1132_DPP-NEXT:    buffer_gl0_inv
3426; GFX1132_DPP-NEXT:  .LBB7_2:
3427; GFX1132_DPP-NEXT:    s_endpgm
3428entry:
3429  %lane = call i32 @llvm.amdgcn.workitem.id.x()
3430  %zext = zext i32 %lane to i64
3431  %old = atomicrmw add ptr addrspace(3) @local_var64, i64 %zext acq_rel
3432  ret void
3433}
3434
3435define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
3436; GFX7LESS-LABEL: sub_i32_constant:
3437; GFX7LESS:       ; %bb.0: ; %entry
3438; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
3439; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
3440; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
3441; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3442; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
3443; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
3444; GFX7LESS-NEXT:    s_cbranch_execz .LBB8_2
3445; GFX7LESS-NEXT:  ; %bb.1:
3446; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
3447; GFX7LESS-NEXT:    s_mul_i32 s2, s2, 5
3448; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3449; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
3450; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3451; GFX7LESS-NEXT:    ds_sub_rtn_u32 v1, v1, v2
3452; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3453; GFX7LESS-NEXT:  .LBB8_2:
3454; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
3455; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
3456; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3457; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3458; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
3459; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
3460; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
3461; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3462; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3463; GFX7LESS-NEXT:    s_endpgm
3464;
3465; GFX8-LABEL: sub_i32_constant:
3466; GFX8:       ; %bb.0: ; %entry
3467; GFX8-NEXT:    s_mov_b64 s[2:3], exec
3468; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
3469; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
3470; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3471; GFX8-NEXT:    ; implicit-def: $vgpr1
3472; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
3473; GFX8-NEXT:    s_cbranch_execz .LBB8_2
3474; GFX8-NEXT:  ; %bb.1:
3475; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
3476; GFX8-NEXT:    s_mul_i32 s2, s2, 5
3477; GFX8-NEXT:    v_mov_b32_e32 v1, 0
3478; GFX8-NEXT:    v_mov_b32_e32 v2, s2
3479; GFX8-NEXT:    s_mov_b32 m0, -1
3480; GFX8-NEXT:    ds_sub_rtn_u32 v1, v1, v2
3481; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3482; GFX8-NEXT:  .LBB8_2:
3483; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
3484; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
3485; GFX8-NEXT:    v_readfirstlane_b32 s4, v1
3486; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
3487; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3488; GFX8-NEXT:    s_mov_b32 s2, -1
3489; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
3490; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3491; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3492; GFX8-NEXT:    s_endpgm
3493;
3494; GFX9-LABEL: sub_i32_constant:
3495; GFX9:       ; %bb.0: ; %entry
3496; GFX9-NEXT:    s_mov_b64 s[2:3], exec
3497; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
3498; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
3499; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3500; GFX9-NEXT:    ; implicit-def: $vgpr1
3501; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
3502; GFX9-NEXT:    s_cbranch_execz .LBB8_2
3503; GFX9-NEXT:  ; %bb.1:
3504; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
3505; GFX9-NEXT:    s_mul_i32 s2, s2, 5
3506; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3507; GFX9-NEXT:    v_mov_b32_e32 v2, s2
3508; GFX9-NEXT:    ds_sub_rtn_u32 v1, v1, v2
3509; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3510; GFX9-NEXT:  .LBB8_2:
3511; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
3512; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
3513; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
3514; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
3515; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3516; GFX9-NEXT:    s_mov_b32 s2, -1
3517; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
3518; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3519; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3520; GFX9-NEXT:    s_endpgm
3521;
3522; GFX1064-LABEL: sub_i32_constant:
3523; GFX1064:       ; %bb.0: ; %entry
3524; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
3525; GFX1064-NEXT:    ; implicit-def: $vgpr1
3526; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
3527; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
3528; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3529; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
3530; GFX1064-NEXT:    s_cbranch_execz .LBB8_2
3531; GFX1064-NEXT:  ; %bb.1:
3532; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
3533; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
3534; GFX1064-NEXT:    s_mul_i32 s2, s2, 5
3535; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
3536; GFX1064-NEXT:    ds_sub_rtn_u32 v1, v1, v2
3537; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3538; GFX1064-NEXT:    buffer_gl0_inv
3539; GFX1064-NEXT:  .LBB8_2:
3540; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3541; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
3542; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
3543; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
3544; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
3545; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3546; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
3547; GFX1064-NEXT:    s_mov_b32 s2, -1
3548; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3549; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3550; GFX1064-NEXT:    s_endpgm
3551;
3552; GFX1032-LABEL: sub_i32_constant:
3553; GFX1032:       ; %bb.0: ; %entry
3554; GFX1032-NEXT:    s_mov_b32 s1, exec_lo
3555; GFX1032-NEXT:    ; implicit-def: $vgpr1
3556; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s1, 0
3557; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3558; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
3559; GFX1032-NEXT:    s_cbranch_execz .LBB8_2
3560; GFX1032-NEXT:  ; %bb.1:
3561; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s1
3562; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
3563; GFX1032-NEXT:    s_mul_i32 s1, s1, 5
3564; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
3565; GFX1032-NEXT:    ds_sub_rtn_u32 v1, v1, v2
3566; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3567; GFX1032-NEXT:    buffer_gl0_inv
3568; GFX1032-NEXT:  .LBB8_2:
3569; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3570; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
3571; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
3572; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
3573; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
3574; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
3575; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
3576; GFX1032-NEXT:    s_mov_b32 s2, -1
3577; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3578; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3579; GFX1032-NEXT:    s_endpgm
3580;
3581; GFX1164-LABEL: sub_i32_constant:
3582; GFX1164:       ; %bb.0: ; %entry
3583; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
3584; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
3585; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
3586; GFX1164-NEXT:    ; implicit-def: $vgpr1
3587; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3588; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
3589; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
3590; GFX1164-NEXT:    s_cbranch_execz .LBB8_2
3591; GFX1164-NEXT:  ; %bb.1:
3592; GFX1164-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
3593; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
3594; GFX1164-NEXT:    s_mul_i32 s2, s2, 5
3595; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
3596; GFX1164-NEXT:    v_mov_b32_e32 v2, s2
3597; GFX1164-NEXT:    ds_sub_rtn_u32 v1, v1, v2
3598; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
3599; GFX1164-NEXT:    buffer_gl0_inv
3600; GFX1164-NEXT:  .LBB8_2:
3601; GFX1164-NEXT:    s_or_b64 exec, exec, s[0:1]
3602; GFX1164-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
3603; GFX1164-NEXT:    v_readfirstlane_b32 s2, v1
3604; GFX1164-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
3605; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
3606; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3607; GFX1164-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
3608; GFX1164-NEXT:    s_mov_b32 s2, -1
3609; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
3610; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
3611; GFX1164-NEXT:    s_endpgm
3612;
3613; GFX1132-LABEL: sub_i32_constant:
3614; GFX1132:       ; %bb.0: ; %entry
3615; GFX1132-NEXT:    s_mov_b32 s1, exec_lo
3616; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
3617; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s1, 0
3618; GFX1132-NEXT:    ; implicit-def: $vgpr1
3619; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3620; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
3621; GFX1132-NEXT:    s_cbranch_execz .LBB8_2
3622; GFX1132-NEXT:  ; %bb.1:
3623; GFX1132-NEXT:    s_bcnt1_i32_b32 s1, s1
3624; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
3625; GFX1132-NEXT:    s_mul_i32 s1, s1, 5
3626; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s1
3627; GFX1132-NEXT:    ds_sub_rtn_u32 v1, v1, v2
3628; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
3629; GFX1132-NEXT:    buffer_gl0_inv
3630; GFX1132-NEXT:  .LBB8_2:
3631; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s0
3632; GFX1132-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
3633; GFX1132-NEXT:    v_readfirstlane_b32 s2, v1
3634; GFX1132-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
3635; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
3636; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3637; GFX1132-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
3638; GFX1132-NEXT:    s_mov_b32 s2, -1
3639; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
3640; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
3641; GFX1132-NEXT:    s_endpgm
3642entry:
3643  %old = atomicrmw sub ptr addrspace(3) @local_var32, i32 5 acq_rel
3644  store i32 %old, ptr addrspace(1) %out
3645  ret void
3646}
3647
; sub_i32_uniform: atomicrmw sub on the addrspace(3) global @local_var32 where
; the subtrahend is the i32 kernel argument %subitive; the value returned by
; the atomic is stored through %out.
;
; Shape of the expected code in every target section below:
;   * v_mbcnt_* numbers the active lanes, and only the lane whose count is 0
;     (v_cmp_eq / v_cmpx_eq against 0) enters the block holding the DS atomic;
;   * that lane issues one ds_sub_rtn_u32 whose operand is
;     %subitive * s_bcnt1(saved exec copy), computed with s_mul_i32;
;   * after exec is restored, each lane computes
;     v_readfirstlane(atomic result) - %subitive * (its mbcnt value)
;     and writes it with a buffer store.
; The wave32 sections use only v_mbcnt_lo and the 32-bit s_bcnt1. The gfx1010
; and gfx1100 sections additionally expect buffer_gl0_inv after the DS wait.
; The first two sections set m0 to -1 before the DS instruction; later ones
; do not.
;
; The sections use the per-target prefixes without an _ITERATIVE/_DPP suffix.
; The RUN lines visible at the top of the file attach those prefixes to both
; optimizer strategies (the DPP pairing is visible for the first three targets;
; presumably the same holds for the rest -- confirm against the full header),
; so both strategies are expected to emit identical code for this function.
;
; These assertions are produced by utils/update_llc_test_checks.py (see the
; note on the first line of the file); regenerate them instead of hand-editing.
3648define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) {
3649; GFX7LESS-LABEL: sub_i32_uniform:
3650; GFX7LESS:       ; %bb.0: ; %entry
3651; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
3652; GFX7LESS-NEXT:    s_load_dword s6, s[4:5], 0xb
3653; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
3654; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
3655; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3656; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
3657; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
3658; GFX7LESS-NEXT:    s_cbranch_execz .LBB9_2
3659; GFX7LESS-NEXT:  ; %bb.1:
3660; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
3661; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3662; GFX7LESS-NEXT:    s_mul_i32 s2, s6, s2
3663; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3664; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
3665; GFX7LESS-NEXT:    s_mov_b32 m0, -1
3666; GFX7LESS-NEXT:    ds_sub_rtn_u32 v1, v1, v2
3667; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3668; GFX7LESS-NEXT:  .LBB9_2:
3669; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
3670; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
3671; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
3672; GFX7LESS-NEXT:    s_mov_b32 s2, -1
3673; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
3674; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3675; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s6, v0
3676; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
3677; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3678; GFX7LESS-NEXT:    s_endpgm
3679;
3680; GFX8-LABEL: sub_i32_uniform:
3681; GFX8:       ; %bb.0: ; %entry
3682; GFX8-NEXT:    s_load_dword s6, s[4:5], 0x2c
3683; GFX8-NEXT:    s_mov_b64 s[2:3], exec
3684; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
3685; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
3686; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3687; GFX8-NEXT:    ; implicit-def: $vgpr1
3688; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
3689; GFX8-NEXT:    s_cbranch_execz .LBB9_2
3690; GFX8-NEXT:  ; %bb.1:
3691; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
3692; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3693; GFX8-NEXT:    s_mul_i32 s2, s6, s2
3694; GFX8-NEXT:    v_mov_b32_e32 v1, 0
3695; GFX8-NEXT:    v_mov_b32_e32 v2, s2
3696; GFX8-NEXT:    s_mov_b32 m0, -1
3697; GFX8-NEXT:    ds_sub_rtn_u32 v1, v1, v2
3698; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3699; GFX8-NEXT:  .LBB9_2:
3700; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
3701; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
3702; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3703; GFX8-NEXT:    v_mul_lo_u32 v0, s6, v0
3704; GFX8-NEXT:    v_readfirstlane_b32 s4, v1
3705; GFX8-NEXT:    s_mov_b32 s3, 0xf000
3706; GFX8-NEXT:    s_mov_b32 s2, -1
3707; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
3708; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3709; GFX8-NEXT:    s_endpgm
3710;
3711; GFX9-LABEL: sub_i32_uniform:
3712; GFX9:       ; %bb.0: ; %entry
3713; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
3714; GFX9-NEXT:    s_mov_b64 s[2:3], exec
3715; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
3716; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
3717; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3718; GFX9-NEXT:    ; implicit-def: $vgpr1
3719; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
3720; GFX9-NEXT:    s_cbranch_execz .LBB9_2
3721; GFX9-NEXT:  ; %bb.1:
3722; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
3723; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3724; GFX9-NEXT:    s_mul_i32 s2, s6, s2
3725; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3726; GFX9-NEXT:    v_mov_b32_e32 v2, s2
3727; GFX9-NEXT:    ds_sub_rtn_u32 v1, v1, v2
3728; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3729; GFX9-NEXT:  .LBB9_2:
3730; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
3731; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
3732; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3733; GFX9-NEXT:    v_mul_lo_u32 v0, s6, v0
3734; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
3735; GFX9-NEXT:    s_mov_b32 s3, 0xf000
3736; GFX9-NEXT:    s_mov_b32 s2, -1
3737; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
3738; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3739; GFX9-NEXT:    s_endpgm
3740;
3741; GFX1064-LABEL: sub_i32_uniform:
3742; GFX1064:       ; %bb.0: ; %entry
3743; GFX1064-NEXT:    s_load_dword s6, s[4:5], 0x2c
3744; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
3745; GFX1064-NEXT:    ; implicit-def: $vgpr1
3746; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
3747; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
3748; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3749; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
3750; GFX1064-NEXT:    s_cbranch_execz .LBB9_2
3751; GFX1064-NEXT:  ; %bb.1:
3752; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
3753; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
3754; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3755; GFX1064-NEXT:    s_mul_i32 s2, s6, s2
3756; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
3757; GFX1064-NEXT:    ds_sub_rtn_u32 v1, v1, v2
3758; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3759; GFX1064-NEXT:    buffer_gl0_inv
3760; GFX1064-NEXT:  .LBB9_2:
3761; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
3762; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
3763; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
3764; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
3765; GFX1064-NEXT:    v_mul_lo_u32 v0, s6, v0
3766; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
3767; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
3768; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
3769; GFX1064-NEXT:    s_mov_b32 s2, -1
3770; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3771; GFX1064-NEXT:    s_endpgm
3772;
3773; GFX1032-LABEL: sub_i32_uniform:
3774; GFX1032:       ; %bb.0: ; %entry
3775; GFX1032-NEXT:    s_load_dword s0, s[4:5], 0x2c
3776; GFX1032-NEXT:    s_mov_b32 s2, exec_lo
3777; GFX1032-NEXT:    ; implicit-def: $vgpr1
3778; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
3779; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
3780; GFX1032-NEXT:    s_and_saveexec_b32 s1, vcc_lo
3781; GFX1032-NEXT:    s_cbranch_execz .LBB9_2
3782; GFX1032-NEXT:  ; %bb.1:
3783; GFX1032-NEXT:    s_bcnt1_i32_b32 s2, s2
3784; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
3785; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3786; GFX1032-NEXT:    s_mul_i32 s2, s0, s2
3787; GFX1032-NEXT:    v_mov_b32_e32 v2, s2
3788; GFX1032-NEXT:    ds_sub_rtn_u32 v1, v1, v2
3789; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3790; GFX1032-NEXT:    buffer_gl0_inv
3791; GFX1032-NEXT:  .LBB9_2:
3792; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
3793; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s1
3794; GFX1032-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x24
3795; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
3796; GFX1032-NEXT:    v_mul_lo_u32 v0, s0, v0
3797; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
3798; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
3799; GFX1032-NEXT:    s_mov_b32 s10, -1
3800; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
3801; GFX1032-NEXT:    buffer_store_dword v0, off, s[8:11], 0
3802; GFX1032-NEXT:    s_endpgm
3803;
3804; GFX1164-LABEL: sub_i32_uniform:
3805; GFX1164:       ; %bb.0: ; %entry
3806; GFX1164-NEXT:    s_load_b32 s6, s[4:5], 0x2c
3807; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
3808; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
3809; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
3810; GFX1164-NEXT:    ; implicit-def: $vgpr1
3811; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3812; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
3813; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
3814; GFX1164-NEXT:    s_cbranch_execz .LBB9_2
3815; GFX1164-NEXT:  ; %bb.1:
3816; GFX1164-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
3817; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
3818; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
3819; GFX1164-NEXT:    s_mul_i32 s2, s6, s2
3820; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
3821; GFX1164-NEXT:    v_mov_b32_e32 v2, s2
3822; GFX1164-NEXT:    ds_sub_rtn_u32 v1, v1, v2
3823; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
3824; GFX1164-NEXT:    buffer_gl0_inv
3825; GFX1164-NEXT:  .LBB9_2:
3826; GFX1164-NEXT:    s_or_b64 exec, exec, s[0:1]
3827; GFX1164-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
3828; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
3829; GFX1164-NEXT:    v_mul_lo_u32 v0, s6, v0
3830; GFX1164-NEXT:    v_readfirstlane_b32 s2, v1
3831; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
3832; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3833; GFX1164-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
3834; GFX1164-NEXT:    s_mov_b32 s2, -1
3835; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
3836; GFX1164-NEXT:    s_endpgm
3837;
3838; GFX1132-LABEL: sub_i32_uniform:
3839; GFX1132:       ; %bb.0: ; %entry
3840; GFX1132-NEXT:    s_load_b32 s0, s[4:5], 0x2c
3841; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
3842; GFX1132-NEXT:    s_mov_b32 s1, exec_lo
3843; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
3844; GFX1132-NEXT:    ; implicit-def: $vgpr1
3845; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3846; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
3847; GFX1132-NEXT:    s_cbranch_execz .LBB9_2
3848; GFX1132-NEXT:  ; %bb.1:
3849; GFX1132-NEXT:    s_bcnt1_i32_b32 s2, s2
3850; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
3851; GFX1132-NEXT:    s_mul_i32 s2, s0, s2
3852; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
3853; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2
3854; GFX1132-NEXT:    ds_sub_rtn_u32 v1, v1, v2
3855; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
3856; GFX1132-NEXT:    buffer_gl0_inv
3857; GFX1132-NEXT:  .LBB9_2:
3858; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s1
3859; GFX1132-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
3860; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
3861; GFX1132-NEXT:    v_mul_lo_u32 v0, s0, v0
3862; GFX1132-NEXT:    v_readfirstlane_b32 s0, v1
3863; GFX1132-NEXT:    s_mov_b32 s7, 0x31016000
3864; GFX1132-NEXT:    s_mov_b32 s6, -1
3865; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3866; GFX1132-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
3867; GFX1132-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
3868; GFX1132-NEXT:    s_endpgm
3869entry:
  ; acq_rel subtraction of the kernel argument from @local_var32; %old receives
  ; the value the atomic returns.
3870  %old = atomicrmw sub ptr addrspace(3) @local_var32, i32 %subitive acq_rel
  ; Storing %old keeps the atomic's result live, which is why every section
  ; above expects the returning form ds_sub_rtn_u32.
3871  store i32 %old, ptr addrspace(1) %out
3872  ret void
3873}
3874
3875define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
3876; GFX7LESS_ITERATIVE-LABEL: sub_i32_varying:
3877; GFX7LESS_ITERATIVE:       ; %bb.0: ; %entry
3878; GFX7LESS_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
3879; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s2, 0
3880; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
3881; GFX7LESS_ITERATIVE-NEXT:  .LBB10_1: ; %ComputeLoop
3882; GFX7LESS_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
3883; GFX7LESS_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
3884; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, s3
3885; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
3886; GFX7LESS_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
3887; GFX7LESS_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
3888; GFX7LESS_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
3889; GFX7LESS_ITERATIVE-NEXT:    v_cmp_ne_u64_e64 s[6:7], s[0:1], 0
3890; GFX7LESS_ITERATIVE-NEXT:    s_and_b64 vcc, exec, s[6:7]
3891; GFX7LESS_ITERATIVE-NEXT:    s_add_i32 s2, s2, s8
3892; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_vccnz .LBB10_1
3893; GFX7LESS_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
3894; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3895; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
3896; GFX7LESS_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3897; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
3898; GFX7LESS_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
3899; GFX7LESS_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
3900; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_execz .LBB10_4
3901; GFX7LESS_ITERATIVE-NEXT:  ; %bb.3:
3902; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
3903; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
3904; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, -1
3905; GFX7LESS_ITERATIVE-NEXT:    ds_sub_rtn_u32 v0, v0, v2
3906; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
3907; GFX7LESS_ITERATIVE-NEXT:  .LBB10_4:
3908; GFX7LESS_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
3909; GFX7LESS_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
3910; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
3911; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s2, -1
3912; GFX7LESS_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v0
3913; GFX7LESS_ITERATIVE-NEXT:    v_sub_i32_e32 v0, vcc, s4, v1
3914; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
3915; GFX7LESS_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3916; GFX7LESS_ITERATIVE-NEXT:    s_endpgm
3917;
3918; GFX8_ITERATIVE-LABEL: sub_i32_varying:
3919; GFX8_ITERATIVE:       ; %bb.0: ; %entry
3920; GFX8_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
3921; GFX8_ITERATIVE-NEXT:    s_mov_b32 s2, 0
3922; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
3923; GFX8_ITERATIVE-NEXT:  .LBB10_1: ; %ComputeLoop
3924; GFX8_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
3925; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
3926; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s3
3927; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
3928; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
3929; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
3930; GFX8_ITERATIVE-NEXT:    s_add_i32 s2, s2, s8
3931; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
3932; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
3933; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB10_1
3934; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
3935; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3936; GFX8_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3937; GFX8_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3938; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
3939; GFX8_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
3940; GFX8_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
3941; GFX8_ITERATIVE-NEXT:    s_cbranch_execz .LBB10_4
3942; GFX8_ITERATIVE-NEXT:  ; %bb.3:
3943; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
3944; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
3945; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, -1
3946; GFX8_ITERATIVE-NEXT:    ds_sub_rtn_u32 v0, v0, v2
3947; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
3948; GFX8_ITERATIVE-NEXT:  .LBB10_4:
3949; GFX8_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
3950; GFX8_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
3951; GFX8_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v0
3952; GFX8_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
3953; GFX8_ITERATIVE-NEXT:    s_mov_b32 s2, -1
3954; GFX8_ITERATIVE-NEXT:    v_sub_u32_e32 v0, vcc, s4, v1
3955; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
3956; GFX8_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3957; GFX8_ITERATIVE-NEXT:    s_endpgm
3958;
3959; GFX9_ITERATIVE-LABEL: sub_i32_varying:
3960; GFX9_ITERATIVE:       ; %bb.0: ; %entry
3961; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
3962; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, 0
3963; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
3964; GFX9_ITERATIVE-NEXT:  .LBB10_1: ; %ComputeLoop
3965; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
3966; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
3967; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s3
3968; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
3969; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
3970; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
3971; GFX9_ITERATIVE-NEXT:    s_add_i32 s2, s2, s8
3972; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
3973; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
3974; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB10_1
3975; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
3976; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3977; GFX9_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3978; GFX9_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
3979; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
3980; GFX9_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
3981; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
3982; GFX9_ITERATIVE-NEXT:    s_cbranch_execz .LBB10_4
3983; GFX9_ITERATIVE-NEXT:  ; %bb.3:
3984; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
3985; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
3986; GFX9_ITERATIVE-NEXT:    ds_sub_rtn_u32 v0, v0, v2
3987; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
3988; GFX9_ITERATIVE-NEXT:  .LBB10_4:
3989; GFX9_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
3990; GFX9_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
3991; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v0
3992; GFX9_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
3993; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, -1
3994; GFX9_ITERATIVE-NEXT:    v_sub_u32_e32 v0, s4, v1
3995; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
3996; GFX9_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3997; GFX9_ITERATIVE-NEXT:    s_endpgm
3998;
3999; GFX1064_ITERATIVE-LABEL: sub_i32_varying:
4000; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
4001; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
4002; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, 0
4003; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
4004; GFX1064_ITERATIVE-NEXT:  .LBB10_1: ; %ComputeLoop
4005; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
4006; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
4007; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
4008; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
4009; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, s3
4010; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
4011; GFX1064_ITERATIVE-NEXT:    s_add_i32 s2, s2, s8
4012; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
4013; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB10_1
4014; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
4015; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4016; GFX1064_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4017; GFX1064_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4018; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
4019; GFX1064_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
4020; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
4021; GFX1064_ITERATIVE-NEXT:    s_cbranch_execz .LBB10_4
4022; GFX1064_ITERATIVE-NEXT:  ; %bb.3:
4023; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
4024; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
4025; GFX1064_ITERATIVE-NEXT:    ds_sub_rtn_u32 v0, v0, v2
4026; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
4027; GFX1064_ITERATIVE-NEXT:    buffer_gl0_inv
4028; GFX1064_ITERATIVE-NEXT:  .LBB10_4:
4029; GFX1064_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
4030; GFX1064_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
4031; GFX1064_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
4032; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v0
4033; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
4034; GFX1064_ITERATIVE-NEXT:    v_sub_nc_u32_e32 v0, s2, v1
4035; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, -1
4036; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
4037; GFX1064_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4038; GFX1064_ITERATIVE-NEXT:    s_endpgm
4039;
4040; GFX1032_ITERATIVE-LABEL: sub_i32_varying:
4041; GFX1032_ITERATIVE:       ; %bb.0: ; %entry
4042; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s1, exec_lo
4043; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s0, 0
4044; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
4045; GFX1032_ITERATIVE-NEXT:  .LBB10_1: ; %ComputeLoop
4046; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
4047; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s2, s1
4048; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
4049; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
4050; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s2
4051; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s6
4052; GFX1032_ITERATIVE-NEXT:    s_add_i32 s0, s0, s3
4053; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
4054; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB10_1
4055; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
4056; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4057; GFX1032_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4058; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
4059; GFX1032_ITERATIVE-NEXT:    s_and_saveexec_b32 s1, vcc_lo
4060; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s1, exec_lo, s1
4061; GFX1032_ITERATIVE-NEXT:    s_cbranch_execz .LBB10_4
4062; GFX1032_ITERATIVE-NEXT:  ; %bb.3:
4063; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
4064; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s0
4065; GFX1032_ITERATIVE-NEXT:    ds_sub_rtn_u32 v0, v0, v2
4066; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
4067; GFX1032_ITERATIVE-NEXT:    buffer_gl0_inv
4068; GFX1032_ITERATIVE-NEXT:  .LBB10_4:
4069; GFX1032_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
4070; GFX1032_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s1
4071; GFX1032_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
4072; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v0
4073; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
4074; GFX1032_ITERATIVE-NEXT:    v_sub_nc_u32_e32 v0, s2, v1
4075; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, -1
4076; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
4077; GFX1032_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4078; GFX1032_ITERATIVE-NEXT:    s_endpgm
4079;
4080; GFX1164_ITERATIVE-LABEL: sub_i32_varying:
4081; GFX1164_ITERATIVE:       ; %bb.0: ; %entry
4082; GFX1164_ITERATIVE-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
4083; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
4084; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, 0
4085; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
4086; GFX1164_ITERATIVE-NEXT:  .LBB10_1: ; %ComputeLoop
4087; GFX1164_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
4088; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s3, s[0:1]
4089; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
4090; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s8, v1, s3
4091; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
4092; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s2, s3
4093; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
4094; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2)
4095; GFX1164_ITERATIVE-NEXT:    s_add_i32 s2, s2, s8
4096; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
4097; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB10_1
4098; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
4099; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
4100; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4101; GFX1164_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
4102; GFX1164_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
4103; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
4104; GFX1164_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
4105; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
4106; GFX1164_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
4107; GFX1164_ITERATIVE-NEXT:    s_cbranch_execz .LBB10_4
4108; GFX1164_ITERATIVE-NEXT:  ; %bb.3:
4109; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v1, 0
4110; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
4111; GFX1164_ITERATIVE-NEXT:    ds_sub_rtn_u32 v1, v1, v2
4112; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
4113; GFX1164_ITERATIVE-NEXT:    buffer_gl0_inv
4114; GFX1164_ITERATIVE-NEXT:  .LBB10_4:
4115; GFX1164_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
4116; GFX1164_ITERATIVE-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
4117; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v1
4118; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
4119; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4120; GFX1164_ITERATIVE-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
4121; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
4122; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
4123; GFX1164_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
4124; GFX1164_ITERATIVE-NEXT:    s_endpgm
4125;
4126; GFX1132_ITERATIVE-LABEL: sub_i32_varying:
4127; GFX1132_ITERATIVE:       ; %bb.0: ; %entry
4128; GFX1132_ITERATIVE-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
4129; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s1, exec_lo
4130; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s0, 0
4131; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
4132; GFX1132_ITERATIVE-NEXT:  .LBB10_1: ; %ComputeLoop
4133; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
4134; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s2, s1
4135; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
4136; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s3, v1, s2
4137; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
4138; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s2
4139; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s6
4140; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2)
4141; GFX1132_ITERATIVE-NEXT:    s_add_i32 s0, s0, s3
4142; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
4143; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB10_1
4144; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
4145; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
4146; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
4147; GFX1132_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
4148; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
4149; GFX1132_ITERATIVE-NEXT:    s_and_saveexec_b32 s1, vcc_lo
4150; GFX1132_ITERATIVE-NEXT:    s_xor_b32 s1, exec_lo, s1
4151; GFX1132_ITERATIVE-NEXT:    s_cbranch_execz .LBB10_4
4152; GFX1132_ITERATIVE-NEXT:  ; %bb.3:
4153; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
4154; GFX1132_ITERATIVE-NEXT:    ds_sub_rtn_u32 v1, v1, v2
4155; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
4156; GFX1132_ITERATIVE-NEXT:    buffer_gl0_inv
4157; GFX1132_ITERATIVE-NEXT:  .LBB10_4:
4158; GFX1132_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s1
4159; GFX1132_ITERATIVE-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
4160; GFX1132_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v1
4161; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
4162; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4163; GFX1132_ITERATIVE-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
4164; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
4165; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
4166; GFX1132_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
4167; GFX1132_ITERATIVE-NEXT:    s_endpgm
4168;
4169; GFX7LESS_DPP-LABEL: sub_i32_varying:
4170; GFX7LESS_DPP:       ; %bb.0: ; %entry
4171; GFX7LESS_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
4172; GFX7LESS_DPP-NEXT:    v_mov_b32_e32 v1, 0
4173; GFX7LESS_DPP-NEXT:    s_mov_b32 m0, -1
4174; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
4175; GFX7LESS_DPP-NEXT:    ds_sub_rtn_u32 v0, v1, v0
4176; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
4177; GFX7LESS_DPP-NEXT:    s_mov_b32 s3, 0xf000
4178; GFX7LESS_DPP-NEXT:    s_mov_b32 s2, -1
4179; GFX7LESS_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4180; GFX7LESS_DPP-NEXT:    s_endpgm
4181;
4182; GFX8_DPP-LABEL: sub_i32_varying:
4183; GFX8_DPP:       ; %bb.0: ; %entry
4184; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, 0
4185; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
4186; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
4187; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
4188; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
4189; GFX8_DPP-NEXT:    v_mov_b32_e32 v2, 0
4190; GFX8_DPP-NEXT:    s_nop 0
4191; GFX8_DPP-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4192; GFX8_DPP-NEXT:    s_nop 1
4193; GFX8_DPP-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4194; GFX8_DPP-NEXT:    s_nop 1
4195; GFX8_DPP-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4196; GFX8_DPP-NEXT:    s_nop 1
4197; GFX8_DPP-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4198; GFX8_DPP-NEXT:    s_nop 1
4199; GFX8_DPP-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
4200; GFX8_DPP-NEXT:    s_nop 1
4201; GFX8_DPP-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
4202; GFX8_DPP-NEXT:    v_readlane_b32 s2, v1, 63
4203; GFX8_DPP-NEXT:    s_nop 0
4204; GFX8_DPP-NEXT:    v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
4205; GFX8_DPP-NEXT:    s_mov_b64 exec, s[0:1]
4206; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
4207; GFX8_DPP-NEXT:    ; implicit-def: $vgpr0
4208; GFX8_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
4209; GFX8_DPP-NEXT:    s_cbranch_execz .LBB10_2
4210; GFX8_DPP-NEXT:  ; %bb.1:
4211; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, s2
4212; GFX8_DPP-NEXT:    s_mov_b32 m0, -1
4213; GFX8_DPP-NEXT:    ds_sub_rtn_u32 v0, v3, v0
4214; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
4215; GFX8_DPP-NEXT:  .LBB10_2:
4216; GFX8_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
4217; GFX8_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
4218; GFX8_DPP-NEXT:    v_readfirstlane_b32 s4, v0
4219; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, v2
4220; GFX8_DPP-NEXT:    s_mov_b32 s3, 0xf000
4221; GFX8_DPP-NEXT:    s_mov_b32 s2, -1
4222; GFX8_DPP-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
4223; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
4224; GFX8_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4225; GFX8_DPP-NEXT:    s_endpgm
4226;
4227; GFX9_DPP-LABEL: sub_i32_varying:
4228; GFX9_DPP:       ; %bb.0: ; %entry
4229; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
4230; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
4231; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
4232; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
4233; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
4234; GFX9_DPP-NEXT:    v_mov_b32_e32 v2, 0
4235; GFX9_DPP-NEXT:    s_nop 0
4236; GFX9_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4237; GFX9_DPP-NEXT:    s_nop 1
4238; GFX9_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4239; GFX9_DPP-NEXT:    s_nop 1
4240; GFX9_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4241; GFX9_DPP-NEXT:    s_nop 1
4242; GFX9_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4243; GFX9_DPP-NEXT:    s_nop 1
4244; GFX9_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
4245; GFX9_DPP-NEXT:    s_nop 1
4246; GFX9_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
4247; GFX9_DPP-NEXT:    v_readlane_b32 s2, v1, 63
4248; GFX9_DPP-NEXT:    s_nop 0
4249; GFX9_DPP-NEXT:    v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
4250; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
4251; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
4252; GFX9_DPP-NEXT:    ; implicit-def: $vgpr0
4253; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
4254; GFX9_DPP-NEXT:    s_cbranch_execz .LBB10_2
4255; GFX9_DPP-NEXT:  ; %bb.1:
4256; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s2
4257; GFX9_DPP-NEXT:    ds_sub_rtn_u32 v0, v3, v0
4258; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
4259; GFX9_DPP-NEXT:  .LBB10_2:
4260; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
4261; GFX9_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
4262; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v0
4263; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, v2
4264; GFX9_DPP-NEXT:    s_mov_b32 s3, 0xf000
4265; GFX9_DPP-NEXT:    s_mov_b32 s2, -1
4266; GFX9_DPP-NEXT:    v_sub_u32_e32 v0, s4, v0
4267; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
4268; GFX9_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4269; GFX9_DPP-NEXT:    s_endpgm
4270;
4271; GFX1064_DPP-LABEL: sub_i32_varying:
4272; GFX1064_DPP:       ; %bb.0: ; %entry
4273; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
4274; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
4275; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, 0
4276; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4277; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4278; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4279; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4280; GFX1064_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
4281; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4282; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 31
4283; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, s2
4284; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4285; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4286; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 15
4287; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v1, 31
4288; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s2, 16
4289; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
4290; GFX1064_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4291; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
4292; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 47
4293; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v1, 63
4294; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s3, 32
4295; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
4296; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4297; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, 0
4298; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
4299; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s2, 48
4300; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
4301; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4302; GFX1064_DPP-NEXT:    s_mov_b32 s2, -1
4303; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr0
4304; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
4305; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB10_2
4306; GFX1064_DPP-NEXT:  ; %bb.1:
4307; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, s6
4308; GFX1064_DPP-NEXT:    s_mov_b32 s3, s6
4309; GFX1064_DPP-NEXT:    ds_sub_rtn_u32 v0, v4, v0
4310; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
4311; GFX1064_DPP-NEXT:    buffer_gl0_inv
4312; GFX1064_DPP-NEXT:  .LBB10_2:
4313; GFX1064_DPP-NEXT:    s_waitcnt_depctr 0xffe3
4314; GFX1064_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
4315; GFX1064_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
4316; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s3, v0
4317; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, v3
4318; GFX1064_DPP-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
4319; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
4320; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
4321; GFX1064_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4322; GFX1064_DPP-NEXT:    s_endpgm
4323;
4324; GFX1032_DPP-LABEL: sub_i32_varying:
4325; GFX1032_DPP:       ; %bb.0: ; %entry
4326; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
4327; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
4328; GFX1032_DPP-NEXT:    v_mov_b32_e32 v3, 0
4329; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4330; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4331; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4332; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4333; GFX1032_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
4334; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4335; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v1, 15
4336; GFX1032_DPP-NEXT:    v_readlane_b32 s2, v1, 31
4337; GFX1032_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4338; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
4339; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4340; GFX1032_DPP-NEXT:    v_mov_b32_e32 v4, 0
4341; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
4342; GFX1032_DPP-NEXT:    v_writelane_b32 v3, s1, 16
4343; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
4344; GFX1032_DPP-NEXT:    s_mov_b32 s0, s2
4345; GFX1032_DPP-NEXT:    s_mov_b32 s2, -1
4346; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4347; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr0
4348; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
4349; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB10_2
4350; GFX1032_DPP-NEXT:  ; %bb.1:
4351; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, s0
4352; GFX1032_DPP-NEXT:    ds_sub_rtn_u32 v0, v4, v0
4353; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
4354; GFX1032_DPP-NEXT:    buffer_gl0_inv
4355; GFX1032_DPP-NEXT:  .LBB10_2:
4356; GFX1032_DPP-NEXT:    s_waitcnt_depctr 0xffe3
4357; GFX1032_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s1
4358; GFX1032_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
4359; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s3, v0
4360; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, v3
4361; GFX1032_DPP-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
4362; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
4363; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
4364; GFX1032_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4365; GFX1032_DPP-NEXT:    s_endpgm
4366;
4367; GFX1164_DPP-LABEL: sub_i32_varying:
4368; GFX1164_DPP:       ; %bb.0: ; %entry
4369; GFX1164_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
4370; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
4371; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
4372; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
4373; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, 0
4374; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
4375; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4376; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4377; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4378; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4379; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4380; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4381; GFX1164_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
4382; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4383; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4384; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v1, 31
4385; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, s2
4386; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4387; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
4388; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4389; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v1, 15
4390; GFX1164_DPP-NEXT:    v_readlane_b32 s3, v1, 31
4391; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
4392; GFX1164_DPP-NEXT:    v_writelane_b32 v3, s2, 16
4393; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
4394; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4395; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
4396; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v1, 47
4397; GFX1164_DPP-NEXT:    v_readlane_b32 s6, v1, 63
4398; GFX1164_DPP-NEXT:    v_writelane_b32 v3, s3, 32
4399; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
4400; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
4401; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4402; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, 0
4403; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
4404; GFX1164_DPP-NEXT:    v_writelane_b32 v3, s2, 48
4405; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
4406; GFX1164_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4407; GFX1164_DPP-NEXT:    s_mov_b32 s2, -1
4408; GFX1164_DPP-NEXT:    ; implicit-def: $vgpr0
4409; GFX1164_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
4410; GFX1164_DPP-NEXT:    s_cbranch_execz .LBB10_2
4411; GFX1164_DPP-NEXT:  ; %bb.1:
4412; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, s6
4413; GFX1164_DPP-NEXT:    s_mov_b32 s3, s6
4414; GFX1164_DPP-NEXT:    ds_sub_rtn_u32 v0, v4, v0
4415; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
4416; GFX1164_DPP-NEXT:    buffer_gl0_inv
4417; GFX1164_DPP-NEXT:  .LBB10_2:
4418; GFX1164_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
4419; GFX1164_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
4420; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s3, v0
4421; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, v3
4422; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4423; GFX1164_DPP-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
4424; GFX1164_DPP-NEXT:    s_mov_b32 s3, 0x31016000
4425; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
4426; GFX1164_DPP-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
4427; GFX1164_DPP-NEXT:    s_endpgm
4428;
4429; GFX1132_DPP-LABEL: sub_i32_varying:
4430; GFX1132_DPP:       ; %bb.0: ; %entry
4431; GFX1132_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
4432; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
4433; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
4434; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
4435; GFX1132_DPP-NEXT:    v_mov_b32_e32 v3, 0
4436; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
4437; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4438; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4439; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4440; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4441; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4442; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4443; GFX1132_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
4444; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
4445; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
4446; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v1, 15
4447; GFX1132_DPP-NEXT:    v_readlane_b32 s2, v1, 31
4448; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
4449; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
4450; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4451; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, 0
4452; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
4453; GFX1132_DPP-NEXT:    v_writelane_b32 v3, s1, 16
4454; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
4455; GFX1132_DPP-NEXT:    s_mov_b32 s0, s2
4456; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
4457; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4458; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr0
4459; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
4460; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB10_2
4461; GFX1132_DPP-NEXT:  ; %bb.1:
4462; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, s0
4463; GFX1132_DPP-NEXT:    ds_sub_rtn_u32 v0, v4, v0
4464; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
4465; GFX1132_DPP-NEXT:    buffer_gl0_inv
4466; GFX1132_DPP-NEXT:  .LBB10_2:
4467; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s1
4468; GFX1132_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
4469; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s3, v0
4470; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, v3
4471; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4472; GFX1132_DPP-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
4473; GFX1132_DPP-NEXT:    s_mov_b32 s3, 0x31016000
4474; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
4475; GFX1132_DPP-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
4476; GFX1132_DPP-NEXT:    s_endpgm
4477entry:
4478  %lane = call i32 @llvm.amdgcn.workitem.id.x()
4479  %old = atomicrmw sub ptr addrspace(3) @local_var32, i32 %lane acq_rel
4480  store i32 %old, ptr addrspace(1) %out
4481  ret void
4482}
4483
4484define amdgpu_kernel void @sub_i32_varying_nouse() {
4485; GFX7LESS_ITERATIVE-LABEL: sub_i32_varying_nouse:
4486; GFX7LESS_ITERATIVE:       ; %bb.0: ; %entry
4487; GFX7LESS_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
4488; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s2, 0
4489; GFX7LESS_ITERATIVE-NEXT:  .LBB11_1: ; %ComputeLoop
4490; GFX7LESS_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
4491; GFX7LESS_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
4492; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
4493; GFX7LESS_ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s3
4494; GFX7LESS_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
4495; GFX7LESS_ITERATIVE-NEXT:    v_cmp_ne_u64_e64 s[4:5], s[0:1], 0
4496; GFX7LESS_ITERATIVE-NEXT:    s_and_b64 vcc, exec, s[4:5]
4497; GFX7LESS_ITERATIVE-NEXT:    s_add_i32 s2, s2, s6
4498; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_vccnz .LBB11_1
4499; GFX7LESS_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
4500; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
4501; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
4502; GFX7LESS_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4503; GFX7LESS_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
4504; GFX7LESS_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
4505; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_execz .LBB11_4
4506; GFX7LESS_ITERATIVE-NEXT:  ; %bb.3:
4507; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
4508; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v1, s2
4509; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, -1
4510; GFX7LESS_ITERATIVE-NEXT:    ds_sub_u32 v0, v1
4511; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
4512; GFX7LESS_ITERATIVE-NEXT:  .LBB11_4:
4513; GFX7LESS_ITERATIVE-NEXT:    s_endpgm
4514;
4515; GFX8_ITERATIVE-LABEL: sub_i32_varying_nouse:
4516; GFX8_ITERATIVE:       ; %bb.0: ; %entry
4517; GFX8_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
4518; GFX8_ITERATIVE-NEXT:    s_mov_b32 s2, 0
4519; GFX8_ITERATIVE-NEXT:  .LBB11_1: ; %ComputeLoop
4520; GFX8_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
4521; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
4522; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
4523; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s3
4524; GFX8_ITERATIVE-NEXT:    s_add_i32 s2, s2, s6
4525; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
4526; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
4527; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB11_1
4528; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
4529; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4530; GFX8_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4531; GFX8_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4532; GFX8_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
4533; GFX8_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
4534; GFX8_ITERATIVE-NEXT:    s_cbranch_execz .LBB11_4
4535; GFX8_ITERATIVE-NEXT:  ; %bb.3:
4536; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
4537; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v1, s2
4538; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, -1
4539; GFX8_ITERATIVE-NEXT:    ds_sub_u32 v0, v1
4540; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
4541; GFX8_ITERATIVE-NEXT:  .LBB11_4:
4542; GFX8_ITERATIVE-NEXT:    s_endpgm
4543;
4544; GFX9_ITERATIVE-LABEL: sub_i32_varying_nouse:
4545; GFX9_ITERATIVE:       ; %bb.0: ; %entry
4546; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
4547; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, 0
4548; GFX9_ITERATIVE-NEXT:  .LBB11_1: ; %ComputeLoop
4549; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
4550; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
4551; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
4552; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s3
4553; GFX9_ITERATIVE-NEXT:    s_add_i32 s2, s2, s6
4554; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
4555; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
4556; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB11_1
4557; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
4558; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4559; GFX9_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4560; GFX9_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4561; GFX9_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
4562; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
4563; GFX9_ITERATIVE-NEXT:    s_cbranch_execz .LBB11_4
4564; GFX9_ITERATIVE-NEXT:  ; %bb.3:
4565; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
4566; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v1, s2
4567; GFX9_ITERATIVE-NEXT:    ds_sub_u32 v0, v1
4568; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
4569; GFX9_ITERATIVE-NEXT:  .LBB11_4:
4570; GFX9_ITERATIVE-NEXT:    s_endpgm
4571;
4572; GFX1064_ITERATIVE-LABEL: sub_i32_varying_nouse:
4573; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
4574; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
4575; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, 0
4576; GFX1064_ITERATIVE-NEXT:  .LBB11_1: ; %ComputeLoop
4577; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
4578; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
4579; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
4580; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s3
4581; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
4582; GFX1064_ITERATIVE-NEXT:    s_add_i32 s2, s2, s6
4583; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
4584; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB11_1
4585; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
4586; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4587; GFX1064_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4588; GFX1064_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
4589; GFX1064_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
4590; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
4591; GFX1064_ITERATIVE-NEXT:    s_cbranch_execz .LBB11_4
4592; GFX1064_ITERATIVE-NEXT:  ; %bb.3:
4593; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
4594; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v1, s2
4595; GFX1064_ITERATIVE-NEXT:    ds_sub_u32 v0, v1
4596; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
4597; GFX1064_ITERATIVE-NEXT:    buffer_gl0_inv
4598; GFX1064_ITERATIVE-NEXT:  .LBB11_4:
4599; GFX1064_ITERATIVE-NEXT:    s_endpgm
4600;
4601; GFX1032_ITERATIVE-LABEL: sub_i32_varying_nouse:
4602; GFX1032_ITERATIVE:       ; %bb.0: ; %entry
4603; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s1, exec_lo
4604; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s0, 0
4605; GFX1032_ITERATIVE-NEXT:  .LBB11_1: ; %ComputeLoop
4606; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
4607; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s2, s1
4608; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
4609; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s2, 1, s2
4610; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s2
4611; GFX1032_ITERATIVE-NEXT:    s_add_i32 s0, s0, s3
4612; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
4613; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB11_1
4614; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
4615; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4616; GFX1032_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
4617; GFX1032_ITERATIVE-NEXT:    s_and_saveexec_b32 s1, vcc_lo
4618; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s1, exec_lo, s1
4619; GFX1032_ITERATIVE-NEXT:    s_cbranch_execz .LBB11_4
4620; GFX1032_ITERATIVE-NEXT:  ; %bb.3:
4621; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
4622; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v1, s0
4623; GFX1032_ITERATIVE-NEXT:    ds_sub_u32 v0, v1
4624; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
4625; GFX1032_ITERATIVE-NEXT:    buffer_gl0_inv
4626; GFX1032_ITERATIVE-NEXT:  .LBB11_4:
4627; GFX1032_ITERATIVE-NEXT:    s_endpgm
4628;
4629; GFX1164_ITERATIVE-LABEL: sub_i32_varying_nouse:
4630; GFX1164_ITERATIVE:       ; %bb.0: ; %entry
4631; GFX1164_ITERATIVE-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
4632; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
4633; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, 0
4634; GFX1164_ITERATIVE-NEXT:  .LBB11_1: ; %ComputeLoop
4635; GFX1164_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
4636; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s3, s[0:1]
4637; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
4638; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
4639; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s3
4640; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4641; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[4:5]
4642; GFX1164_ITERATIVE-NEXT:    s_add_i32 s2, s2, s6
4643; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
4644; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB11_1
4645; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
4646; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4647; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
4648; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4649; GFX1164_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4650; GFX1164_ITERATIVE-NEXT:    v_cmpx_eq_u32_e32 0, v0
4651; GFX1164_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
4652; GFX1164_ITERATIVE-NEXT:    s_cbranch_execz .LBB11_4
4653; GFX1164_ITERATIVE-NEXT:  ; %bb.3:
4654; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
4655; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v1, s2
4656; GFX1164_ITERATIVE-NEXT:    ds_sub_u32 v0, v1
4657; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
4658; GFX1164_ITERATIVE-NEXT:    buffer_gl0_inv
4659; GFX1164_ITERATIVE-NEXT:  .LBB11_4:
4660; GFX1164_ITERATIVE-NEXT:    s_endpgm
4661;
4662; GFX1132_ITERATIVE-LABEL: sub_i32_varying_nouse:
4663; GFX1132_ITERATIVE:       ; %bb.0: ; %entry
4664; GFX1132_ITERATIVE-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
4665; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s1, exec_lo
4666; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s0, 0
4667; GFX1132_ITERATIVE-NEXT:  .LBB11_1: ; %ComputeLoop
4668; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
4669; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s2, s1
4670; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
4671; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
4672; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s2, 1, s2
4673; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4674; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s2
4675; GFX1132_ITERATIVE-NEXT:    s_add_i32 s0, s0, s3
4676; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
4677; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB11_1
4678; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
4679; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4680; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s1, exec_lo
4681; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4682; GFX1132_ITERATIVE-NEXT:    v_cmpx_eq_u32_e32 0, v0
4683; GFX1132_ITERATIVE-NEXT:    s_xor_b32 s1, exec_lo, s1
4684; GFX1132_ITERATIVE-NEXT:    s_cbranch_execz .LBB11_4
4685; GFX1132_ITERATIVE-NEXT:  ; %bb.3:
4686; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
4687; GFX1132_ITERATIVE-NEXT:    ds_sub_u32 v0, v1
4688; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
4689; GFX1132_ITERATIVE-NEXT:    buffer_gl0_inv
4690; GFX1132_ITERATIVE-NEXT:  .LBB11_4:
4691; GFX1132_ITERATIVE-NEXT:    s_endpgm
4692;
4693; GFX7LESS_DPP-LABEL: sub_i32_varying_nouse:
4694; GFX7LESS_DPP:       ; %bb.0: ; %entry
4695; GFX7LESS_DPP-NEXT:    v_mov_b32_e32 v1, 0
4696; GFX7LESS_DPP-NEXT:    s_mov_b32 m0, -1
4697; GFX7LESS_DPP-NEXT:    ds_sub_u32 v1, v0
4698; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
4699; GFX7LESS_DPP-NEXT:    s_endpgm
4700;
4701; GFX8_DPP-LABEL: sub_i32_varying_nouse:
4702; GFX8_DPP:       ; %bb.0: ; %entry
4703; GFX8_DPP-NEXT:    v_mov_b32_e32 v2, 0
4704; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
4705; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
4706; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
4707; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
4708; GFX8_DPP-NEXT:    s_nop 1
4709; GFX8_DPP-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4710; GFX8_DPP-NEXT:    s_nop 1
4711; GFX8_DPP-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4712; GFX8_DPP-NEXT:    s_nop 1
4713; GFX8_DPP-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4714; GFX8_DPP-NEXT:    s_nop 1
4715; GFX8_DPP-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4716; GFX8_DPP-NEXT:    s_nop 1
4717; GFX8_DPP-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
4718; GFX8_DPP-NEXT:    s_nop 1
4719; GFX8_DPP-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
4720; GFX8_DPP-NEXT:    v_readlane_b32 s2, v1, 63
4721; GFX8_DPP-NEXT:    s_mov_b64 exec, s[0:1]
4722; GFX8_DPP-NEXT:    s_mov_b32 s0, s2
4723; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
4724; GFX8_DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4725; GFX8_DPP-NEXT:    s_cbranch_execz .LBB11_2
4726; GFX8_DPP-NEXT:  ; %bb.1:
4727; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, s0
4728; GFX8_DPP-NEXT:    s_mov_b32 m0, -1
4729; GFX8_DPP-NEXT:    ds_sub_u32 v2, v0
4730; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
4731; GFX8_DPP-NEXT:  .LBB11_2:
4732; GFX8_DPP-NEXT:    s_endpgm
4733;
4734; GFX9_DPP-LABEL: sub_i32_varying_nouse:
4735; GFX9_DPP:       ; %bb.0: ; %entry
4736; GFX9_DPP-NEXT:    v_mov_b32_e32 v2, 0
4737; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
4738; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
4739; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
4740; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
4741; GFX9_DPP-NEXT:    s_nop 1
4742; GFX9_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4743; GFX9_DPP-NEXT:    s_nop 1
4744; GFX9_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4745; GFX9_DPP-NEXT:    s_nop 1
4746; GFX9_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4747; GFX9_DPP-NEXT:    s_nop 1
4748; GFX9_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4749; GFX9_DPP-NEXT:    s_nop 1
4750; GFX9_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
4751; GFX9_DPP-NEXT:    s_nop 1
4752; GFX9_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
4753; GFX9_DPP-NEXT:    v_readlane_b32 s2, v1, 63
4754; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
4755; GFX9_DPP-NEXT:    s_mov_b32 s0, s2
4756; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
4757; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4758; GFX9_DPP-NEXT:    s_cbranch_execz .LBB11_2
4759; GFX9_DPP-NEXT:  ; %bb.1:
4760; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s0
4761; GFX9_DPP-NEXT:    ds_sub_u32 v2, v0
4762; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
4763; GFX9_DPP-NEXT:  .LBB11_2:
4764; GFX9_DPP-NEXT:    s_endpgm
4765;
4766; GFX1064_DPP-LABEL: sub_i32_varying_nouse:
4767; GFX1064_DPP:       ; %bb.0: ; %entry
4768; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
4769; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
4770; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4771; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4772; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4773; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4774; GFX1064_DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
4775; GFX1064_DPP-NEXT:    v_add_nc_u32_e32 v1, v1, v2
4776; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
4777; GFX1064_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4778; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
4779; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 0
4780; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v1, 32
4781; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
4782; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v0
4783; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, 0
4784; GFX1064_DPP-NEXT:    s_add_i32 s0, s2, s3
4785; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
4786; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
4787; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB11_2
4788; GFX1064_DPP-NEXT:  ; %bb.1:
4789; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, s0
4790; GFX1064_DPP-NEXT:    ds_sub_u32 v0, v3
4791; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
4792; GFX1064_DPP-NEXT:    buffer_gl0_inv
4793; GFX1064_DPP-NEXT:  .LBB11_2:
4794; GFX1064_DPP-NEXT:    s_endpgm
4795;
4796; GFX1032_DPP-LABEL: sub_i32_varying_nouse:
4797; GFX1032_DPP:       ; %bb.0: ; %entry
4798; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
4799; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
4800; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4801; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4802; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4803; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4804; GFX1032_DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
4805; GFX1032_DPP-NEXT:    v_add_nc_u32_e32 v1, v1, v2
4806; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
4807; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
4808; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, 0
4809; GFX1032_DPP-NEXT:    v_mov_b32_e32 v3, v1
4810; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
4811; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
4812; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB11_2
4813; GFX1032_DPP-NEXT:  ; %bb.1:
4814; GFX1032_DPP-NEXT:    ds_sub_u32 v0, v3
4815; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
4816; GFX1032_DPP-NEXT:    buffer_gl0_inv
4817; GFX1032_DPP-NEXT:  .LBB11_2:
4818; GFX1032_DPP-NEXT:    s_endpgm
4819;
4820; GFX1164_DPP-LABEL: sub_i32_varying_nouse:
4821; GFX1164_DPP:       ; %bb.0: ; %entry
4822; GFX1164_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
4823; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
4824; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
4825; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
4826; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4827; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4828; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4829; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4830; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4831; GFX1164_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4832; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4833; GFX1164_DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
4834; GFX1164_DPP-NEXT:    v_add_nc_u32_e32 v1, v1, v2
4835; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
4836; GFX1164_DPP-NEXT:    v_permlane64_b32 v2, v1
4837; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
4838; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4839; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
4840; GFX1164_DPP-NEXT:    s_waitcnt_depctr 0xfffe
4841; GFX1164_DPP-NEXT:    v_add_nc_u32_e32 v1, v1, v2
4842; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
4843; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
4844; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
4845; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, 0
4846; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, v1
4847; GFX1164_DPP-NEXT:    s_mov_b64 s[0:1], exec
4848; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3)
4849; GFX1164_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
4850; GFX1164_DPP-NEXT:    s_cbranch_execz .LBB11_2
4851; GFX1164_DPP-NEXT:  ; %bb.1:
4852; GFX1164_DPP-NEXT:    ds_sub_u32 v0, v3
4853; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
4854; GFX1164_DPP-NEXT:    buffer_gl0_inv
4855; GFX1164_DPP-NEXT:  .LBB11_2:
4856; GFX1164_DPP-NEXT:    s_endpgm
4857;
4858; GFX1132_DPP-LABEL: sub_i32_varying_nouse:
4859; GFX1132_DPP:       ; %bb.0: ; %entry
4860; GFX1132_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
4861; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
4862; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
4863; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
4864; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4865; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
4866; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
4867; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4868; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
4869; GFX1132_DPP-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
4870; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4871; GFX1132_DPP-NEXT:    v_permlanex16_b32 v2, v1, 0, 0
4872; GFX1132_DPP-NEXT:    v_add_nc_u32_e32 v1, v1, v2
4873; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
4874; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
4875; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
4876; GFX1132_DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1
4877; GFX1132_DPP-NEXT:    s_mov_b32 s0, exec_lo
4878; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
4879; GFX1132_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
4880; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB11_2
4881; GFX1132_DPP-NEXT:  ; %bb.1:
4882; GFX1132_DPP-NEXT:    ds_sub_u32 v0, v3
4883; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
4884; GFX1132_DPP-NEXT:    buffer_gl0_inv
4885; GFX1132_DPP-NEXT:  .LBB11_2:
4886; GFX1132_DPP-NEXT:    s_endpgm
4887entry:
4888  %lane = call i32 @llvm.amdgcn.workitem.id.x()
4889  %old = atomicrmw sub ptr addrspace(3) @local_var32, i32 %lane acq_rel
4890  ret void
4891}
4892
; sub_i64_constant: 64-bit `atomicrmw sub` of the constant 5 on @local_var64
; (address space 3); the value returned by the atomic is stored to %out.
;
; What the assertions below pin down for each checked target:
;   * only the lane whose mbcnt over the saved exec mask equals 0 enters the
;     block that issues the atomic;
;   * that lane issues one ds_sub_rtn_u64 whose low operand dword is
;     popcount(exec) * 5 (s_bcnt1 followed by s_mul_i32); the register holding
;     0 is used both as the address operand and as the high operand dword;
;   * once exec is restored, every lane computes
;     readfirstlane(returned value) - 5 * mbcnt as a 64-bit subtract-with-borrow
;     and stores that to %out.
; The prefixes used here carry no _ITERATIVE/_DPP suffix, so the expected output
; is presumably the same for both optimizer strategies.
; These check lines are autogenerated (see the header of this file); regenerate
; them with utils/update_llc_test_checks.py rather than editing them by hand.
4893define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
4894; GFX7LESS-LABEL: sub_i64_constant:
4895; GFX7LESS:       ; %bb.0: ; %entry
4896; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
4897; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
4898; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s3, v0
4899; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
4900; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
4901; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
4902; GFX7LESS-NEXT:    s_cbranch_execz .LBB12_2
4903; GFX7LESS-NEXT:  ; %bb.1:
4904; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
4905; GFX7LESS-NEXT:    s_mul_i32 s2, s2, 5
4906; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
4907; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s2
4908; GFX7LESS-NEXT:    s_mov_b32 m0, -1
4909; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
4910; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4911; GFX7LESS-NEXT:  .LBB12_2:
4912; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
4913; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
4914; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
4915; GFX7LESS-NEXT:    s_mov_b32 s2, -1
4916; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
4917; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v0
4918; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
4919; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
4920; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
4921; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s5, v0
4922; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
4923; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
4924; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4925; GFX7LESS-NEXT:    s_endpgm
4926;
4927; GFX8-LABEL: sub_i64_constant:
4928; GFX8:       ; %bb.0: ; %entry
4929; GFX8-NEXT:    s_mov_b64 s[2:3], exec
4930; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
4931; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s3, v0
4932; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
4933; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
4934; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
4935; GFX8-NEXT:    s_cbranch_execz .LBB12_2
4936; GFX8-NEXT:  ; %bb.1:
4937; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
4938; GFX8-NEXT:    s_mul_i32 s2, s2, 5
4939; GFX8-NEXT:    v_mov_b32_e32 v0, s2
4940; GFX8-NEXT:    v_mov_b32_e32 v1, 0
4941; GFX8-NEXT:    s_mov_b32 m0, -1
4942; GFX8-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
4943; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4944; GFX8-NEXT:  .LBB12_2:
4945; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
4946; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
4947; GFX8-NEXT:    v_readfirstlane_b32 s4, v1
4948; GFX8-NEXT:    v_readfirstlane_b32 s5, v0
4949; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
4950; GFX8-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
4951; GFX8-NEXT:    v_mov_b32_e32 v2, s4
4952; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s5, v0
4953; GFX8-NEXT:    s_mov_b32 s3, 0xf000
4954; GFX8-NEXT:    s_mov_b32 s2, -1
4955; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
4956; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4957; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4958; GFX8-NEXT:    s_endpgm
4959;
4960; GFX9-LABEL: sub_i64_constant:
4961; GFX9:       ; %bb.0: ; %entry
4962; GFX9-NEXT:    s_mov_b64 s[2:3], exec
4963; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
4964; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s3, v0
4965; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
4966; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
4967; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
4968; GFX9-NEXT:    s_cbranch_execz .LBB12_2
4969; GFX9-NEXT:  ; %bb.1:
4970; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
4971; GFX9-NEXT:    s_mul_i32 s2, s2, 5
4972; GFX9-NEXT:    v_mov_b32_e32 v0, s2
4973; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4974; GFX9-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
4975; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4976; GFX9-NEXT:  .LBB12_2:
4977; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
4978; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
4979; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
4980; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
4981; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
4982; GFX9-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
4983; GFX9-NEXT:    v_mov_b32_e32 v2, s4
4984; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s5, v0
4985; GFX9-NEXT:    s_mov_b32 s3, 0xf000
4986; GFX9-NEXT:    s_mov_b32 s2, -1
4987; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
4988; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4989; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4990; GFX9-NEXT:    s_endpgm
4991;
4992; GFX1064-LABEL: sub_i64_constant:
4993; GFX1064:       ; %bb.0: ; %entry
4994; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
4995; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
4996; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s3, v0
4997; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
4998; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
4999; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
5000; GFX1064-NEXT:    s_cbranch_execz .LBB12_2
5001; GFX1064-NEXT:  ; %bb.1:
5002; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
5003; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
5004; GFX1064-NEXT:    s_mul_i32 s2, s2, 5
5005; GFX1064-NEXT:    v_mov_b32_e32 v0, s2
5006; GFX1064-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
5007; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5008; GFX1064-NEXT:    buffer_gl0_inv
5009; GFX1064-NEXT:  .LBB12_2:
5010; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
5011; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
5012; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
5013; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
5014; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
5015; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
5016; GFX1064-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
5017; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v0
5018; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
5019; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
5020; GFX1064-NEXT:    s_mov_b32 s2, -1
5021; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5022; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5023; GFX1064-NEXT:    s_endpgm
5024;
5025; GFX1032-LABEL: sub_i64_constant:
5026; GFX1032:       ; %bb.0: ; %entry
5027; GFX1032-NEXT:    s_mov_b32 s1, exec_lo
5028; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
5029; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s1, 0
5030; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
5031; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
5032; GFX1032-NEXT:    s_cbranch_execz .LBB12_2
5033; GFX1032-NEXT:  ; %bb.1:
5034; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s1
5035; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
5036; GFX1032-NEXT:    s_mul_i32 s1, s1, 5
5037; GFX1032-NEXT:    v_mov_b32_e32 v0, s1
5038; GFX1032-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
5039; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5040; GFX1032-NEXT:    buffer_gl0_inv
5041; GFX1032-NEXT:  .LBB12_2:
5042; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
5043; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
5044; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
5045; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
5046; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
5047; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
5048; GFX1032-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
5049; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v0
5050; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
5051; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
5052; GFX1032-NEXT:    s_mov_b32 s2, -1
5053; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5054; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5055; GFX1032-NEXT:    s_endpgm
5056;
5057; GFX1164-LABEL: sub_i64_constant:
5058; GFX1164:       ; %bb.0: ; %entry
5059; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
5060; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
5061; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
5062; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5063; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, s3, v0
5064; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
5065; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
5066; GFX1164-NEXT:    s_cbranch_execz .LBB12_2
5067; GFX1164-NEXT:  ; %bb.1:
5068; GFX1164-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
5069; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
5070; GFX1164-NEXT:    s_mul_i32 s2, s2, 5
5071; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
5072; GFX1164-NEXT:    v_mov_b32_e32 v0, s2
5073; GFX1164-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
5074; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5075; GFX1164-NEXT:    buffer_gl0_inv
5076; GFX1164-NEXT:  .LBB12_2:
5077; GFX1164-NEXT:    s_or_b64 exec, exec, s[0:1]
5078; GFX1164-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
5079; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
5080; GFX1164-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
5081; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
5082; GFX1164-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
5083; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
5084; GFX1164-NEXT:    v_sub_co_u32 v0, vcc, s2, v0
5085; GFX1164-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
5086; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
5087; GFX1164-NEXT:    s_mov_b32 s2, -1
5088; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5089; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
5090; GFX1164-NEXT:    s_endpgm
5091;
5092; GFX1132-LABEL: sub_i64_constant:
5093; GFX1132:       ; %bb.0: ; %entry
5094; GFX1132-NEXT:    s_mov_b32 s1, exec_lo
5095; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
5096; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, s1, 0
5097; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
5098; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5099; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
5100; GFX1132-NEXT:    s_cbranch_execz .LBB12_2
5101; GFX1132-NEXT:  ; %bb.1:
5102; GFX1132-NEXT:    s_bcnt1_i32_b32 s1, s1
5103; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
5104; GFX1132-NEXT:    s_mul_i32 s1, s1, 5
5105; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
5106; GFX1132-NEXT:    v_mov_b32_e32 v0, s1
5107; GFX1132-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
5108; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
5109; GFX1132-NEXT:    buffer_gl0_inv
5110; GFX1132-NEXT:  .LBB12_2:
5111; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s0
5112; GFX1132-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
5113; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
5114; GFX1132-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
5115; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
5116; GFX1132-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
5117; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
5118; GFX1132-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v0
5119; GFX1132-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
5120; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
5121; GFX1132-NEXT:    s_mov_b32 s2, -1
5122; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
5123; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
5124; GFX1132-NEXT:    s_endpgm
5125entry:
  ; %old receives the contents of @local_var64 from before the subtraction;
  ; the operand is the immediate 5, i.e. identical for every lane.
5126  %old = atomicrmw sub ptr addrspace(3) @local_var64, i64 5 acq_rel
  ; Storing %old gives the atomic's result a use, so the checks above cover the
  ; returning (ds_sub_rtn_u64) form and the per-lane reconstruction of %old.
5127  store i64 %old, ptr addrspace(1) %out
5128  ret void
5129}
5130
; sub_i64_uniform: 64-bit `atomicrmw sub` on @local_var64 (address space 3)
; whose operand is the i64 kernel argument %subitive; the value returned by the
; atomic is stored to %out.
;
; What the assertions below pin down for each checked target:
;   * both kernel arguments are fetched with one 128-bit scalar load
;     (s_load_dwordx4 / s_load_b128) into s[0:3]; s[2:3] is then used as the
;     multiplicand and s[0:1] as the store base;
;   * only the lane whose mbcnt over the saved exec mask equals 0 enters the
;     block that issues the atomic;
;   * that lane issues one ds_sub_rtn_u64 whose operand is the 64-bit product
;     s[2:3] * popcount(exec): s_mul_i32 + v_mul_hi_u32 on GFX7LESS,
;     v_mad_u64_u32 + s_mul_i32 on GFX8, and s_mul_i32 + s_mul_hi_u32 +
;     s_add_i32 on the remaining targets;
;   * once exec is restored, every lane computes
;     readfirstlane(returned value) - s[2:3] * mbcnt as a 64-bit
;     subtract-with-borrow and stores that to %out.
; The prefixes used here carry no _ITERATIVE/_DPP suffix, so the expected output
; is presumably the same for both optimizer strategies.
; These check lines are autogenerated (see the header of this file); regenerate
; them with utils/update_llc_test_checks.py rather than editing them by hand.
5131define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) {
5132; GFX7LESS-LABEL: sub_i64_uniform:
5133; GFX7LESS:       ; %bb.0: ; %entry
5134; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
5135; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
5136; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
5137; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s7, v0
5138; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
5139; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
5140; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5141; GFX7LESS-NEXT:    s_cbranch_execz .LBB13_2
5142; GFX7LESS-NEXT:  ; %bb.1:
5143; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
5144; GFX7LESS-NEXT:    v_mov_b32_e32 v3, 0
5145; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5146; GFX7LESS-NEXT:    s_mul_i32 s7, s3, s6
5147; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
5148; GFX7LESS-NEXT:    v_mul_hi_u32 v0, s2, v0
5149; GFX7LESS-NEXT:    s_mul_i32 s6, s2, s6
5150; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, s7, v0
5151; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
5152; GFX7LESS-NEXT:    s_mov_b32 m0, -1
5153; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
5154; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5155; GFX7LESS-NEXT:  .LBB13_2:
5156; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
5157; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
5158; GFX7LESS-NEXT:    s_mov_b32 s6, -1
5159; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
5160; GFX7LESS-NEXT:    s_mov_b32 s4, s0
5161; GFX7LESS-NEXT:    s_mov_b32 s5, s1
5162; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
5163; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v0
5164; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s3, v2
5165; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s2, v2
5166; GFX7LESS-NEXT:    v_mul_lo_u32 v2, s2, v2
5167; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
5168; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s0
5169; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s1, v2
5170; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
5171; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5172; GFX7LESS-NEXT:    s_endpgm
5173;
5174; GFX8-LABEL: sub_i64_uniform:
5175; GFX8:       ; %bb.0: ; %entry
5176; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5177; GFX8-NEXT:    s_mov_b64 s[6:7], exec
5178; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
5179; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
5180; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
5181; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
5182; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5183; GFX8-NEXT:    s_cbranch_execz .LBB13_2
5184; GFX8-NEXT:  ; %bb.1:
5185; GFX8-NEXT:    s_bcnt1_i32_b64 s8, s[6:7]
5186; GFX8-NEXT:    v_mov_b32_e32 v0, s8
5187; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5188; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0
5189; GFX8-NEXT:    s_mul_i32 s6, s3, s8
5190; GFX8-NEXT:    v_mov_b32_e32 v3, 0
5191; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s6, v1
5192; GFX8-NEXT:    s_mov_b32 m0, -1
5193; GFX8-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
5194; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5195; GFX8-NEXT:  .LBB13_2:
5196; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
5197; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5198; GFX8-NEXT:    s_mov_b32 s4, s0
5199; GFX8-NEXT:    s_mov_b32 s5, s1
5200; GFX8-NEXT:    v_mul_lo_u32 v4, s3, v2
5201; GFX8-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0
5202; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
5203; GFX8-NEXT:    v_readfirstlane_b32 s1, v0
5204; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v4
5205; GFX8-NEXT:    v_mov_b32_e32 v3, s0
5206; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s1, v2
5207; GFX8-NEXT:    s_mov_b32 s7, 0xf000
5208; GFX8-NEXT:    s_mov_b32 s6, -1
5209; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
5210; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5211; GFX8-NEXT:    s_endpgm
5212;
5213; GFX9-LABEL: sub_i64_uniform:
5214; GFX9:       ; %bb.0: ; %entry
5215; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5216; GFX9-NEXT:    s_mov_b64 s[6:7], exec
5217; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
5218; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
5219; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
5220; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
5221; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5222; GFX9-NEXT:    s_cbranch_execz .LBB13_2
5223; GFX9-NEXT:  ; %bb.1:
5224; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
5225; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5226; GFX9-NEXT:    s_mul_i32 s7, s3, s6
5227; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
5228; GFX9-NEXT:    s_add_i32 s8, s8, s7
5229; GFX9-NEXT:    s_mul_i32 s6, s2, s6
5230; GFX9-NEXT:    v_mov_b32_e32 v0, s6
5231; GFX9-NEXT:    v_mov_b32_e32 v1, s8
5232; GFX9-NEXT:    v_mov_b32_e32 v3, 0
5233; GFX9-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
5234; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5235; GFX9-NEXT:  .LBB13_2:
5236; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
5237; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5238; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0
5239; GFX9-NEXT:    s_mov_b32 s4, s0
5240; GFX9-NEXT:    s_mov_b32 s5, s1
5241; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5]
5242; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
5243; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
5244; GFX9-NEXT:    v_mov_b32_e32 v1, v4
5245; GFX9-NEXT:    v_mov_b32_e32 v2, s0
5246; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s1, v3
5247; GFX9-NEXT:    s_mov_b32 s7, 0xf000
5248; GFX9-NEXT:    s_mov_b32 s6, -1
5249; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
5250; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5251; GFX9-NEXT:    s_endpgm
5252;
5253; GFX1064-LABEL: sub_i64_uniform:
5254; GFX1064:       ; %bb.0: ; %entry
5255; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5256; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
5257; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
5258; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
5259; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
5260; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
5261; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5262; GFX1064-NEXT:    s_cbranch_execz .LBB13_2
5263; GFX1064-NEXT:  ; %bb.1:
5264; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
5265; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
5266; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5267; GFX1064-NEXT:    s_mul_i32 s7, s3, s6
5268; GFX1064-NEXT:    s_mul_hi_u32 s8, s2, s6
5269; GFX1064-NEXT:    s_mul_i32 s6, s2, s6
5270; GFX1064-NEXT:    s_add_i32 s8, s8, s7
5271; GFX1064-NEXT:    v_mov_b32_e32 v0, s6
5272; GFX1064-NEXT:    v_mov_b32_e32 v1, s8
5273; GFX1064-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
5274; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5275; GFX1064-NEXT:    buffer_gl0_inv
5276; GFX1064-NEXT:  .LBB13_2:
5277; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
5278; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
5279; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
5280; GFX1064-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0
5281; GFX1064-NEXT:    v_readfirstlane_b32 s4, v1
5282; GFX1064-NEXT:    v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5]
5283; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
5284; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
5285; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v3
5286; GFX1064-NEXT:    v_mov_b32_e32 v1, v4
5287; GFX1064-NEXT:    s_mov_b32 s2, -1
5288; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
5289; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5290; GFX1064-NEXT:    s_endpgm
5291;
5292; GFX1032-LABEL: sub_i64_uniform:
5293; GFX1032:       ; %bb.0: ; %entry
5294; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5295; GFX1032-NEXT:    s_mov_b32 s6, exec_lo
5296; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
5297; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s6, 0
5298; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
5299; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
5300; GFX1032-NEXT:    s_cbranch_execz .LBB13_2
5301; GFX1032-NEXT:  ; %bb.1:
5302; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s6
5303; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
5304; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5305; GFX1032-NEXT:    s_mul_i32 s6, s3, s5
5306; GFX1032-NEXT:    s_mul_hi_u32 s7, s2, s5
5307; GFX1032-NEXT:    s_mul_i32 s5, s2, s5
5308; GFX1032-NEXT:    s_add_i32 s7, s7, s6
5309; GFX1032-NEXT:    v_mov_b32_e32 v0, s5
5310; GFX1032-NEXT:    v_mov_b32_e32 v1, s7
5311; GFX1032-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
5312; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5313; GFX1032-NEXT:    buffer_gl0_inv
5314; GFX1032-NEXT:  .LBB13_2:
5315; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
5316; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
5317; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
5318; GFX1032-NEXT:    v_mad_u64_u32 v[3:4], s2, s2, v2, 0
5319; GFX1032-NEXT:    v_readfirstlane_b32 s4, v1
5320; GFX1032-NEXT:    v_mad_u64_u32 v[4:5], s2, s3, v2, v[4:5]
5321; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
5322; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
5323; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v3
5324; GFX1032-NEXT:    v_mov_b32_e32 v1, v4
5325; GFX1032-NEXT:    s_mov_b32 s2, -1
5326; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
5327; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5328; GFX1032-NEXT:    s_endpgm
5329;
5330; GFX1164-LABEL: sub_i64_uniform:
5331; GFX1164:       ; %bb.0: ; %entry
5332; GFX1164-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
5333; GFX1164-NEXT:    s_mov_b64 s[6:7], exec
5334; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
5335; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
5336; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5337; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
5338; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
5339; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
5340; GFX1164-NEXT:    s_cbranch_execz .LBB13_2
5341; GFX1164-NEXT:  ; %bb.1:
5342; GFX1164-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
5343; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
5344; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5345; GFX1164-NEXT:    s_mul_i32 s7, s3, s6
5346; GFX1164-NEXT:    s_mul_hi_u32 s8, s2, s6
5347; GFX1164-NEXT:    s_mul_i32 s6, s2, s6
5348; GFX1164-NEXT:    s_add_i32 s8, s8, s7
5349; GFX1164-NEXT:    v_mov_b32_e32 v0, s6
5350; GFX1164-NEXT:    v_mov_b32_e32 v1, s8
5351; GFX1164-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
5352; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5353; GFX1164-NEXT:    buffer_gl0_inv
5354; GFX1164-NEXT:  .LBB13_2:
5355; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
5356; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
5357; GFX1164-NEXT:    v_mad_u64_u32 v[3:4], null, s2, v2, 0
5358; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
5359; GFX1164-NEXT:    v_readfirstlane_b32 s4, v1
5360; GFX1164-NEXT:    s_waitcnt_depctr 0xfff
5361; GFX1164-NEXT:    v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5]
5362; GFX1164-NEXT:    v_sub_co_u32 v0, vcc, s2, v3
5363; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
5364; GFX1164-NEXT:    s_mov_b32 s2, -1
5365; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
5366; GFX1164-NEXT:    v_mov_b32_e32 v1, v5
5367; GFX1164-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
5368; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
5369; GFX1164-NEXT:    s_endpgm
5370;
5371; GFX1132-LABEL: sub_i64_uniform:
5372; GFX1132:       ; %bb.0: ; %entry
5373; GFX1132-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
5374; GFX1132-NEXT:    s_mov_b32 s6, exec_lo
5375; GFX1132-NEXT:    s_mov_b32 s4, exec_lo
5376; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, s6, 0
5377; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
5378; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5379; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
5380; GFX1132-NEXT:    s_cbranch_execz .LBB13_2
5381; GFX1132-NEXT:  ; %bb.1:
5382; GFX1132-NEXT:    s_bcnt1_i32_b32 s5, s6
5383; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
5384; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
5385; GFX1132-NEXT:    s_mul_i32 s6, s3, s5
5386; GFX1132-NEXT:    s_mul_hi_u32 s7, s2, s5
5387; GFX1132-NEXT:    s_mul_i32 s5, s2, s5
5388; GFX1132-NEXT:    s_add_i32 s7, s7, s6
5389; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
5390; GFX1132-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s7
5391; GFX1132-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
5392; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
5393; GFX1132-NEXT:    buffer_gl0_inv
5394; GFX1132-NEXT:  .LBB13_2:
5395; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s4
5396; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
5397; GFX1132-NEXT:    v_mad_u64_u32 v[3:4], null, s2, v2, 0
5398; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
5399; GFX1132-NEXT:    v_readfirstlane_b32 s4, v1
5400; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
5401; GFX1132-NEXT:    v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5]
5402; GFX1132-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v3
5403; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
5404; GFX1132-NEXT:    s_mov_b32 s2, -1
5405; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
5406; GFX1132-NEXT:    v_mov_b32_e32 v1, v5
5407; GFX1132-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
5408; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
5409; GFX1132-NEXT:    s_endpgm
5410entry:
  ; %old receives the contents of @local_var64 from before the subtraction;
  ; the operand is a kernel argument rather than an immediate, which is why the
  ; checks above contain a 64-bit multiply by the active-lane count.
5411  %old = atomicrmw sub ptr addrspace(3) @local_var64, i64 %subitive acq_rel
  ; Storing %old gives the atomic's result a use, so the checks above cover the
  ; returning (ds_sub_rtn_u64) form and the per-lane reconstruction of %old.
5412  store i64 %old, ptr addrspace(1) %out
5413  ret void
5414}
5415
5416define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
5417; GFX7LESS_ITERATIVE-LABEL: sub_i64_varying:
5418; GFX7LESS_ITERATIVE:       ; %bb.0: ; %entry
5419; GFX7LESS_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
5420; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
5421; GFX7LESS_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
5422; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
5423; GFX7LESS_ITERATIVE-NEXT:  .LBB14_1: ; %ComputeLoop
5424; GFX7LESS_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
5425; GFX7LESS_ITERATIVE-NEXT:    s_ff1_i32_b64 s6, s[2:3]
5426; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, s6
5427; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s6
5428; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s6
5429; GFX7LESS_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
5430; GFX7LESS_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
5431; GFX7LESS_ITERATIVE-NEXT:    s_add_u32 s0, s0, s8
5432; GFX7LESS_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s7
5433; GFX7LESS_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s6
5434; GFX7LESS_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
5435; GFX7LESS_ITERATIVE-NEXT:    v_cmp_ne_u64_e64 s[6:7], s[2:3], 0
5436; GFX7LESS_ITERATIVE-NEXT:    s_and_b64 vcc, exec, s[6:7]
5437; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_vccnz .LBB14_1
5438; GFX7LESS_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
5439; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
5440; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
5441; GFX7LESS_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5442; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
5443; GFX7LESS_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5444; GFX7LESS_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
5445; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_execz .LBB14_4
5446; GFX7LESS_ITERATIVE-NEXT:  ; %bb.3:
5447; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
5448; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
5449; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
5450; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, -1
5451; GFX7LESS_ITERATIVE-NEXT:    ds_sub_rtn_u64 v[3:4], v0, v[3:4]
5452; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
5453; GFX7LESS_ITERATIVE-NEXT:  .LBB14_4:
5454; GFX7LESS_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
5455; GFX7LESS_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
5456; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
5457; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s2, -1
5458; GFX7LESS_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v4
5459; GFX7LESS_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v3
5460; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s4
5461; GFX7LESS_ITERATIVE-NEXT:    v_sub_i32_e32 v0, vcc, s5, v1
5462; GFX7LESS_ITERATIVE-NEXT:    v_subb_u32_e32 v1, vcc, v3, v2, vcc
5463; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
5464; GFX7LESS_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5465; GFX7LESS_ITERATIVE-NEXT:    s_endpgm
5466;
5467; GFX8_ITERATIVE-LABEL: sub_i64_varying:
5468; GFX8_ITERATIVE:       ; %bb.0: ; %entry
5469; GFX8_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
5470; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
5471; GFX8_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
5472; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
5473; GFX8_ITERATIVE-NEXT:  .LBB14_1: ; %ComputeLoop
5474; GFX8_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
5475; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s6, s[2:3]
5476; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s6
5477; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s6
5478; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s6
5479; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
5480; GFX8_ITERATIVE-NEXT:    s_add_u32 s0, s0, s8
5481; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
5482; GFX8_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s7
5483; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s6
5484; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
5485; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
5486; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB14_1
5487; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
5488; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5489; GFX8_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5490; GFX8_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5491; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
5492; GFX8_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5493; GFX8_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
5494; GFX8_ITERATIVE-NEXT:    s_cbranch_execz .LBB14_4
5495; GFX8_ITERATIVE-NEXT:  ; %bb.3:
5496; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
5497; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
5498; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
5499; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, -1
5500; GFX8_ITERATIVE-NEXT:    ds_sub_rtn_u64 v[3:4], v0, v[3:4]
5501; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
5502; GFX8_ITERATIVE-NEXT:  .LBB14_4:
5503; GFX8_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
5504; GFX8_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
5505; GFX8_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v4
5506; GFX8_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v3
5507; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s4
5508; GFX8_ITERATIVE-NEXT:    v_sub_u32_e32 v0, vcc, s5, v1
5509; GFX8_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
5510; GFX8_ITERATIVE-NEXT:    s_mov_b32 s2, -1
5511; GFX8_ITERATIVE-NEXT:    v_subb_u32_e32 v1, vcc, v3, v2, vcc
5512; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
5513; GFX8_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5514; GFX8_ITERATIVE-NEXT:    s_endpgm
5515;
5516; GFX9_ITERATIVE-LABEL: sub_i64_varying:
5517; GFX9_ITERATIVE:       ; %bb.0: ; %entry
5518; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
5519; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
5520; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
5521; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
5522; GFX9_ITERATIVE-NEXT:  .LBB14_1: ; %ComputeLoop
5523; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
5524; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s6, s[2:3]
5525; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s6
5526; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s6
5527; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s6
5528; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
5529; GFX9_ITERATIVE-NEXT:    s_add_u32 s0, s0, s8
5530; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
5531; GFX9_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s7
5532; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s6
5533; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
5534; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
5535; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB14_1
5536; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
5537; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5538; GFX9_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5539; GFX9_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5540; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
5541; GFX9_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5542; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
5543; GFX9_ITERATIVE-NEXT:    s_cbranch_execz .LBB14_4
5544; GFX9_ITERATIVE-NEXT:  ; %bb.3:
5545; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
5546; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
5547; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
5548; GFX9_ITERATIVE-NEXT:    ds_sub_rtn_u64 v[3:4], v0, v[3:4]
5549; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
5550; GFX9_ITERATIVE-NEXT:  .LBB14_4:
5551; GFX9_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
5552; GFX9_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
5553; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v4
5554; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v3
5555; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s4
5556; GFX9_ITERATIVE-NEXT:    v_sub_co_u32_e32 v0, vcc, s5, v1
5557; GFX9_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
5558; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, -1
5559; GFX9_ITERATIVE-NEXT:    v_subb_co_u32_e32 v1, vcc, v3, v2, vcc
5560; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
5561; GFX9_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5562; GFX9_ITERATIVE-NEXT:    s_endpgm
5563;
5564; GFX1064_ITERATIVE-LABEL: sub_i64_varying:
5565; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
5566; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
5567; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
5568; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
5569; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
5570; GFX1064_ITERATIVE-NEXT:  .LBB14_1: ; %ComputeLoop
5571; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
5572; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s6, s[2:3]
5573; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s7, v0, s6
5574; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s8, v3, s6
5575; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s6
5576; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s6
5577; GFX1064_ITERATIVE-NEXT:    s_add_u32 s0, s0, s7
5578; GFX1064_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s8
5579; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s6
5580; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
5581; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
5582; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB14_1
5583; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
5584; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5585; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
5586; GFX1064_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5587; GFX1064_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
5588; GFX1064_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5589; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
5590; GFX1064_ITERATIVE-NEXT:    s_cbranch_execz .LBB14_4
5591; GFX1064_ITERATIVE-NEXT:  ; %bb.3:
5592; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
5593; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
5594; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
5595; GFX1064_ITERATIVE-NEXT:    ds_sub_rtn_u64 v[3:4], v0, v[3:4]
5596; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
5597; GFX1064_ITERATIVE-NEXT:    buffer_gl0_inv
5598; GFX1064_ITERATIVE-NEXT:  .LBB14_4:
5599; GFX1064_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
5600; GFX1064_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
5601; GFX1064_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
5602; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
5603; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v4
5604; GFX1064_ITERATIVE-NEXT:    v_sub_co_u32 v0, vcc, s2, v1
5605; GFX1064_ITERATIVE-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc
5606; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
5607; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, -1
5608; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
5609; GFX1064_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5610; GFX1064_ITERATIVE-NEXT:    s_endpgm
5611;
5612; GFX1032_ITERATIVE-LABEL: sub_i64_varying:
5613; GFX1032_ITERATIVE:       ; %bb.0: ; %entry
5614; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
5615; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, exec_lo
5616; GFX1032_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
5617; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
5618; GFX1032_ITERATIVE-NEXT:  .LBB14_1: ; %ComputeLoop
5619; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
5620; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s3, s2
5621; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
5622; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
5623; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s3
5624; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s3
5625; GFX1032_ITERATIVE-NEXT:    s_add_u32 s0, s0, s6
5626; GFX1032_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s7
5627; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
5628; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s2, s2, s3
5629; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
5630; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB14_1
5631; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
5632; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5633; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
5634; GFX1032_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
5635; GFX1032_ITERATIVE-NEXT:    s_and_saveexec_b32 s2, vcc_lo
5636; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s2, exec_lo, s2
5637; GFX1032_ITERATIVE-NEXT:    s_cbranch_execz .LBB14_4
5638; GFX1032_ITERATIVE-NEXT:  ; %bb.3:
5639; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
5640; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
5641; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
5642; GFX1032_ITERATIVE-NEXT:    ds_sub_rtn_u64 v[3:4], v0, v[3:4]
5643; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
5644; GFX1032_ITERATIVE-NEXT:    buffer_gl0_inv
5645; GFX1032_ITERATIVE-NEXT:  .LBB14_4:
5646; GFX1032_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
5647; GFX1032_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s2
5648; GFX1032_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
5649; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
5650; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v4
5651; GFX1032_ITERATIVE-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v1
5652; GFX1032_ITERATIVE-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
5653; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
5654; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, -1
5655; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
5656; GFX1032_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5657; GFX1032_ITERATIVE-NEXT:    s_endpgm
5658;
5659; GFX1164_ITERATIVE-LABEL: sub_i64_varying:
5660; GFX1164_ITERATIVE:       ; %bb.0: ; %entry
5661; GFX1164_ITERATIVE-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
5662; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
5663; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
5664; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
5665; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
5666; GFX1164_ITERATIVE-NEXT:  .LBB14_1: ; %ComputeLoop
5667; GFX1164_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
5668; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s6, s[2:3]
5669; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
5670; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s7, v2, s6
5671; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s8, v3, s6
5672; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s6
5673; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s6
5674; GFX1164_ITERATIVE-NEXT:    s_add_u32 s0, s0, s7
5675; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
5676; GFX1164_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s8
5677; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s6
5678; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[6:7]
5679; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
5680; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
5681; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB14_1
5682; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
5683; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
5684; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5685; GFX1164_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
5686; GFX1164_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
5687; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr2_vgpr3
5688; GFX1164_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
5689; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
5690; GFX1164_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
5691; GFX1164_ITERATIVE-NEXT:    s_cbranch_execz .LBB14_4
5692; GFX1164_ITERATIVE-NEXT:  ; %bb.3:
5693; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s1
5694; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v4, 0
5695; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s0
5696; GFX1164_ITERATIVE-NEXT:    ds_sub_rtn_u64 v[2:3], v4, v[2:3]
5697; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
5698; GFX1164_ITERATIVE-NEXT:    buffer_gl0_inv
5699; GFX1164_ITERATIVE-NEXT:  .LBB14_4:
5700; GFX1164_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
5701; GFX1164_ITERATIVE-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
5702; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v2
5703; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v3
5704; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
5705; GFX1164_ITERATIVE-NEXT:    v_sub_co_u32 v0, vcc, s2, v0
5706; GFX1164_ITERATIVE-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
5707; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
5708; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
5709; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
5710; GFX1164_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
5711; GFX1164_ITERATIVE-NEXT:    s_endpgm
5712;
5713; GFX1132_ITERATIVE-LABEL: sub_i64_varying:
5714; GFX1132_ITERATIVE:       ; %bb.0: ; %entry
5715; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0
5716; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, exec_lo
5717; GFX1132_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
5718; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
5719; GFX1132_ITERATIVE-NEXT:  .LBB14_1: ; %ComputeLoop
5720; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
5721; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s3, s2
5722; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
5723; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
5724; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
5725; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
5726; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s3
5727; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
5728; GFX1132_ITERATIVE-NEXT:    s_add_u32 s0, s0, s6
5729; GFX1132_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s7
5730; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
5731; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
5732; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s2, s2, s3
5733; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
5734; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB14_1
5735; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
5736; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
5737; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
5738; GFX1132_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
5739; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr2_vgpr3
5740; GFX1132_ITERATIVE-NEXT:    s_and_saveexec_b32 s2, vcc_lo
5741; GFX1132_ITERATIVE-NEXT:    s_xor_b32 s2, exec_lo, s2
5742; GFX1132_ITERATIVE-NEXT:    s_cbranch_execz .LBB14_4
5743; GFX1132_ITERATIVE-NEXT:  ; %bb.3:
5744; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1
5745; GFX1132_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s0
5746; GFX1132_ITERATIVE-NEXT:    ds_sub_rtn_u64 v[2:3], v4, v[2:3]
5747; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
5748; GFX1132_ITERATIVE-NEXT:    buffer_gl0_inv
5749; GFX1132_ITERATIVE-NEXT:  .LBB14_4:
5750; GFX1132_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s2
5751; GFX1132_ITERATIVE-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
5752; GFX1132_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v2
5753; GFX1132_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v3
5754; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
5755; GFX1132_ITERATIVE-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v0
5756; GFX1132_ITERATIVE-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
5757; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
5758; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
5759; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
5760; GFX1132_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
5761; GFX1132_ITERATIVE-NEXT:    s_endpgm
5762;
5763; GFX7LESS_DPP-LABEL: sub_i64_varying:
5764; GFX7LESS_DPP:       ; %bb.0: ; %entry
5765; GFX7LESS_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
5766; GFX7LESS_DPP-NEXT:    v_mov_b32_e32 v1, 0
5767; GFX7LESS_DPP-NEXT:    s_mov_b32 m0, -1
5768; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
5769; GFX7LESS_DPP-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
5770; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
5771; GFX7LESS_DPP-NEXT:    s_mov_b32 s3, 0xf000
5772; GFX7LESS_DPP-NEXT:    s_mov_b32 s2, -1
5773; GFX7LESS_DPP-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5774; GFX7LESS_DPP-NEXT:    s_endpgm
5775;
5776; GFX8_DPP-LABEL: sub_i64_varying:
5777; GFX8_DPP:       ; %bb.0: ; %entry
5778; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v5, exec_lo, 0
5779; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, 0
5780; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v5, exec_hi, v5
5781; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
5782; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[0:1]
5783; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
5784; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[0:1]
5785; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, 0
5786; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf
5787; GFX8_DPP-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
5788; GFX8_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5789; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
5790; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
5791; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, 0
5792; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf
5793; GFX8_DPP-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
5794; GFX8_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:2 row_mask:0xf bank_mask:0xf
5795; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
5796; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
5797; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, 0
5798; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
5799; GFX8_DPP-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
5800; GFX8_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf
5801; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
5802; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
5803; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, 0
5804; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf
5805; GFX8_DPP-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
5806; GFX8_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:8 row_mask:0xf bank_mask:0xf
5807; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
5808; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
5809; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, 0
5810; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
5811; GFX8_DPP-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
5812; GFX8_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
5813; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
5814; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
5815; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, 0
5816; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
5817; GFX8_DPP-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
5818; GFX8_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
5819; GFX8_DPP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
5820; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
5821; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, 0
5822; GFX8_DPP-NEXT:    v_readlane_b32 s3, v1, 63
5823; GFX8_DPP-NEXT:    v_readlane_b32 s2, v2, 63
5824; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
5825; GFX8_DPP-NEXT:    v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
5826; GFX8_DPP-NEXT:    s_mov_b64 exec, s[0:1]
5827; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
5828; GFX8_DPP-NEXT:    ; implicit-def: $vgpr5_vgpr6
5829; GFX8_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
5830; GFX8_DPP-NEXT:    s_cbranch_execz .LBB14_2
5831; GFX8_DPP-NEXT:  ; %bb.1:
5832; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, s3
5833; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, s2
5834; GFX8_DPP-NEXT:    s_mov_b32 m0, -1
5835; GFX8_DPP-NEXT:    ds_sub_rtn_u64 v[5:6], v7, v[5:6]
5836; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
5837; GFX8_DPP-NEXT:  .LBB14_2:
5838; GFX8_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
5839; GFX8_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
5840; GFX8_DPP-NEXT:    v_readfirstlane_b32 s4, v6
5841; GFX8_DPP-NEXT:    v_readfirstlane_b32 s5, v5
5842; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v3
5843; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v4
5844; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, s4
5845; GFX8_DPP-NEXT:    v_sub_u32_e32 v5, vcc, s5, v5
5846; GFX8_DPP-NEXT:    s_mov_b32 s3, 0xf000
5847; GFX8_DPP-NEXT:    s_mov_b32 s2, -1
5848; GFX8_DPP-NEXT:    v_subb_u32_e32 v6, vcc, v0, v6, vcc
5849; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
5850; GFX8_DPP-NEXT:    buffer_store_dwordx2 v[5:6], off, s[0:3], 0
5851; GFX8_DPP-NEXT:    s_endpgm
5852;
5853; GFX9_DPP-LABEL: sub_i64_varying:
5854; GFX9_DPP:       ; %bb.0: ; %entry
5855; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v5, exec_lo, 0
5856; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, 0
5857; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v5, exec_hi, v5
5858; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
5859; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[0:1]
5860; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
5861; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[0:1]
5862; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
5863; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf
5864; GFX9_DPP-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
5865; GFX9_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5866; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
5867; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
5868; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
5869; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf
5870; GFX9_DPP-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
5871; GFX9_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:2 row_mask:0xf bank_mask:0xf
5872; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
5873; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
5874; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
5875; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
5876; GFX9_DPP-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
5877; GFX9_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf
5878; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
5879; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
5880; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
5881; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf
5882; GFX9_DPP-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
5883; GFX9_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:8 row_mask:0xf bank_mask:0xf
5884; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
5885; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
5886; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
5887; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
5888; GFX9_DPP-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
5889; GFX9_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
5890; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
5891; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
5892; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
5893; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
5894; GFX9_DPP-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
5895; GFX9_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
5896; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
5897; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
5898; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
5899; GFX9_DPP-NEXT:    v_readlane_b32 s3, v1, 63
5900; GFX9_DPP-NEXT:    v_readlane_b32 s2, v2, 63
5901; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
5902; GFX9_DPP-NEXT:    v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
5903; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
5904; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
5905; GFX9_DPP-NEXT:    ; implicit-def: $vgpr5_vgpr6
5906; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
5907; GFX9_DPP-NEXT:    s_cbranch_execz .LBB14_2
5908; GFX9_DPP-NEXT:  ; %bb.1:
5909; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, s3
5910; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, s2
5911; GFX9_DPP-NEXT:    ds_sub_rtn_u64 v[5:6], v7, v[5:6]
5912; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
5913; GFX9_DPP-NEXT:  .LBB14_2:
5914; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
5915; GFX9_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
5916; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v6
5917; GFX9_DPP-NEXT:    v_readfirstlane_b32 s5, v5
5918; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v3
5919; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v4
5920; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s4
5921; GFX9_DPP-NEXT:    v_sub_co_u32_e32 v5, vcc, s5, v5
5922; GFX9_DPP-NEXT:    s_mov_b32 s3, 0xf000
5923; GFX9_DPP-NEXT:    s_mov_b32 s2, -1
5924; GFX9_DPP-NEXT:    v_subb_co_u32_e32 v6, vcc, v0, v6, vcc
5925; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
5926; GFX9_DPP-NEXT:    buffer_store_dwordx2 v[5:6], off, s[0:3], 0
5927; GFX9_DPP-NEXT:    s_endpgm
5928;
5929; GFX1064_DPP-LABEL: sub_i64_varying:
5930; GFX1064_DPP:       ; %bb.0: ; %entry
5931; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
5932; GFX1064_DPP-NEXT:    v_mov_b32_e32 v1, 0
5933; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[0:1]
5934; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, 0, s[0:1]
5935; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, 0
5936; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, 0
5937; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, 0
5938; GFX1064_DPP-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
5939; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, 0
5940; GFX1064_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
5941; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, 0
5942; GFX1064_DPP-NEXT:    v_add_co_u32 v1, vcc, v2, v1
5943; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc, v3, v4, vcc
5944; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, 0
5945; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf
5946; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, 0
5947; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:2 row_mask:0xf bank_mask:0xf
5948; GFX1064_DPP-NEXT:    v_add_co_u32 v1, vcc, v1, v6
5949; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc
5950; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, 0
5951; GFX1064_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:4 row_mask:0xf bank_mask:0xf
5952; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, 0
5953; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf
5954; GFX1064_DPP-NEXT:    v_add_co_u32 v1, vcc, v1, v4
5955; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc, v2, v3, vcc
5956; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, 0
5957; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf
5958; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:8 row_mask:0xf bank_mask:0xf
5959; GFX1064_DPP-NEXT:    v_add_co_u32 v1, vcc, v1, v6
5960; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc
5961; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, 0
5962; GFX1064_DPP-NEXT:    v_permlanex16_b32 v4, v1, -1, -1
5963; GFX1064_DPP-NEXT:    v_permlanex16_b32 v6, v2, -1, -1
5964; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
5965; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
5966; GFX1064_DPP-NEXT:    v_add_co_u32 v1, vcc, v1, v3
5967; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc
5968; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, 0
5969; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 31
5970; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, 0
5971; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v2, 31
5972; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, s2
5973; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, s3
5974; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
5975; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
5976; GFX1064_DPP-NEXT:    v_add_co_u32 v1, vcc, v1, v3
5977; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc
5978; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
5979; GFX1064_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5980; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
5981; GFX1064_DPP-NEXT:    v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf
5982; GFX1064_DPP-NEXT:    v_mov_b32_dpp v8, v2 row_shr:1 row_mask:0xf bank_mask:0xf
5983; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v2, 15
5984; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v1, 15
5985; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v2, 31
5986; GFX1064_DPP-NEXT:    v_readlane_b32 s7, v1, 31
5987; GFX1064_DPP-NEXT:    v_readlane_b32 s8, v1, 47
5988; GFX1064_DPP-NEXT:    v_writelane_b32 v8, s2, 16
5989; GFX1064_DPP-NEXT:    v_writelane_b32 v7, s3, 16
5990; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 63
5991; GFX1064_DPP-NEXT:    v_readlane_b32 s9, v2, 47
5992; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v2, 63
5993; GFX1064_DPP-NEXT:    v_writelane_b32 v8, s6, 32
5994; GFX1064_DPP-NEXT:    v_writelane_b32 v7, s7, 32
5995; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
5996; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v9, exec_hi, v0
5997; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, 0
5998; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
5999; GFX1064_DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
6000; GFX1064_DPP-NEXT:    v_writelane_b32 v8, s9, 48
6001; GFX1064_DPP-NEXT:    v_writelane_b32 v7, s8, 48
6002; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[6:7]
6003; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
6004; GFX1064_DPP-NEXT:    s_mov_b32 s2, -1
6005; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
6006; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[6:7], vcc
6007; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB14_2
6008; GFX1064_DPP-NEXT:  ; %bb.1:
6009; GFX1064_DPP-NEXT:    v_mov_b32_e32 v10, s1
6010; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, s0
6011; GFX1064_DPP-NEXT:    ds_sub_rtn_u64 v[9:10], v0, v[9:10]
6012; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
6013; GFX1064_DPP-NEXT:    buffer_gl0_inv
6014; GFX1064_DPP-NEXT:  .LBB14_2:
6015; GFX1064_DPP-NEXT:    s_waitcnt_depctr 0xffe3
6016; GFX1064_DPP-NEXT:    s_or_b64 exec, exec, s[6:7]
6017; GFX1064_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
6018; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s3, v9
6019; GFX1064_DPP-NEXT:    v_mov_b32_e32 v11, v7
6020; GFX1064_DPP-NEXT:    v_mov_b32_e32 v12, v8
6021; GFX1064_DPP-NEXT:    s_mov_b32 null, 0
6022; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s4, v10
6023; GFX1064_DPP-NEXT:    v_sub_co_u32 v9, vcc, s3, v11
6024; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
6025; GFX1064_DPP-NEXT:    v_sub_co_ci_u32_e32 v10, vcc, s4, v12, vcc
6026; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
6027; GFX1064_DPP-NEXT:    buffer_store_dwordx2 v[9:10], off, s[0:3], 0
6028; GFX1064_DPP-NEXT:    s_endpgm
6029;
6030; GFX1032_DPP-LABEL: sub_i64_varying:
6031; GFX1032_DPP:       ; %bb.0: ; %entry
6032; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s2, -1
6033; GFX1032_DPP-NEXT:    v_mov_b32_e32 v1, 0
6034; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s2
6035; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, 0, s2
6036; GFX1032_DPP-NEXT:    v_mov_b32_e32 v4, 0
6037; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, 0
6038; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, 0
6039; GFX1032_DPP-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
6040; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, 0
6041; GFX1032_DPP-NEXT:    v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
6042; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, 0
6043; GFX1032_DPP-NEXT:    v_add_co_u32 v1, vcc_lo, v2, v1
6044; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v3, v4, vcc_lo
6045; GFX1032_DPP-NEXT:    v_mov_b32_e32 v4, 0
6046; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf
6047; GFX1032_DPP-NEXT:    v_mov_b32_e32 v3, 0
6048; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:2 row_mask:0xf bank_mask:0xf
6049; GFX1032_DPP-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v6
6050; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo
6051; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, 0
6052; GFX1032_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:4 row_mask:0xf bank_mask:0xf
6053; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, 0
6054; GFX1032_DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf
6055; GFX1032_DPP-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v4
6056; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v3, vcc_lo
6057; GFX1032_DPP-NEXT:    v_mov_b32_e32 v3, 0
6058; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf
6059; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:8 row_mask:0xf bank_mask:0xf
6060; GFX1032_DPP-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v6
6061; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo
6062; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, 0
6063; GFX1032_DPP-NEXT:    v_permlanex16_b32 v4, v1, -1, -1
6064; GFX1032_DPP-NEXT:    v_permlanex16_b32 v6, v2, -1, -1
6065; GFX1032_DPP-NEXT:    v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6066; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6067; GFX1032_DPP-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v3
6068; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo
6069; GFX1032_DPP-NEXT:    v_readlane_b32 s3, v1, 15
6070; GFX1032_DPP-NEXT:    v_readlane_b32 s0, v1, 31
6071; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v2, 31
6072; GFX1032_DPP-NEXT:    v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6073; GFX1032_DPP-NEXT:    v_mov_b32_dpp v8, v2 row_shr:1 row_mask:0xf bank_mask:0xf
6074; GFX1032_DPP-NEXT:    v_readlane_b32 s6, v2, 15
6075; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s2
6076; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v9, exec_lo, 0
6077; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, 0
6078; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s2, -1
6079; GFX1032_DPP-NEXT:    v_writelane_b32 v8, s6, 16
6080; GFX1032_DPP-NEXT:    v_writelane_b32 v7, s3, 16
6081; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s2
6082; GFX1032_DPP-NEXT:    s_mov_b32 s2, -1
6083; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v9
6084; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
6085; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s3, vcc_lo
6086; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB14_2
6087; GFX1032_DPP-NEXT:  ; %bb.1:
6088; GFX1032_DPP-NEXT:    v_mov_b32_e32 v10, s1
6089; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, s0
6090; GFX1032_DPP-NEXT:    ds_sub_rtn_u64 v[9:10], v0, v[9:10]
6091; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
6092; GFX1032_DPP-NEXT:    buffer_gl0_inv
6093; GFX1032_DPP-NEXT:  .LBB14_2:
6094; GFX1032_DPP-NEXT:    s_waitcnt_depctr 0xffe3
6095; GFX1032_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s3
6096; GFX1032_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
6097; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s3, v9
6098; GFX1032_DPP-NEXT:    v_mov_b32_e32 v11, v7
6099; GFX1032_DPP-NEXT:    v_mov_b32_e32 v12, v8
6100; GFX1032_DPP-NEXT:    s_mov_b32 null, 0
6101; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s4, v10
6102; GFX1032_DPP-NEXT:    v_sub_co_u32 v9, vcc_lo, s3, v11
6103; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
6104; GFX1032_DPP-NEXT:    v_sub_co_ci_u32_e32 v10, vcc_lo, s4, v12, vcc_lo
6105; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
6106; GFX1032_DPP-NEXT:    buffer_store_dwordx2 v[9:10], off, s[0:3], 0
6107; GFX1032_DPP-NEXT:    s_endpgm
6108;
6109; GFX1164_DPP-LABEL: sub_i64_varying:
6110; GFX1164_DPP:       ; %bb.0: ; %entry
6111; GFX1164_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
6112; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
6113; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
6114; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[0:1]
6115; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, 0
6116; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s[0:1]
6117; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, 0
6118; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, 0
6119; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
6120; GFX1164_DPP-NEXT:    v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6121; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
6122; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
6123; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc
6124; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
6125; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, 0
6126; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
6127; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf
6128; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc
6129; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
6130; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
6131; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, 0
6132; GFX1164_DPP-NEXT:    v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf
6133; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
6134; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc
6135; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
6136; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, 0
6137; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
6138; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf
6139; GFX1164_DPP-NEXT:    v_permlanex16_b32 v5, v2, -1, -1
6140; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
6141; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc
6142; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6143; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, 0
6144; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
6145; GFX1164_DPP-NEXT:    v_permlanex16_b32 v4, v1, -1, -1
6146; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6147; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
6148; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc
6149; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, 0
6150; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v1, 31
6151; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
6152; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, s2
6153; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v2, 31
6154; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
6155; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
6156; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
6157; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc
6158; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
6159; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
6160; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6161; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
6162; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
6163; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v2, 15
6164; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6165; GFX1164_DPP-NEXT:    v_readlane_b32 s3, v1, 15
6166; GFX1164_DPP-NEXT:    v_readlane_b32 s6, v2, 31
6167; GFX1164_DPP-NEXT:    v_readlane_b32 s7, v1, 31
6168; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s2, 16
6169; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v2, 63
6170; GFX1164_DPP-NEXT:    v_writelane_b32 v6, s3, 16
6171; GFX1164_DPP-NEXT:    v_readlane_b32 s8, v2, 47
6172; GFX1164_DPP-NEXT:    v_readlane_b32 s9, v1, 47
6173; GFX1164_DPP-NEXT:    v_readlane_b32 s3, v1, 63
6174; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s6, 32
6175; GFX1164_DPP-NEXT:    v_writelane_b32 v6, s7, 32
6176; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
6177; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
6178; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v0
6179; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, 0
6180; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
6181; GFX1164_DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
6182; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s8, 48
6183; GFX1164_DPP-NEXT:    v_writelane_b32 v6, s9, 48
6184; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[6:7]
6185; GFX1164_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
6186; GFX1164_DPP-NEXT:    s_mov_b32 s2, -1
6187; GFX1164_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
6188; GFX1164_DPP-NEXT:    s_and_saveexec_b64 s[6:7], vcc
6189; GFX1164_DPP-NEXT:    s_cbranch_execz .LBB14_2
6190; GFX1164_DPP-NEXT:  ; %bb.1:
6191; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, s1
6192; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, s0
6193; GFX1164_DPP-NEXT:    ds_sub_rtn_u64 v[7:8], v0, v[7:8]
6194; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
6195; GFX1164_DPP-NEXT:    buffer_gl0_inv
6196; GFX1164_DPP-NEXT:  .LBB14_2:
6197; GFX1164_DPP-NEXT:    s_or_b64 exec, exec, s[6:7]
6198; GFX1164_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
6199; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s3, v7
6200; GFX1164_DPP-NEXT:    v_mov_b32_e32 v9, v5
6201; GFX1164_DPP-NEXT:    v_mov_b32_e32 v10, v6
6202; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s4, v8
6203; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
6204; GFX1164_DPP-NEXT:    v_sub_co_u32 v7, vcc, s3, v9
6205; GFX1164_DPP-NEXT:    s_mov_b32 s3, 0x31016000
6206; GFX1164_DPP-NEXT:    v_sub_co_ci_u32_e32 v8, vcc, s4, v10, vcc
6207; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
6208; GFX1164_DPP-NEXT:    buffer_store_b64 v[7:8], off, s[0:3], 0
6209; GFX1164_DPP-NEXT:    s_endpgm
6210;
6211; GFX1132_DPP-LABEL: sub_i64_varying:
6212; GFX1132_DPP:       ; %bb.0: ; %entry
6213; GFX1132_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
6214; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s2, -1
6215; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
6216; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s2
6217; GFX1132_DPP-NEXT:    v_mov_b32_e32 v2, 0
6218; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s2
6219; GFX1132_DPP-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v7, 0
6220; GFX1132_DPP-NEXT:    v_mov_b32_e32 v6, 0
6221; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
6222; GFX1132_DPP-NEXT:    v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6223; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
6224; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
6225; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
6226; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
6227; GFX1132_DPP-NEXT:    v_mov_b32_e32 v2, 0
6228; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
6229; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf
6230; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
6231; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
6232; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
6233; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, 0
6234; GFX1132_DPP-NEXT:    v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf
6235; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
6236; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
6237; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v2, vcc_lo, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
6238; GFX1132_DPP-NEXT:    v_mov_b32_e32 v3, 0
6239; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
6240; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf
6241; GFX1132_DPP-NEXT:    v_permlanex16_b32 v5, v2, -1, -1
6242; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
6243; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
6244; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v2, vcc_lo, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6245; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
6246; GFX1132_DPP-NEXT:    v_permlanex16_b32 v4, v1, -1, -1
6247; GFX1132_DPP-NEXT:    v_readlane_b32 s3, v2, 15
6248; GFX1132_DPP-NEXT:    v_readlane_b32 s0, v2, 31
6249; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
6250; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
6251; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6252; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
6253; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
6254; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v1, 31
6255; GFX1132_DPP-NEXT:    v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6256; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v1, 15
6257; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
6258; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v8, exec_lo, 0
6259; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
6260; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s2, -1
6261; GFX1132_DPP-NEXT:    v_writelane_b32 v6, s3, 16
6262; GFX1132_DPP-NEXT:    v_writelane_b32 v7, s6, 16
6263; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
6264; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
6265; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v8
6266; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr8_vgpr9
6267; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s3, vcc_lo
6268; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB14_2
6269; GFX1132_DPP-NEXT:  ; %bb.1:
6270; GFX1132_DPP-NEXT:    v_dual_mov_b32 v9, s1 :: v_dual_mov_b32 v8, s0
6271; GFX1132_DPP-NEXT:    ds_sub_rtn_u64 v[8:9], v0, v[8:9]
6272; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
6273; GFX1132_DPP-NEXT:    buffer_gl0_inv
6274; GFX1132_DPP-NEXT:  .LBB14_2:
6275; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s3
6276; GFX1132_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
6277; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s3, v8
6278; GFX1132_DPP-NEXT:    v_mov_b32_e32 v10, v6
6279; GFX1132_DPP-NEXT:    v_mov_b32_e32 v11, v7
6280; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s4, v9
6281; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
6282; GFX1132_DPP-NEXT:    v_sub_co_u32 v8, vcc_lo, s3, v10
6283; GFX1132_DPP-NEXT:    s_mov_b32 s3, 0x31016000
6284; GFX1132_DPP-NEXT:    v_sub_co_ci_u32_e32 v9, vcc_lo, s4, v11, vcc_lo
6285; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
6286; GFX1132_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[0:3], 0
6287; GFX1132_DPP-NEXT:    s_endpgm
6288entry:
6289  %lane = call i32 @llvm.amdgcn.workitem.id.x()
6290  %zext = zext i32 %lane to i64
6291  %old = atomicrmw sub ptr addrspace(3) @local_var64, i64 %zext acq_rel
6292  store i64 %old, ptr addrspace(1) %out
6293  ret void
6294}
6295
6296define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
6297; GFX7LESS_ITERATIVE-LABEL: and_i32_varying:
6298; GFX7LESS_ITERATIVE:       ; %bb.0: ; %entry
6299; GFX7LESS_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
6300; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s2, -1
6301; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
6302; GFX7LESS_ITERATIVE-NEXT:  .LBB15_1: ; %ComputeLoop
6303; GFX7LESS_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
6304; GFX7LESS_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
6305; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, s3
6306; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
6307; GFX7LESS_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
6308; GFX7LESS_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
6309; GFX7LESS_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
6310; GFX7LESS_ITERATIVE-NEXT:    v_cmp_ne_u64_e64 s[6:7], s[0:1], 0
6311; GFX7LESS_ITERATIVE-NEXT:    s_and_b64 vcc, exec, s[6:7]
6312; GFX7LESS_ITERATIVE-NEXT:    s_and_b32 s2, s2, s8
6313; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_vccnz .LBB15_1
6314; GFX7LESS_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
6315; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
6316; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
6317; GFX7LESS_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6318; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
6319; GFX7LESS_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
6320; GFX7LESS_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
6321; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_execz .LBB15_4
6322; GFX7LESS_ITERATIVE-NEXT:  ; %bb.3:
6323; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
6324; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
6325; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, -1
6326; GFX7LESS_ITERATIVE-NEXT:    ds_and_rtn_b32 v0, v0, v2
6327; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
6328; GFX7LESS_ITERATIVE-NEXT:  .LBB15_4:
6329; GFX7LESS_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
6330; GFX7LESS_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
6331; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
6332; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s2, -1
6333; GFX7LESS_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v0
6334; GFX7LESS_ITERATIVE-NEXT:    v_and_b32_e32 v0, s4, v1
6335; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
6336; GFX7LESS_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6337; GFX7LESS_ITERATIVE-NEXT:    s_endpgm
6338;
6339; GFX8_ITERATIVE-LABEL: and_i32_varying:
6340; GFX8_ITERATIVE:       ; %bb.0: ; %entry
6341; GFX8_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
6342; GFX8_ITERATIVE-NEXT:    s_mov_b32 s2, -1
6343; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
6344; GFX8_ITERATIVE-NEXT:  .LBB15_1: ; %ComputeLoop
6345; GFX8_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
6346; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
6347; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s3
6348; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
6349; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
6350; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
6351; GFX8_ITERATIVE-NEXT:    s_and_b32 s2, s2, s8
6352; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
6353; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
6354; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB15_1
6355; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
6356; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6357; GFX8_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6358; GFX8_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6359; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
6360; GFX8_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
6361; GFX8_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
6362; GFX8_ITERATIVE-NEXT:    s_cbranch_execz .LBB15_4
6363; GFX8_ITERATIVE-NEXT:  ; %bb.3:
6364; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
6365; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
6366; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, -1
6367; GFX8_ITERATIVE-NEXT:    ds_and_rtn_b32 v0, v0, v2
6368; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
6369; GFX8_ITERATIVE-NEXT:  .LBB15_4:
6370; GFX8_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
6371; GFX8_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
6372; GFX8_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v0
6373; GFX8_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
6374; GFX8_ITERATIVE-NEXT:    s_mov_b32 s2, -1
6375; GFX8_ITERATIVE-NEXT:    v_and_b32_e32 v0, s4, v1
6376; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
6377; GFX8_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6378; GFX8_ITERATIVE-NEXT:    s_endpgm
6379;
6380; GFX9_ITERATIVE-LABEL: and_i32_varying:
6381; GFX9_ITERATIVE:       ; %bb.0: ; %entry
6382; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
6383; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, -1
6384; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
6385; GFX9_ITERATIVE-NEXT:  .LBB15_1: ; %ComputeLoop
6386; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
6387; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
6388; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s3
6389; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
6390; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
6391; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
6392; GFX9_ITERATIVE-NEXT:    s_and_b32 s2, s2, s8
6393; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
6394; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
6395; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB15_1
6396; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
6397; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6398; GFX9_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6399; GFX9_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6400; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
6401; GFX9_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
6402; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
6403; GFX9_ITERATIVE-NEXT:    s_cbranch_execz .LBB15_4
6404; GFX9_ITERATIVE-NEXT:  ; %bb.3:
6405; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
6406; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
6407; GFX9_ITERATIVE-NEXT:    ds_and_rtn_b32 v0, v0, v2
6408; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
6409; GFX9_ITERATIVE-NEXT:  .LBB15_4:
6410; GFX9_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
6411; GFX9_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
6412; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v0
6413; GFX9_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
6414; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, -1
6415; GFX9_ITERATIVE-NEXT:    v_and_b32_e32 v0, s4, v1
6416; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
6417; GFX9_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6418; GFX9_ITERATIVE-NEXT:    s_endpgm
6419;
6420; GFX1064_ITERATIVE-LABEL: and_i32_varying:
6421; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
6422; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
6423; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, -1
6424; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
6425; GFX1064_ITERATIVE-NEXT:  .LBB15_1: ; %ComputeLoop
6426; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
6427; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
6428; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
6429; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
6430; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, s3
6431; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
6432; GFX1064_ITERATIVE-NEXT:    s_and_b32 s2, s2, s8
6433; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
6434; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB15_1
6435; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
6436; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6437; GFX1064_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6438; GFX1064_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6439; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
6440; GFX1064_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
6441; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
6442; GFX1064_ITERATIVE-NEXT:    s_cbranch_execz .LBB15_4
6443; GFX1064_ITERATIVE-NEXT:  ; %bb.3:
6444; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
6445; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
6446; GFX1064_ITERATIVE-NEXT:    ds_and_rtn_b32 v0, v0, v2
6447; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
6448; GFX1064_ITERATIVE-NEXT:    buffer_gl0_inv
6449; GFX1064_ITERATIVE-NEXT:  .LBB15_4:
6450; GFX1064_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
6451; GFX1064_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
6452; GFX1064_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
6453; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v0
6454; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
6455; GFX1064_ITERATIVE-NEXT:    v_and_b32_e32 v0, s2, v1
6456; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, -1
6457; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
6458; GFX1064_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6459; GFX1064_ITERATIVE-NEXT:    s_endpgm
6460;
6461; GFX1032_ITERATIVE-LABEL: and_i32_varying:
6462; GFX1032_ITERATIVE:       ; %bb.0: ; %entry
6463; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s1, exec_lo
6464; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s0, -1
6465; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
6466; GFX1032_ITERATIVE-NEXT:  .LBB15_1: ; %ComputeLoop
6467; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
6468; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s2, s1
6469; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
6470; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
6471; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s2
6472; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s6
6473; GFX1032_ITERATIVE-NEXT:    s_and_b32 s0, s0, s3
6474; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
6475; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB15_1
6476; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
6477; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6478; GFX1032_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
6479; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
6480; GFX1032_ITERATIVE-NEXT:    s_and_saveexec_b32 s1, vcc_lo
6481; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s1, exec_lo, s1
6482; GFX1032_ITERATIVE-NEXT:    s_cbranch_execz .LBB15_4
6483; GFX1032_ITERATIVE-NEXT:  ; %bb.3:
6484; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
6485; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s0
6486; GFX1032_ITERATIVE-NEXT:    ds_and_rtn_b32 v0, v0, v2
6487; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
6488; GFX1032_ITERATIVE-NEXT:    buffer_gl0_inv
6489; GFX1032_ITERATIVE-NEXT:  .LBB15_4:
6490; GFX1032_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
6491; GFX1032_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s1
6492; GFX1032_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
6493; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v0
6494; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
6495; GFX1032_ITERATIVE-NEXT:    v_and_b32_e32 v0, s2, v1
6496; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, -1
6497; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
6498; GFX1032_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6499; GFX1032_ITERATIVE-NEXT:    s_endpgm
6500;
6501; GFX1164_ITERATIVE-LABEL: and_i32_varying:
6502; GFX1164_ITERATIVE:       ; %bb.0: ; %entry
6503; GFX1164_ITERATIVE-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
6504; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
6505; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
6506; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
6507; GFX1164_ITERATIVE-NEXT:  .LBB15_1: ; %ComputeLoop
6508; GFX1164_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
6509; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s3, s[0:1]
6510; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
6511; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s8, v1, s3
6512; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
6513; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s2, s3
6514; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
6515; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2)
6516; GFX1164_ITERATIVE-NEXT:    s_and_b32 s2, s2, s8
6517; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
6518; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB15_1
6519; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
6520; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
6521; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6522; GFX1164_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
6523; GFX1164_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
6524; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
6525; GFX1164_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
6526; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
6527; GFX1164_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
6528; GFX1164_ITERATIVE-NEXT:    s_cbranch_execz .LBB15_4
6529; GFX1164_ITERATIVE-NEXT:  ; %bb.3:
6530; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v1, 0
6531; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
6532; GFX1164_ITERATIVE-NEXT:    ds_and_rtn_b32 v1, v1, v2
6533; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
6534; GFX1164_ITERATIVE-NEXT:    buffer_gl0_inv
6535; GFX1164_ITERATIVE-NEXT:  .LBB15_4:
6536; GFX1164_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
6537; GFX1164_ITERATIVE-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
6538; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v1
6539; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
6540; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6541; GFX1164_ITERATIVE-NEXT:    v_and_b32_e32 v0, s2, v0
6542; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
6543; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
6544; GFX1164_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
6545; GFX1164_ITERATIVE-NEXT:    s_endpgm
6546;
6547; GFX1132_ITERATIVE-LABEL: and_i32_varying:
6548; GFX1132_ITERATIVE:       ; %bb.0: ; %entry
6549; GFX1132_ITERATIVE-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
6550; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s1, exec_lo
6551; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s0, -1
6552; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
6553; GFX1132_ITERATIVE-NEXT:  .LBB15_1: ; %ComputeLoop
6554; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
6555; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s2, s1
6556; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
6557; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s3, v1, s2
6558; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
6559; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s2
6560; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s6
6561; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2)
6562; GFX1132_ITERATIVE-NEXT:    s_and_b32 s0, s0, s3
6563; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
6564; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB15_1
6565; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
6566; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
6567; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
6568; GFX1132_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
6569; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
6570; GFX1132_ITERATIVE-NEXT:    s_and_saveexec_b32 s1, vcc_lo
6571; GFX1132_ITERATIVE-NEXT:    s_xor_b32 s1, exec_lo, s1
6572; GFX1132_ITERATIVE-NEXT:    s_cbranch_execz .LBB15_4
6573; GFX1132_ITERATIVE-NEXT:  ; %bb.3:
6574; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
6575; GFX1132_ITERATIVE-NEXT:    ds_and_rtn_b32 v1, v1, v2
6576; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
6577; GFX1132_ITERATIVE-NEXT:    buffer_gl0_inv
6578; GFX1132_ITERATIVE-NEXT:  .LBB15_4:
6579; GFX1132_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s1
6580; GFX1132_ITERATIVE-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
6581; GFX1132_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v1
6582; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
6583; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6584; GFX1132_ITERATIVE-NEXT:    v_and_b32_e32 v0, s2, v0
6585; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
6586; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
6587; GFX1132_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
6588; GFX1132_ITERATIVE-NEXT:    s_endpgm
6589;
6590; GFX7LESS_DPP-LABEL: and_i32_varying:
6591; GFX7LESS_DPP:       ; %bb.0: ; %entry
6592; GFX7LESS_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
6593; GFX7LESS_DPP-NEXT:    v_mov_b32_e32 v1, 0
6594; GFX7LESS_DPP-NEXT:    s_mov_b32 m0, -1
6595; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
6596; GFX7LESS_DPP-NEXT:    ds_and_rtn_b32 v0, v1, v0
6597; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
6598; GFX7LESS_DPP-NEXT:    s_mov_b32 s3, 0xf000
6599; GFX7LESS_DPP-NEXT:    s_mov_b32 s2, -1
6600; GFX7LESS_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6601; GFX7LESS_DPP-NEXT:    s_endpgm
6602;
6603; GFX8_DPP-LABEL: and_i32_varying:
6604; GFX8_DPP:       ; %bb.0: ; %entry
6605; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
6606; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
6607; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
6608; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v2, -1, v0, s[0:1]
6609; GFX8_DPP-NEXT:    v_mov_b32_e32 v1, -1
6610; GFX8_DPP-NEXT:    s_nop 0
6611; GFX8_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
6612; GFX8_DPP-NEXT:    s_nop 1
6613; GFX8_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
6614; GFX8_DPP-NEXT:    s_nop 1
6615; GFX8_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
6616; GFX8_DPP-NEXT:    s_nop 1
6617; GFX8_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
6618; GFX8_DPP-NEXT:    s_nop 1
6619; GFX8_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
6620; GFX8_DPP-NEXT:    s_nop 1
6621; GFX8_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
6622; GFX8_DPP-NEXT:    v_readlane_b32 s2, v2, 63
6623; GFX8_DPP-NEXT:    s_nop 0
6624; GFX8_DPP-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
6625; GFX8_DPP-NEXT:    s_mov_b64 exec, s[0:1]
6626; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
6627; GFX8_DPP-NEXT:    ; implicit-def: $vgpr0
6628; GFX8_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
6629; GFX8_DPP-NEXT:    s_cbranch_execz .LBB15_2
6630; GFX8_DPP-NEXT:  ; %bb.1:
6631; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, 0
6632; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, s2
6633; GFX8_DPP-NEXT:    s_mov_b32 m0, -1
6634; GFX8_DPP-NEXT:    ds_and_rtn_b32 v0, v0, v3
6635; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
6636; GFX8_DPP-NEXT:  .LBB15_2:
6637; GFX8_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
6638; GFX8_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
6639; GFX8_DPP-NEXT:    v_readfirstlane_b32 s4, v0
6640; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, v1
6641; GFX8_DPP-NEXT:    s_mov_b32 s3, 0xf000
6642; GFX8_DPP-NEXT:    s_mov_b32 s2, -1
6643; GFX8_DPP-NEXT:    v_and_b32_e32 v0, s4, v0
6644; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
6645; GFX8_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6646; GFX8_DPP-NEXT:    s_endpgm
6647;
6648; GFX9_DPP-LABEL: and_i32_varying:
6649; GFX9_DPP:       ; %bb.0: ; %entry
6650; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
6651; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
6652; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
6653; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v2, -1, v0, s[0:1]
6654; GFX9_DPP-NEXT:    v_mov_b32_e32 v1, -1
6655; GFX9_DPP-NEXT:    s_nop 0
6656; GFX9_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
6657; GFX9_DPP-NEXT:    s_nop 1
6658; GFX9_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
6659; GFX9_DPP-NEXT:    s_nop 1
6660; GFX9_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
6661; GFX9_DPP-NEXT:    s_nop 1
6662; GFX9_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
6663; GFX9_DPP-NEXT:    s_nop 1
6664; GFX9_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
6665; GFX9_DPP-NEXT:    s_nop 1
6666; GFX9_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
6667; GFX9_DPP-NEXT:    v_readlane_b32 s2, v2, 63
6668; GFX9_DPP-NEXT:    s_nop 0
6669; GFX9_DPP-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
6670; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
6671; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
6672; GFX9_DPP-NEXT:    ; implicit-def: $vgpr0
6673; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
6674; GFX9_DPP-NEXT:    s_cbranch_execz .LBB15_2
6675; GFX9_DPP-NEXT:  ; %bb.1:
6676; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, 0
6677; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, s2
6678; GFX9_DPP-NEXT:    ds_and_rtn_b32 v0, v0, v3
6679; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
6680; GFX9_DPP-NEXT:  .LBB15_2:
6681; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
6682; GFX9_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
6683; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v0
6684; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, v1
6685; GFX9_DPP-NEXT:    s_mov_b32 s3, 0xf000
6686; GFX9_DPP-NEXT:    s_mov_b32 s2, -1
6687; GFX9_DPP-NEXT:    v_and_b32_e32 v0, s4, v0
6688; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
6689; GFX9_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6690; GFX9_DPP-NEXT:    s_endpgm
6691;
6692; GFX1064_DPP-LABEL: and_i32_varying:
6693; GFX1064_DPP:       ; %bb.0: ; %entry
6694; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
6695; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[0:1]
6696; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, -1
6697; GFX1064_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6698; GFX1064_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
6699; GFX1064_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
6700; GFX1064_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
6701; GFX1064_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
6702; GFX1064_DPP-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6703; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 31
6704; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, s2
6705; GFX1064_DPP-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
6706; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6707; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 15
6708; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v1, 31
6709; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s2, 16
6710; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
6711; GFX1064_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6712; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
6713; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 47
6714; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v1, 63
6715; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s3, 32
6716; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
6717; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6718; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
6719; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s2, 48
6720; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
6721; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6722; GFX1064_DPP-NEXT:    s_mov_b32 s2, -1
6723; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr0
6724; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
6725; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB15_2
6726; GFX1064_DPP-NEXT:  ; %bb.1:
6727; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, 0
6728; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, s6
6729; GFX1064_DPP-NEXT:    s_mov_b32 s3, s6
6730; GFX1064_DPP-NEXT:    ds_and_rtn_b32 v0, v0, v4
6731; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
6732; GFX1064_DPP-NEXT:    buffer_gl0_inv
6733; GFX1064_DPP-NEXT:  .LBB15_2:
6734; GFX1064_DPP-NEXT:    s_waitcnt_depctr 0xffe3
6735; GFX1064_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
6736; GFX1064_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
6737; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s3, v0
6738; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, v3
6739; GFX1064_DPP-NEXT:    v_and_b32_e32 v0, s3, v0
6740; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
6741; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
6742; GFX1064_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6743; GFX1064_DPP-NEXT:    s_endpgm
6744;
6745; GFX1032_DPP-LABEL: and_i32_varying:
6746; GFX1032_DPP:       ; %bb.0: ; %entry
6747; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
6748; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s0
6749; GFX1032_DPP-NEXT:    v_mov_b32_e32 v3, -1
6750; GFX1032_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6751; GFX1032_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
6752; GFX1032_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
6753; GFX1032_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
6754; GFX1032_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
6755; GFX1032_DPP-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6756; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v1, 15
6757; GFX1032_DPP-NEXT:    v_readlane_b32 s2, v1, 31
6758; GFX1032_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6759; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
6760; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6761; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
6762; GFX1032_DPP-NEXT:    v_writelane_b32 v3, s1, 16
6763; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
6764; GFX1032_DPP-NEXT:    s_mov_b32 s0, s2
6765; GFX1032_DPP-NEXT:    s_mov_b32 s2, -1
6766; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
6767; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr0
6768; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
6769; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB15_2
6770; GFX1032_DPP-NEXT:  ; %bb.1:
6771; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, 0
6772; GFX1032_DPP-NEXT:    v_mov_b32_e32 v4, s0
6773; GFX1032_DPP-NEXT:    ds_and_rtn_b32 v0, v0, v4
6774; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
6775; GFX1032_DPP-NEXT:    buffer_gl0_inv
6776; GFX1032_DPP-NEXT:  .LBB15_2:
6777; GFX1032_DPP-NEXT:    s_waitcnt_depctr 0xffe3
6778; GFX1032_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s1
6779; GFX1032_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
6780; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s3, v0
6781; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, v3
6782; GFX1032_DPP-NEXT:    v_and_b32_e32 v0, s3, v0
6783; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
6784; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
6785; GFX1032_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6786; GFX1032_DPP-NEXT:    s_endpgm
6787;
6788; GFX1164_DPP-LABEL: and_i32_varying:
6789; GFX1164_DPP:       ; %bb.0: ; %entry
6790; GFX1164_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
6791; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
6792; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
6793; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[0:1]
6794; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, -1
6795; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
6796; GFX1164_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6797; GFX1164_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
6798; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6799; GFX1164_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
6800; GFX1164_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
6801; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6802; GFX1164_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
6803; GFX1164_DPP-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6804; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6805; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v1, 31
6806; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, s2
6807; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6808; GFX1164_DPP-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
6809; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6810; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v1, 15
6811; GFX1164_DPP-NEXT:    v_readlane_b32 s3, v1, 31
6812; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
6813; GFX1164_DPP-NEXT:    v_writelane_b32 v3, s2, 16
6814; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
6815; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6816; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
6817; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v1, 47
6818; GFX1164_DPP-NEXT:    v_readlane_b32 s6, v1, 63
6819; GFX1164_DPP-NEXT:    v_writelane_b32 v3, s3, 32
6820; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
6821; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
6822; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6823; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
6824; GFX1164_DPP-NEXT:    v_writelane_b32 v3, s2, 48
6825; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
6826; GFX1164_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6827; GFX1164_DPP-NEXT:    s_mov_b32 s2, -1
6828; GFX1164_DPP-NEXT:    ; implicit-def: $vgpr0
6829; GFX1164_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
6830; GFX1164_DPP-NEXT:    s_cbranch_execz .LBB15_2
6831; GFX1164_DPP-NEXT:  ; %bb.1:
6832; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, 0
6833; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, s6
6834; GFX1164_DPP-NEXT:    s_mov_b32 s3, s6
6835; GFX1164_DPP-NEXT:    ds_and_rtn_b32 v0, v0, v4
6836; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
6837; GFX1164_DPP-NEXT:    buffer_gl0_inv
6838; GFX1164_DPP-NEXT:  .LBB15_2:
6839; GFX1164_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
6840; GFX1164_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
6841; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s3, v0
6842; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, v3
6843; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6844; GFX1164_DPP-NEXT:    v_and_b32_e32 v0, s3, v0
6845; GFX1164_DPP-NEXT:    s_mov_b32 s3, 0x31016000
6846; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
6847; GFX1164_DPP-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
6848; GFX1164_DPP-NEXT:    s_endpgm
6849;
6850; GFX1132_DPP-LABEL: and_i32_varying:
6851; GFX1132_DPP:       ; %bb.0: ; %entry
6852; GFX1132_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
6853; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
6854; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
6855; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s0
6856; GFX1132_DPP-NEXT:    v_mov_b32_e32 v3, -1
6857; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
6858; GFX1132_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6859; GFX1132_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
6860; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6861; GFX1132_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
6862; GFX1132_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
6863; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6864; GFX1132_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
6865; GFX1132_DPP-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
6866; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
6867; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v1, 15
6868; GFX1132_DPP-NEXT:    v_readlane_b32 s2, v1, 31
6869; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
6870; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
6871; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6872; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
6873; GFX1132_DPP-NEXT:    v_writelane_b32 v3, s1, 16
6874; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
6875; GFX1132_DPP-NEXT:    s_mov_b32 s0, s2
6876; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
6877; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
6878; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr0
6879; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
6880; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB15_2
6881; GFX1132_DPP-NEXT:  ; %bb.1:
6882; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
6883; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, s0
6884; GFX1132_DPP-NEXT:    ds_and_rtn_b32 v0, v0, v4
6885; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
6886; GFX1132_DPP-NEXT:    buffer_gl0_inv
6887; GFX1132_DPP-NEXT:  .LBB15_2:
6888; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s1
6889; GFX1132_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
6890; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s3, v0
6891; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, v3
6892; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6893; GFX1132_DPP-NEXT:    v_and_b32_e32 v0, s3, v0
6894; GFX1132_DPP-NEXT:    s_mov_b32 s3, 0x31016000
6895; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
6896; GFX1132_DPP-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
6897; GFX1132_DPP-NEXT:    s_endpgm
6898entry:
6899  %lane = call i32 @llvm.amdgcn.workitem.id.x()
6900  %old = atomicrmw and ptr addrspace(3) @local_var32, i32 %lane acq_rel
6901  store i32 %old, ptr addrspace(1) %out
6902  ret void
6903}
6904
6905define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
6906; GFX7LESS_ITERATIVE-LABEL: and_i64_varying:
6907; GFX7LESS_ITERATIVE:       ; %bb.0: ; %entry
6908; GFX7LESS_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
6909; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
6910; GFX7LESS_ITERATIVE-NEXT:    s_mov_b64 s[0:1], -1
6911; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
6912; GFX7LESS_ITERATIVE-NEXT:  .LBB16_1: ; %ComputeLoop
6913; GFX7LESS_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
6914; GFX7LESS_ITERATIVE-NEXT:    s_ff1_i32_b64 s8, s[2:3]
6915; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, s8
6916; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s8
6917; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s8
6918; GFX7LESS_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
6919; GFX7LESS_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
6920; GFX7LESS_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s8
6921; GFX7LESS_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[8:9]
6922; GFX7LESS_ITERATIVE-NEXT:    v_cmp_ne_u64_e64 s[8:9], s[2:3], 0
6923; GFX7LESS_ITERATIVE-NEXT:    s_and_b64 vcc, exec, s[8:9]
6924; GFX7LESS_ITERATIVE-NEXT:    s_and_b64 s[0:1], s[0:1], s[6:7]
6925; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_vccnz .LBB16_1
6926; GFX7LESS_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
6927; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
6928; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
6929; GFX7LESS_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6930; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
6931; GFX7LESS_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6932; GFX7LESS_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
6933; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_execz .LBB16_4
6934; GFX7LESS_ITERATIVE-NEXT:  ; %bb.3:
6935; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
6936; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
6937; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
6938; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, -1
6939; GFX7LESS_ITERATIVE-NEXT:    ds_and_rtn_b64 v[3:4], v0, v[3:4]
6940; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
6941; GFX7LESS_ITERATIVE-NEXT:  .LBB16_4:
6942; GFX7LESS_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
6943; GFX7LESS_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
6944; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
6945; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s2, -1
6946; GFX7LESS_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v4
6947; GFX7LESS_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v3
6948; GFX7LESS_ITERATIVE-NEXT:    v_and_b32_e32 v2, s4, v2
6949; GFX7LESS_ITERATIVE-NEXT:    v_and_b32_e32 v1, s5, v1
6950; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
6951; GFX7LESS_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
6952; GFX7LESS_ITERATIVE-NEXT:    s_endpgm
6953;
6954; GFX8_ITERATIVE-LABEL: and_i64_varying:
6955; GFX8_ITERATIVE:       ; %bb.0: ; %entry
6956; GFX8_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
6957; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
6958; GFX8_ITERATIVE-NEXT:    s_mov_b64 s[0:1], -1
6959; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
6960; GFX8_ITERATIVE-NEXT:  .LBB16_1: ; %ComputeLoop
6961; GFX8_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
6962; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s8, s[2:3]
6963; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s8
6964; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s8
6965; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s8
6966; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s8
6967; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
6968; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
6969; GFX8_ITERATIVE-NEXT:    s_and_b64 s[0:1], s[0:1], s[6:7]
6970; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[8:9]
6971; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
6972; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB16_1
6973; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
6974; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
6975; GFX8_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
6976; GFX8_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
6977; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
6978; GFX8_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
6979; GFX8_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
6980; GFX8_ITERATIVE-NEXT:    s_cbranch_execz .LBB16_4
6981; GFX8_ITERATIVE-NEXT:  ; %bb.3:
6982; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
6983; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
6984; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
6985; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, -1
6986; GFX8_ITERATIVE-NEXT:    ds_and_rtn_b64 v[3:4], v0, v[3:4]
6987; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
6988; GFX8_ITERATIVE-NEXT:  .LBB16_4:
6989; GFX8_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
6990; GFX8_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
6991; GFX8_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v4
6992; GFX8_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v3
6993; GFX8_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
6994; GFX8_ITERATIVE-NEXT:    s_mov_b32 s2, -1
6995; GFX8_ITERATIVE-NEXT:    v_and_b32_e32 v2, s4, v2
6996; GFX8_ITERATIVE-NEXT:    v_and_b32_e32 v1, s5, v1
6997; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
6998; GFX8_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
6999; GFX8_ITERATIVE-NEXT:    s_endpgm
7000;
7001; GFX9_ITERATIVE-LABEL: and_i64_varying:
7002; GFX9_ITERATIVE:       ; %bb.0: ; %entry
7003; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
7004; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
7005; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[0:1], -1
7006; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
7007; GFX9_ITERATIVE-NEXT:  .LBB16_1: ; %ComputeLoop
7008; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
7009; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s8, s[2:3]
7010; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s8
7011; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s8
7012; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s8
7013; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s8
7014; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
7015; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
7016; GFX9_ITERATIVE-NEXT:    s_and_b64 s[0:1], s[0:1], s[6:7]
7017; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[8:9]
7018; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
7019; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB16_1
7020; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
7021; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
7022; GFX9_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
7023; GFX9_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
7024; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
7025; GFX9_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
7026; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
7027; GFX9_ITERATIVE-NEXT:    s_cbranch_execz .LBB16_4
7028; GFX9_ITERATIVE-NEXT:  ; %bb.3:
7029; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
7030; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
7031; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
7032; GFX9_ITERATIVE-NEXT:    ds_and_rtn_b64 v[3:4], v0, v[3:4]
7033; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
7034; GFX9_ITERATIVE-NEXT:  .LBB16_4:
7035; GFX9_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
7036; GFX9_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
7037; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v4
7038; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v3
7039; GFX9_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
7040; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, -1
7041; GFX9_ITERATIVE-NEXT:    v_and_b32_e32 v2, s4, v2
7042; GFX9_ITERATIVE-NEXT:    v_and_b32_e32 v1, s5, v1
7043; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
7044; GFX9_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
7045; GFX9_ITERATIVE-NEXT:    s_endpgm
7046;
7047; GFX1064_ITERATIVE-LABEL: and_i64_varying:
7048; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
7049; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
7050; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
7051; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], -1
7052; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
7053; GFX1064_ITERATIVE-NEXT:  .LBB16_1: ; %ComputeLoop
7054; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
7055; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s10, s[2:3]
7056; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s10
7057; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s10
7058; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s10
7059; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s10
7060; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s10
7061; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[8:9]
7062; GFX1064_ITERATIVE-NEXT:    s_and_b64 s[0:1], s[0:1], s[6:7]
7063; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
7064; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB16_1
7065; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
7066; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
7067; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
7068; GFX1064_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
7069; GFX1064_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
7070; GFX1064_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
7071; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
7072; GFX1064_ITERATIVE-NEXT:    s_cbranch_execz .LBB16_4
7073; GFX1064_ITERATIVE-NEXT:  ; %bb.3:
7074; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
7075; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
7076; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
7077; GFX1064_ITERATIVE-NEXT:    ds_and_rtn_b64 v[3:4], v0, v[3:4]
7078; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
7079; GFX1064_ITERATIVE-NEXT:    buffer_gl0_inv
7080; GFX1064_ITERATIVE-NEXT:  .LBB16_4:
7081; GFX1064_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
7082; GFX1064_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
7083; GFX1064_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
7084; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v4
7085; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v3
7086; GFX1064_ITERATIVE-NEXT:    v_and_b32_e32 v2, s2, v2
7087; GFX1064_ITERATIVE-NEXT:    v_and_b32_e32 v1, s3, v1
7088; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
7089; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, -1
7090; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
7091; GFX1064_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
7092; GFX1064_ITERATIVE-NEXT:    s_endpgm
7093;
7094; GFX1032_ITERATIVE-LABEL: and_i64_varying:
7095; GFX1032_ITERATIVE:       ; %bb.0: ; %entry
7096; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
7097; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, exec_lo
7098; GFX1032_ITERATIVE-NEXT:    s_mov_b64 s[0:1], -1
7099; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
7100; GFX1032_ITERATIVE-NEXT:  .LBB16_1: ; %ComputeLoop
7101; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
7102; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s3, s2
7103; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
7104; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
7105; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s8, 1, s3
7106; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s3
7107; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s3
7108; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s2, s2, s8
7109; GFX1032_ITERATIVE-NEXT:    s_and_b64 s[0:1], s[0:1], s[6:7]
7110; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
7111; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB16_1
7112; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
7113; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
7114; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
7115; GFX1032_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
7116; GFX1032_ITERATIVE-NEXT:    s_and_saveexec_b32 s2, vcc_lo
7117; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s2, exec_lo, s2
7118; GFX1032_ITERATIVE-NEXT:    s_cbranch_execz .LBB16_4
7119; GFX1032_ITERATIVE-NEXT:  ; %bb.3:
7120; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
7121; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
7122; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
7123; GFX1032_ITERATIVE-NEXT:    ds_and_rtn_b64 v[3:4], v0, v[3:4]
7124; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
7125; GFX1032_ITERATIVE-NEXT:    buffer_gl0_inv
7126; GFX1032_ITERATIVE-NEXT:  .LBB16_4:
7127; GFX1032_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
7128; GFX1032_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s2
7129; GFX1032_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
7130; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v4
7131; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v3
7132; GFX1032_ITERATIVE-NEXT:    v_and_b32_e32 v2, s2, v2
7133; GFX1032_ITERATIVE-NEXT:    v_and_b32_e32 v1, s3, v1
7134; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
7135; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, -1
7136; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
7137; GFX1032_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
7138; GFX1032_ITERATIVE-NEXT:    s_endpgm
7139;
7140; GFX1164_ITERATIVE-LABEL: and_i64_varying:
7141; GFX1164_ITERATIVE:       ; %bb.0: ; %entry
7142; GFX1164_ITERATIVE-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
7143; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
7144; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
7145; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[0:1], -1
7146; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
7147; GFX1164_ITERATIVE-NEXT:  .LBB16_1: ; %ComputeLoop
7148; GFX1164_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
7149; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s10, s[2:3]
7150; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
7151; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s10
7152; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s10
7153; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s10
7154; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s10
7155; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s10
7156; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[8:9]
7157; GFX1164_ITERATIVE-NEXT:    s_and_b64 s[0:1], s[0:1], s[6:7]
7158; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
7159; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB16_1
7160; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
7161; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
7162; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7163; GFX1164_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
7164; GFX1164_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
7165; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr2_vgpr3
7166; GFX1164_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
7167; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
7168; GFX1164_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
7169; GFX1164_ITERATIVE-NEXT:    s_cbranch_execz .LBB16_4
7170; GFX1164_ITERATIVE-NEXT:  ; %bb.3:
7171; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s1
7172; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v4, 0
7173; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s0
7174; GFX1164_ITERATIVE-NEXT:    ds_and_rtn_b64 v[2:3], v4, v[2:3]
7175; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
7176; GFX1164_ITERATIVE-NEXT:    buffer_gl0_inv
7177; GFX1164_ITERATIVE-NEXT:  .LBB16_4:
7178; GFX1164_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
7179; GFX1164_ITERATIVE-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
7180; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
7181; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v2
7182; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
7183; GFX1164_ITERATIVE-NEXT:    v_and_b32_e32 v1, s2, v1
7184; GFX1164_ITERATIVE-NEXT:    v_and_b32_e32 v0, s3, v0
7185; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
7186; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
7187; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
7188; GFX1164_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
7189; GFX1164_ITERATIVE-NEXT:    s_endpgm
7190;
7191; GFX1132_ITERATIVE-LABEL: and_i64_varying:
7192; GFX1132_ITERATIVE:       ; %bb.0: ; %entry
7193; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0
7194; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, exec_lo
7195; GFX1132_ITERATIVE-NEXT:    s_mov_b64 s[0:1], -1
7196; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
7197; GFX1132_ITERATIVE-NEXT:  .LBB16_1: ; %ComputeLoop
7198; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
7199; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s3, s2
7200; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
7201; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
7202; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
7203; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s8, 1, s3
7204; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s3
7205; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
7206; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s2, s2, s8
7207; GFX1132_ITERATIVE-NEXT:    s_and_b64 s[0:1], s[0:1], s[6:7]
7208; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
7209; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB16_1
7210; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
7211; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
7212; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
7213; GFX1132_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
7214; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr2_vgpr3
7215; GFX1132_ITERATIVE-NEXT:    s_and_saveexec_b32 s2, vcc_lo
7216; GFX1132_ITERATIVE-NEXT:    s_xor_b32 s2, exec_lo, s2
7217; GFX1132_ITERATIVE-NEXT:    s_cbranch_execz .LBB16_4
7218; GFX1132_ITERATIVE-NEXT:  ; %bb.3:
7219; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1
7220; GFX1132_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s0
7221; GFX1132_ITERATIVE-NEXT:    ds_and_rtn_b64 v[2:3], v4, v[2:3]
7222; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
7223; GFX1132_ITERATIVE-NEXT:    buffer_gl0_inv
7224; GFX1132_ITERATIVE-NEXT:  .LBB16_4:
7225; GFX1132_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s2
7226; GFX1132_ITERATIVE-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
7227; GFX1132_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
7228; GFX1132_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v2
7229; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
7230; GFX1132_ITERATIVE-NEXT:    v_and_b32_e32 v1, s2, v1
7231; GFX1132_ITERATIVE-NEXT:    v_and_b32_e32 v0, s3, v0
7232; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
7233; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
7234; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
7235; GFX1132_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
7236; GFX1132_ITERATIVE-NEXT:    s_endpgm
7237;
7238; GFX7LESS_DPP-LABEL: and_i64_varying:
7239; GFX7LESS_DPP:       ; %bb.0: ; %entry
7240; GFX7LESS_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
7241; GFX7LESS_DPP-NEXT:    v_mov_b32_e32 v1, 0
7242; GFX7LESS_DPP-NEXT:    s_mov_b32 m0, -1
7243; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
7244; GFX7LESS_DPP-NEXT:    ds_and_rtn_b64 v[0:1], v1, v[0:1]
7245; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
7246; GFX7LESS_DPP-NEXT:    s_mov_b32 s3, 0xf000
7247; GFX7LESS_DPP-NEXT:    s_mov_b32 s2, -1
7248; GFX7LESS_DPP-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
7249; GFX7LESS_DPP-NEXT:    s_endpgm
7250;
7251; GFX8_DPP-LABEL: and_i64_varying:
7252; GFX8_DPP:       ; %bb.0: ; %entry
7253; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v5, exec_lo, 0
7254; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, 0
7255; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v5, exec_hi, v5
7256; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
7257; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v3, -1, 0, s[0:1]
7258; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v4, -1, v0, s[0:1]
7259; GFX8_DPP-NEXT:    v_mov_b32_e32 v1, -1
7260; GFX8_DPP-NEXT:    v_and_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
7261; GFX8_DPP-NEXT:    v_and_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf
7262; GFX8_DPP-NEXT:    v_mov_b32_e32 v2, -1
7263; GFX8_DPP-NEXT:    v_and_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
7264; GFX8_DPP-NEXT:    v_and_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf
7265; GFX8_DPP-NEXT:    s_nop 0
7266; GFX8_DPP-NEXT:    v_and_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
7267; GFX8_DPP-NEXT:    v_and_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf
7268; GFX8_DPP-NEXT:    s_nop 0
7269; GFX8_DPP-NEXT:    v_and_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
7270; GFX8_DPP-NEXT:    v_and_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf
7271; GFX8_DPP-NEXT:    s_nop 0
7272; GFX8_DPP-NEXT:    v_and_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
7273; GFX8_DPP-NEXT:    v_and_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
7274; GFX8_DPP-NEXT:    s_nop 0
7275; GFX8_DPP-NEXT:    v_and_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
7276; GFX8_DPP-NEXT:    v_and_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
7277; GFX8_DPP-NEXT:    v_readlane_b32 s3, v3, 63
7278; GFX8_DPP-NEXT:    v_readlane_b32 s2, v4, 63
7279; GFX8_DPP-NEXT:    v_mov_b32_dpp v2, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
7280; GFX8_DPP-NEXT:    v_mov_b32_dpp v1, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
7281; GFX8_DPP-NEXT:    s_mov_b64 exec, s[0:1]
7282; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
7283; GFX8_DPP-NEXT:    ; implicit-def: $vgpr5_vgpr6
7284; GFX8_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
7285; GFX8_DPP-NEXT:    s_cbranch_execz .LBB16_2
7286; GFX8_DPP-NEXT:  ; %bb.1:
7287; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, s3
7288; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, s2
7289; GFX8_DPP-NEXT:    s_mov_b32 m0, -1
7290; GFX8_DPP-NEXT:    ds_and_rtn_b64 v[5:6], v7, v[5:6]
7291; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
7292; GFX8_DPP-NEXT:  .LBB16_2:
7293; GFX8_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
7294; GFX8_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
7295; GFX8_DPP-NEXT:    v_readfirstlane_b32 s4, v6
7296; GFX8_DPP-NEXT:    v_readfirstlane_b32 s5, v5
7297; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
7298; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
7299; GFX8_DPP-NEXT:    s_mov_b32 s3, 0xf000
7300; GFX8_DPP-NEXT:    s_mov_b32 s2, -1
7301; GFX8_DPP-NEXT:    v_and_b32_e32 v6, s4, v6
7302; GFX8_DPP-NEXT:    v_and_b32_e32 v5, s5, v5
7303; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
7304; GFX8_DPP-NEXT:    buffer_store_dwordx2 v[5:6], off, s[0:3], 0
7305; GFX8_DPP-NEXT:    s_endpgm
7306;
7307; GFX9_DPP-LABEL: and_i64_varying:
7308; GFX9_DPP:       ; %bb.0: ; %entry
7309; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v5, exec_lo, 0
7310; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, 0
7311; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v5, exec_hi, v5
7312; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
7313; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v3, -1, 0, s[0:1]
7314; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v4, -1, v0, s[0:1]
7315; GFX9_DPP-NEXT:    v_mov_b32_e32 v1, -1
7316; GFX9_DPP-NEXT:    v_and_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
7317; GFX9_DPP-NEXT:    v_and_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf
7318; GFX9_DPP-NEXT:    v_mov_b32_e32 v2, -1
7319; GFX9_DPP-NEXT:    v_and_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
7320; GFX9_DPP-NEXT:    v_and_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf
7321; GFX9_DPP-NEXT:    s_nop 0
7322; GFX9_DPP-NEXT:    v_and_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
7323; GFX9_DPP-NEXT:    v_and_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf
7324; GFX9_DPP-NEXT:    s_nop 0
7325; GFX9_DPP-NEXT:    v_and_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
7326; GFX9_DPP-NEXT:    v_and_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf
7327; GFX9_DPP-NEXT:    s_nop 0
7328; GFX9_DPP-NEXT:    v_and_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
7329; GFX9_DPP-NEXT:    v_and_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
7330; GFX9_DPP-NEXT:    s_nop 0
7331; GFX9_DPP-NEXT:    v_and_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
7332; GFX9_DPP-NEXT:    v_and_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
7333; GFX9_DPP-NEXT:    v_readlane_b32 s3, v3, 63
7334; GFX9_DPP-NEXT:    v_readlane_b32 s2, v4, 63
7335; GFX9_DPP-NEXT:    v_mov_b32_dpp v2, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
7336; GFX9_DPP-NEXT:    v_mov_b32_dpp v1, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
7337; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
7338; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
7339; GFX9_DPP-NEXT:    ; implicit-def: $vgpr5_vgpr6
7340; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
7341; GFX9_DPP-NEXT:    s_cbranch_execz .LBB16_2
7342; GFX9_DPP-NEXT:  ; %bb.1:
7343; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, s3
7344; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, s2
7345; GFX9_DPP-NEXT:    ds_and_rtn_b64 v[5:6], v7, v[5:6]
7346; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
7347; GFX9_DPP-NEXT:  .LBB16_2:
7348; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
7349; GFX9_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
7350; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v6
7351; GFX9_DPP-NEXT:    v_readfirstlane_b32 s5, v5
7352; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
7353; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
7354; GFX9_DPP-NEXT:    s_mov_b32 s3, 0xf000
7355; GFX9_DPP-NEXT:    s_mov_b32 s2, -1
7356; GFX9_DPP-NEXT:    v_and_b32_e32 v6, s4, v6
7357; GFX9_DPP-NEXT:    v_and_b32_e32 v5, s5, v5
7358; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
7359; GFX9_DPP-NEXT:    buffer_store_dwordx2 v[5:6], off, s[0:3], 0
7360; GFX9_DPP-NEXT:    s_endpgm
7361;
7362; GFX1064_DPP-LABEL: and_i64_varying:
7363; GFX1064_DPP:       ; %bb.0: ; %entry
7364; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
7365; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v1, -1, 0, s[0:1]
7366; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v2, -1, v0, s[0:1]
7367; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, -1
7368; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, -1
7369; GFX1064_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
7370; GFX1064_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
7371; GFX1064_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
7372; GFX1064_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
7373; GFX1064_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
7374; GFX1064_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
7375; GFX1064_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
7376; GFX1064_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
7377; GFX1064_DPP-NEXT:    v_permlanex16_b32 v3, v1, -1, -1
7378; GFX1064_DPP-NEXT:    v_permlanex16_b32 v4, v2, -1, -1
7379; GFX1064_DPP-NEXT:    v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
7380; GFX1064_DPP-NEXT:    v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
7381; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 31
7382; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v2, 31
7383; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, s2
7384; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, s3
7385; GFX1064_DPP-NEXT:    v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
7386; GFX1064_DPP-NEXT:    v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
7387; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
7388; GFX1064_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
7389; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
7390; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
7391; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 15
7392; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
7393; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v2, 15
7394; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v1, 31
7395; GFX1064_DPP-NEXT:    v_readlane_b32 s7, v2, 31
7396; GFX1064_DPP-NEXT:    v_writelane_b32 v6, s2, 16
7397; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v2, 63
7398; GFX1064_DPP-NEXT:    v_writelane_b32 v5, s3, 16
7399; GFX1064_DPP-NEXT:    v_readlane_b32 s8, v1, 47
7400; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v1, 63
7401; GFX1064_DPP-NEXT:    v_readlane_b32 s9, v2, 47
7402; GFX1064_DPP-NEXT:    v_writelane_b32 v6, s6, 32
7403; GFX1064_DPP-NEXT:    v_writelane_b32 v5, s7, 32
7404; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
7405; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v0
7406; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, 0
7407; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
7408; GFX1064_DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
7409; GFX1064_DPP-NEXT:    v_writelane_b32 v6, s8, 48
7410; GFX1064_DPP-NEXT:    v_writelane_b32 v5, s9, 48
7411; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[6:7]
7412; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
7413; GFX1064_DPP-NEXT:    s_mov_b32 s2, -1
7414; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
7415; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[6:7], vcc
7416; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB16_2
7417; GFX1064_DPP-NEXT:  ; %bb.1:
7418; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, s1
7419; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, s0
7420; GFX1064_DPP-NEXT:    ds_and_rtn_b64 v[7:8], v0, v[7:8]
7421; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
7422; GFX1064_DPP-NEXT:    buffer_gl0_inv
7423; GFX1064_DPP-NEXT:  .LBB16_2:
7424; GFX1064_DPP-NEXT:    s_waitcnt_depctr 0xffe3
7425; GFX1064_DPP-NEXT:    s_or_b64 exec, exec, s[6:7]
7426; GFX1064_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
7427; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s3, v8
7428; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, v5
7429; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, v6
7430; GFX1064_DPP-NEXT:    s_mov_b32 null, 0
7431; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s4, v7
7432; GFX1064_DPP-NEXT:    v_and_b32_e32 v9, s3, v9
7433; GFX1064_DPP-NEXT:    v_and_b32_e32 v8, s4, v8
7434; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
7435; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
7436; GFX1064_DPP-NEXT:    buffer_store_dwordx2 v[8:9], off, s[0:3], 0
7437; GFX1064_DPP-NEXT:    s_endpgm
7438;
7439; GFX1032_DPP-LABEL: and_i64_varying:
7440; GFX1032_DPP:       ; %bb.0: ; %entry
7441; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s2, -1
7442; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v1, -1, 0, s2
7443; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v2, -1, v0, s2
7444; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, -1
7445; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, -1
7446; GFX1032_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
7447; GFX1032_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
7448; GFX1032_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
7449; GFX1032_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
7450; GFX1032_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
7451; GFX1032_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
7452; GFX1032_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
7453; GFX1032_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
7454; GFX1032_DPP-NEXT:    v_permlanex16_b32 v3, v1, -1, -1
7455; GFX1032_DPP-NEXT:    v_permlanex16_b32 v4, v2, -1, -1
7456; GFX1032_DPP-NEXT:    v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
7457; GFX1032_DPP-NEXT:    v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
7458; GFX1032_DPP-NEXT:    v_readlane_b32 s3, v1, 15
7459; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v1, 31
7460; GFX1032_DPP-NEXT:    v_readlane_b32 s0, v2, 31
7461; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
7462; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
7463; GFX1032_DPP-NEXT:    v_readlane_b32 s6, v2, 15
7464; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s2
7465; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
7466; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, 0
7467; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s2, -1
7468; GFX1032_DPP-NEXT:    v_writelane_b32 v6, s3, 16
7469; GFX1032_DPP-NEXT:    v_writelane_b32 v5, s6, 16
7470; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s2
7471; GFX1032_DPP-NEXT:    s_mov_b32 s2, -1
7472; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
7473; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
7474; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s3, vcc_lo
7475; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB16_2
7476; GFX1032_DPP-NEXT:  ; %bb.1:
7477; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, s1
7478; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, s0
7479; GFX1032_DPP-NEXT:    ds_and_rtn_b64 v[7:8], v0, v[7:8]
7480; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
7481; GFX1032_DPP-NEXT:    buffer_gl0_inv
7482; GFX1032_DPP-NEXT:  .LBB16_2:
7483; GFX1032_DPP-NEXT:    s_waitcnt_depctr 0xffe3
7484; GFX1032_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s3
7485; GFX1032_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
7486; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s3, v8
7487; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, v5
7488; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, v6
7489; GFX1032_DPP-NEXT:    s_mov_b32 null, 0
7490; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s4, v7
7491; GFX1032_DPP-NEXT:    v_and_b32_e32 v9, s3, v9
7492; GFX1032_DPP-NEXT:    v_and_b32_e32 v8, s4, v8
7493; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
7494; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
7495; GFX1032_DPP-NEXT:    buffer_store_dwordx2 v[8:9], off, s[0:3], 0
7496; GFX1032_DPP-NEXT:    s_endpgm
7497;
7498; GFX1164_DPP-LABEL: and_i64_varying:
7499; GFX1164_DPP:       ; %bb.0: ; %entry
7500; GFX1164_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
7501; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
7502; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
7503; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v1, -1, 0, s[0:1]
7504; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v2, -1, v0, s[0:1]
7505; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, -1
7506; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, -1
7507; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
7508; GFX1164_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
7509; GFX1164_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
7510; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
7511; GFX1164_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
7512; GFX1164_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
7513; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
7514; GFX1164_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
7515; GFX1164_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
7516; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
7517; GFX1164_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
7518; GFX1164_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
7519; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
7520; GFX1164_DPP-NEXT:    v_permlanex16_b32 v3, v1, -1, -1
7521; GFX1164_DPP-NEXT:    v_permlanex16_b32 v4, v2, -1, -1
7522; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
7523; GFX1164_DPP-NEXT:    v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
7524; GFX1164_DPP-NEXT:    v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
7525; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
7526; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v1, 31
7527; GFX1164_DPP-NEXT:    v_readlane_b32 s3, v2, 31
7528; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
7529; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, s2
7530; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, s3
7531; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
7532; GFX1164_DPP-NEXT:    v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
7533; GFX1164_DPP-NEXT:    v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
7534; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
7535; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v1, 15
7536; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
7537; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
7538; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
7539; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
7540; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
7541; GFX1164_DPP-NEXT:    v_readlane_b32 s3, v2, 15
7542; GFX1164_DPP-NEXT:    v_readlane_b32 s6, v1, 31
7543; GFX1164_DPP-NEXT:    v_writelane_b32 v6, s2, 16
7544; GFX1164_DPP-NEXT:    v_readlane_b32 s7, v2, 31
7545; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v2, 63
7546; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s3, 16
7547; GFX1164_DPP-NEXT:    v_readlane_b32 s8, v1, 47
7548; GFX1164_DPP-NEXT:    v_readlane_b32 s3, v1, 63
7549; GFX1164_DPP-NEXT:    v_writelane_b32 v6, s6, 32
7550; GFX1164_DPP-NEXT:    v_readlane_b32 s9, v2, 47
7551; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s7, 32
7552; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
7553; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
7554; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v0
7555; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, 0
7556; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
7557; GFX1164_DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
7558; GFX1164_DPP-NEXT:    v_writelane_b32 v6, s8, 48
7559; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s9, 48
7560; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[6:7]
7561; GFX1164_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
7562; GFX1164_DPP-NEXT:    s_mov_b32 s2, -1
7563; GFX1164_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
7564; GFX1164_DPP-NEXT:    s_and_saveexec_b64 s[6:7], vcc
7565; GFX1164_DPP-NEXT:    s_cbranch_execz .LBB16_2
7566; GFX1164_DPP-NEXT:  ; %bb.1:
7567; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, s1
7568; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, s0
7569; GFX1164_DPP-NEXT:    ds_and_rtn_b64 v[7:8], v0, v[7:8]
7570; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
7571; GFX1164_DPP-NEXT:    buffer_gl0_inv
7572; GFX1164_DPP-NEXT:  .LBB16_2:
7573; GFX1164_DPP-NEXT:    s_or_b64 exec, exec, s[6:7]
7574; GFX1164_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
7575; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s3, v8
7576; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, v5
7577; GFX1164_DPP-NEXT:    v_mov_b32_e32 v9, v6
7578; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s4, v7
7579; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
7580; GFX1164_DPP-NEXT:    v_and_b32_e32 v9, s3, v9
7581; GFX1164_DPP-NEXT:    v_and_b32_e32 v8, s4, v8
7582; GFX1164_DPP-NEXT:    s_mov_b32 s3, 0x31016000
7583; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
7584; GFX1164_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[0:3], 0
7585; GFX1164_DPP-NEXT:    s_endpgm
7586;
7587; GFX1132_DPP-LABEL: and_i64_varying:
7588; GFX1132_DPP:       ; %bb.0: ; %entry
7589; GFX1132_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
7590; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s2, -1
7591; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
7592; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v1, -1, 0, s2
7593; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v2, -1, v0, s2
7594; GFX1132_DPP-NEXT:    v_dual_mov_b32 v6, -1 :: v_dual_mov_b32 v5, -1
7595; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
7596; GFX1132_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
7597; GFX1132_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
7598; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
7599; GFX1132_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
7600; GFX1132_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
7601; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
7602; GFX1132_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
7603; GFX1132_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
7604; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
7605; GFX1132_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
7606; GFX1132_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
7607; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
7608; GFX1132_DPP-NEXT:    v_permlanex16_b32 v3, v1, -1, -1
7609; GFX1132_DPP-NEXT:    v_permlanex16_b32 v4, v2, -1, -1
7610; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
7611; GFX1132_DPP-NEXT:    v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
7612; GFX1132_DPP-NEXT:    v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
7613; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
7614; GFX1132_DPP-NEXT:    v_readlane_b32 s3, v1, 15
7615; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v1, 31
7616; GFX1132_DPP-NEXT:    v_readlane_b32 s0, v2, 31
7617; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
7618; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
7619; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v2, 15
7620; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
7621; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
7622; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
7623; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
7624; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s2, -1
7625; GFX1132_DPP-NEXT:    v_writelane_b32 v6, s3, 16
7626; GFX1132_DPP-NEXT:    v_writelane_b32 v5, s6, 16
7627; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
7628; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
7629; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
7630; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
7631; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s3, vcc_lo
7632; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB16_2
7633; GFX1132_DPP-NEXT:  ; %bb.1:
7634; GFX1132_DPP-NEXT:    v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0
7635; GFX1132_DPP-NEXT:    ds_and_rtn_b64 v[7:8], v0, v[7:8]
7636; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
7637; GFX1132_DPP-NEXT:    buffer_gl0_inv
7638; GFX1132_DPP-NEXT:  .LBB16_2:
7639; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s3
7640; GFX1132_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
7641; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s3, v8
7642; GFX1132_DPP-NEXT:    v_mov_b32_e32 v8, v5
7643; GFX1132_DPP-NEXT:    v_mov_b32_e32 v9, v6
7644; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s4, v7
7645; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
7646; GFX1132_DPP-NEXT:    v_and_b32_e32 v9, s3, v9
7647; GFX1132_DPP-NEXT:    v_and_b32_e32 v8, s4, v8
7648; GFX1132_DPP-NEXT:    s_mov_b32 s3, 0x31016000
7649; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
7650; GFX1132_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[0:3], 0
7651; GFX1132_DPP-NEXT:    s_endpgm
7652entry:
7653  %lane = call i32 @llvm.amdgcn.workitem.id.x()
7654  %lane_ext = zext i32 %lane to i64
7655  %old = atomicrmw and ptr addrspace(3) @local_var32, i64 %lane_ext acq_rel
7656  store i64 %old, ptr addrspace(1) %out
7657  ret void
7658}
7659
; or_i32_varying - 32-bit `atomicrmw or` whose operand differs per lane.
;
; Each lane ORs its own workitem.id.x into @local_var32 (addrspace(3)) with
; acq_rel ordering and stores the value returned by the atomic to %out.
;
; The assertions in this function are autogenerated (see the NOTE on the first
; line of this file); regenerate them with the script rather than editing them
; by hand. What they currently pin down:
;   * Iterative-strategy runs: a ComputeLoop visits the set bits of a copy of
;     exec one at a time (s_ff1 / s_ctz), reads that lane's operand with
;     v_readlane, records the running OR *before* that lane's contribution with
;     v_writelane, and accumulates the total in an SGPR. The lane for which
;     mbcnt is 0 then issues a single ds_or_rtn_b32; the returned value is
;     broadcast with v_readfirstlane and OR-ed with each lane's recorded
;     partial value before the store.
;   * DPP-strategy runs on the GFX8 and newer targets: same single-atomic
;     structure, but the per-lane partial values come from a chain of
;     v_or_b32_dpp row shifts (plus row_bcast or permlanex16/readlane steps)
;     executed under s_or_saveexec -1 instead of a scalar loop.
;   * DPP-strategy run on the default (GFX7-or-older) target: every lane
;     issues ds_or_rtn_b32 directly with no scan.
;     NOTE(review): presumably because DPP is unavailable there - confirm.
7660define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
7661; GFX7LESS_ITERATIVE-LABEL: or_i32_varying:
7662; GFX7LESS_ITERATIVE:       ; %bb.0: ; %entry
7663; GFX7LESS_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
7664; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s2, 0
7665; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
7666; GFX7LESS_ITERATIVE-NEXT:  .LBB17_1: ; %ComputeLoop
7667; GFX7LESS_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
7668; GFX7LESS_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
7669; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, s3
7670; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
7671; GFX7LESS_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
7672; GFX7LESS_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
7673; GFX7LESS_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
7674; GFX7LESS_ITERATIVE-NEXT:    v_cmp_ne_u64_e64 s[6:7], s[0:1], 0
7675; GFX7LESS_ITERATIVE-NEXT:    s_and_b64 vcc, exec, s[6:7]
7676; GFX7LESS_ITERATIVE-NEXT:    s_or_b32 s2, s2, s8
7677; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_vccnz .LBB17_1
7678; GFX7LESS_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
7679; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
7680; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
7681; GFX7LESS_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
7682; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
7683; GFX7LESS_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
7684; GFX7LESS_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
7685; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_execz .LBB17_4
7686; GFX7LESS_ITERATIVE-NEXT:  ; %bb.3:
7687; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
7688; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
7689; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, -1
7690; GFX7LESS_ITERATIVE-NEXT:    ds_or_rtn_b32 v0, v0, v2
7691; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
7692; GFX7LESS_ITERATIVE-NEXT:  .LBB17_4:
7693; GFX7LESS_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
7694; GFX7LESS_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
7695; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
7696; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s2, -1
7697; GFX7LESS_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v0
7698; GFX7LESS_ITERATIVE-NEXT:    v_or_b32_e32 v0, s4, v1
7699; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
7700; GFX7LESS_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
7701; GFX7LESS_ITERATIVE-NEXT:    s_endpgm
7702;
7703; GFX8_ITERATIVE-LABEL: or_i32_varying:
7704; GFX8_ITERATIVE:       ; %bb.0: ; %entry
7705; GFX8_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
7706; GFX8_ITERATIVE-NEXT:    s_mov_b32 s2, 0
7707; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
7708; GFX8_ITERATIVE-NEXT:  .LBB17_1: ; %ComputeLoop
7709; GFX8_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
7710; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
7711; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s3
7712; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
7713; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
7714; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
7715; GFX8_ITERATIVE-NEXT:    s_or_b32 s2, s2, s8
7716; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
7717; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
7718; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB17_1
7719; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
7720; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
7721; GFX8_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
7722; GFX8_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
7723; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
7724; GFX8_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
7725; GFX8_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
7726; GFX8_ITERATIVE-NEXT:    s_cbranch_execz .LBB17_4
7727; GFX8_ITERATIVE-NEXT:  ; %bb.3:
7728; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
7729; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
7730; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, -1
7731; GFX8_ITERATIVE-NEXT:    ds_or_rtn_b32 v0, v0, v2
7732; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
7733; GFX8_ITERATIVE-NEXT:  .LBB17_4:
7734; GFX8_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
7735; GFX8_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
7736; GFX8_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v0
7737; GFX8_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
7738; GFX8_ITERATIVE-NEXT:    s_mov_b32 s2, -1
7739; GFX8_ITERATIVE-NEXT:    v_or_b32_e32 v0, s4, v1
7740; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
7741; GFX8_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
7742; GFX8_ITERATIVE-NEXT:    s_endpgm
7743;
7744; GFX9_ITERATIVE-LABEL: or_i32_varying:
7745; GFX9_ITERATIVE:       ; %bb.0: ; %entry
7746; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
7747; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, 0
7748; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
7749; GFX9_ITERATIVE-NEXT:  .LBB17_1: ; %ComputeLoop
7750; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
7751; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
7752; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s3
7753; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
7754; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
7755; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
7756; GFX9_ITERATIVE-NEXT:    s_or_b32 s2, s2, s8
7757; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
7758; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
7759; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB17_1
7760; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
7761; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
7762; GFX9_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
7763; GFX9_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
7764; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
7765; GFX9_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
7766; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
7767; GFX9_ITERATIVE-NEXT:    s_cbranch_execz .LBB17_4
7768; GFX9_ITERATIVE-NEXT:  ; %bb.3:
7769; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
7770; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
7771; GFX9_ITERATIVE-NEXT:    ds_or_rtn_b32 v0, v0, v2
7772; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
7773; GFX9_ITERATIVE-NEXT:  .LBB17_4:
7774; GFX9_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
7775; GFX9_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
7776; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v0
7777; GFX9_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
7778; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, -1
7779; GFX9_ITERATIVE-NEXT:    v_or_b32_e32 v0, s4, v1
7780; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
7781; GFX9_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
7782; GFX9_ITERATIVE-NEXT:    s_endpgm
7783;
7784; GFX1064_ITERATIVE-LABEL: or_i32_varying:
7785; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
7786; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
7787; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, 0
7788; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
7789; GFX1064_ITERATIVE-NEXT:  .LBB17_1: ; %ComputeLoop
7790; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
7791; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
7792; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
7793; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
7794; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, s3
7795; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
7796; GFX1064_ITERATIVE-NEXT:    s_or_b32 s2, s2, s8
7797; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
7798; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB17_1
7799; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
7800; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
7801; GFX1064_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
7802; GFX1064_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
7803; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
7804; GFX1064_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
7805; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
7806; GFX1064_ITERATIVE-NEXT:    s_cbranch_execz .LBB17_4
7807; GFX1064_ITERATIVE-NEXT:  ; %bb.3:
7808; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
7809; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
7810; GFX1064_ITERATIVE-NEXT:    ds_or_rtn_b32 v0, v0, v2
7811; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
7812; GFX1064_ITERATIVE-NEXT:    buffer_gl0_inv
7813; GFX1064_ITERATIVE-NEXT:  .LBB17_4:
7814; GFX1064_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
7815; GFX1064_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
7816; GFX1064_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
7817; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v0
7818; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
7819; GFX1064_ITERATIVE-NEXT:    v_or_b32_e32 v0, s2, v1
7820; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, -1
7821; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
7822; GFX1064_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
7823; GFX1064_ITERATIVE-NEXT:    s_endpgm
7824;
7825; GFX1032_ITERATIVE-LABEL: or_i32_varying:
7826; GFX1032_ITERATIVE:       ; %bb.0: ; %entry
7827; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s1, exec_lo
7828; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s0, 0
7829; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
7830; GFX1032_ITERATIVE-NEXT:  .LBB17_1: ; %ComputeLoop
7831; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
7832; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s2, s1
7833; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
7834; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
7835; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s2
7836; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s6
7837; GFX1032_ITERATIVE-NEXT:    s_or_b32 s0, s0, s3
7838; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
7839; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB17_1
7840; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
7841; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
7842; GFX1032_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
7843; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
7844; GFX1032_ITERATIVE-NEXT:    s_and_saveexec_b32 s1, vcc_lo
7845; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s1, exec_lo, s1
7846; GFX1032_ITERATIVE-NEXT:    s_cbranch_execz .LBB17_4
7847; GFX1032_ITERATIVE-NEXT:  ; %bb.3:
7848; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
7849; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s0
7850; GFX1032_ITERATIVE-NEXT:    ds_or_rtn_b32 v0, v0, v2
7851; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
7852; GFX1032_ITERATIVE-NEXT:    buffer_gl0_inv
7853; GFX1032_ITERATIVE-NEXT:  .LBB17_4:
7854; GFX1032_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
7855; GFX1032_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s1
7856; GFX1032_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
7857; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v0
7858; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
7859; GFX1032_ITERATIVE-NEXT:    v_or_b32_e32 v0, s2, v1
7860; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, -1
7861; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
7862; GFX1032_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
7863; GFX1032_ITERATIVE-NEXT:    s_endpgm
7864;
7865; GFX1164_ITERATIVE-LABEL: or_i32_varying:
7866; GFX1164_ITERATIVE:       ; %bb.0: ; %entry
7867; GFX1164_ITERATIVE-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
7868; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
7869; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, 0
7870; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
7871; GFX1164_ITERATIVE-NEXT:  .LBB17_1: ; %ComputeLoop
7872; GFX1164_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
7873; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s3, s[0:1]
7874; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
7875; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s8, v1, s3
7876; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
7877; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s2, s3
7878; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
7879; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2)
7880; GFX1164_ITERATIVE-NEXT:    s_or_b32 s2, s2, s8
7881; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
7882; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB17_1
7883; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
7884; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
7885; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7886; GFX1164_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
7887; GFX1164_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
7888; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
7889; GFX1164_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
7890; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
7891; GFX1164_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
7892; GFX1164_ITERATIVE-NEXT:    s_cbranch_execz .LBB17_4
7893; GFX1164_ITERATIVE-NEXT:  ; %bb.3:
7894; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v1, 0
7895; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
7896; GFX1164_ITERATIVE-NEXT:    ds_or_rtn_b32 v1, v1, v2
7897; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
7898; GFX1164_ITERATIVE-NEXT:    buffer_gl0_inv
7899; GFX1164_ITERATIVE-NEXT:  .LBB17_4:
7900; GFX1164_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
7901; GFX1164_ITERATIVE-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
7902; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v1
7903; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
7904; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
7905; GFX1164_ITERATIVE-NEXT:    v_or_b32_e32 v0, s2, v0
7906; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
7907; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
7908; GFX1164_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
7909; GFX1164_ITERATIVE-NEXT:    s_endpgm
7910;
7911; GFX1132_ITERATIVE-LABEL: or_i32_varying:
7912; GFX1132_ITERATIVE:       ; %bb.0: ; %entry
7913; GFX1132_ITERATIVE-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
7914; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s1, exec_lo
7915; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s0, 0
7916; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
7917; GFX1132_ITERATIVE-NEXT:  .LBB17_1: ; %ComputeLoop
7918; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
7919; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s2, s1
7920; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
7921; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s3, v1, s2
7922; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
7923; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s2
7924; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s6
7925; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2)
7926; GFX1132_ITERATIVE-NEXT:    s_or_b32 s0, s0, s3
7927; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
7928; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB17_1
7929; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
7930; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
7931; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
7932; GFX1132_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
7933; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
7934; GFX1132_ITERATIVE-NEXT:    s_and_saveexec_b32 s1, vcc_lo
7935; GFX1132_ITERATIVE-NEXT:    s_xor_b32 s1, exec_lo, s1
7936; GFX1132_ITERATIVE-NEXT:    s_cbranch_execz .LBB17_4
7937; GFX1132_ITERATIVE-NEXT:  ; %bb.3:
7938; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
7939; GFX1132_ITERATIVE-NEXT:    ds_or_rtn_b32 v1, v1, v2
7940; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
7941; GFX1132_ITERATIVE-NEXT:    buffer_gl0_inv
7942; GFX1132_ITERATIVE-NEXT:  .LBB17_4:
7943; GFX1132_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s1
7944; GFX1132_ITERATIVE-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
7945; GFX1132_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v1
7946; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
7947; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
7948; GFX1132_ITERATIVE-NEXT:    v_or_b32_e32 v0, s2, v0
7949; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
7950; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
7951; GFX1132_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
7952; GFX1132_ITERATIVE-NEXT:    s_endpgm
7953;
7954; GFX7LESS_DPP-LABEL: or_i32_varying:
7955; GFX7LESS_DPP:       ; %bb.0: ; %entry
7956; GFX7LESS_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
7957; GFX7LESS_DPP-NEXT:    v_mov_b32_e32 v1, 0
7958; GFX7LESS_DPP-NEXT:    s_mov_b32 m0, -1
7959; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
7960; GFX7LESS_DPP-NEXT:    ds_or_rtn_b32 v0, v1, v0
7961; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
7962; GFX7LESS_DPP-NEXT:    s_mov_b32 s3, 0xf000
7963; GFX7LESS_DPP-NEXT:    s_mov_b32 s2, -1
7964; GFX7LESS_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
7965; GFX7LESS_DPP-NEXT:    s_endpgm
7966;
7967; GFX8_DPP-LABEL: or_i32_varying:
7968; GFX8_DPP:       ; %bb.0: ; %entry
7969; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, 0
7970; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
7971; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
7972; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
7973; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
7974; GFX8_DPP-NEXT:    v_mov_b32_e32 v2, 0
7975; GFX8_DPP-NEXT:    s_nop 0
7976; GFX8_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
7977; GFX8_DPP-NEXT:    s_nop 1
7978; GFX8_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
7979; GFX8_DPP-NEXT:    s_nop 1
7980; GFX8_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
7981; GFX8_DPP-NEXT:    s_nop 1
7982; GFX8_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
7983; GFX8_DPP-NEXT:    s_nop 1
7984; GFX8_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
7985; GFX8_DPP-NEXT:    s_nop 1
7986; GFX8_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
7987; GFX8_DPP-NEXT:    v_readlane_b32 s2, v1, 63
7988; GFX8_DPP-NEXT:    s_nop 0
7989; GFX8_DPP-NEXT:    v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
7990; GFX8_DPP-NEXT:    s_mov_b64 exec, s[0:1]
7991; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
7992; GFX8_DPP-NEXT:    ; implicit-def: $vgpr0
7993; GFX8_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
7994; GFX8_DPP-NEXT:    s_cbranch_execz .LBB17_2
7995; GFX8_DPP-NEXT:  ; %bb.1:
7996; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, s2
7997; GFX8_DPP-NEXT:    s_mov_b32 m0, -1
7998; GFX8_DPP-NEXT:    ds_or_rtn_b32 v0, v3, v0
7999; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
8000; GFX8_DPP-NEXT:  .LBB17_2:
8001; GFX8_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
8002; GFX8_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
8003; GFX8_DPP-NEXT:    v_readfirstlane_b32 s4, v0
8004; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, v2
8005; GFX8_DPP-NEXT:    s_mov_b32 s3, 0xf000
8006; GFX8_DPP-NEXT:    s_mov_b32 s2, -1
8007; GFX8_DPP-NEXT:    v_or_b32_e32 v0, s4, v0
8008; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
8009; GFX8_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
8010; GFX8_DPP-NEXT:    s_endpgm
8011;
8012; GFX9_DPP-LABEL: or_i32_varying:
8013; GFX9_DPP:       ; %bb.0: ; %entry
8014; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
8015; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
8016; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
8017; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
8018; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
8019; GFX9_DPP-NEXT:    v_mov_b32_e32 v2, 0
8020; GFX9_DPP-NEXT:    s_nop 0
8021; GFX9_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
8022; GFX9_DPP-NEXT:    s_nop 1
8023; GFX9_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
8024; GFX9_DPP-NEXT:    s_nop 1
8025; GFX9_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
8026; GFX9_DPP-NEXT:    s_nop 1
8027; GFX9_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
8028; GFX9_DPP-NEXT:    s_nop 1
8029; GFX9_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
8030; GFX9_DPP-NEXT:    s_nop 1
8031; GFX9_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
8032; GFX9_DPP-NEXT:    v_readlane_b32 s2, v1, 63
8033; GFX9_DPP-NEXT:    s_nop 0
8034; GFX9_DPP-NEXT:    v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
8035; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
8036; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
8037; GFX9_DPP-NEXT:    ; implicit-def: $vgpr0
8038; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
8039; GFX9_DPP-NEXT:    s_cbranch_execz .LBB17_2
8040; GFX9_DPP-NEXT:  ; %bb.1:
8041; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s2
8042; GFX9_DPP-NEXT:    ds_or_rtn_b32 v0, v3, v0
8043; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
8044; GFX9_DPP-NEXT:  .LBB17_2:
8045; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
8046; GFX9_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
8047; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v0
8048; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, v2
8049; GFX9_DPP-NEXT:    s_mov_b32 s3, 0xf000
8050; GFX9_DPP-NEXT:    s_mov_b32 s2, -1
8051; GFX9_DPP-NEXT:    v_or_b32_e32 v0, s4, v0
8052; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
8053; GFX9_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
8054; GFX9_DPP-NEXT:    s_endpgm
8055;
8056; GFX1064_DPP-LABEL: or_i32_varying:
8057; GFX1064_DPP:       ; %bb.0: ; %entry
8058; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
8059; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
8060; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, 0
8061; GFX1064_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
8062; GFX1064_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
8063; GFX1064_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
8064; GFX1064_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
8065; GFX1064_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
8066; GFX1064_DPP-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
8067; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 31
8068; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, s2
8069; GFX1064_DPP-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
8070; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
8071; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 15
8072; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v1, 31
8073; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s2, 16
8074; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
8075; GFX1064_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
8076; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
8077; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 47
8078; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v1, 63
8079; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s3, 32
8080; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
8081; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
8082; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, 0
8083; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
8084; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s2, 48
8085; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
8086; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
8087; GFX1064_DPP-NEXT:    s_mov_b32 s2, -1
8088; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr0
8089; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
8090; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB17_2
8091; GFX1064_DPP-NEXT:  ; %bb.1:
8092; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, s6
8093; GFX1064_DPP-NEXT:    s_mov_b32 s3, s6
8094; GFX1064_DPP-NEXT:    ds_or_rtn_b32 v0, v4, v0
8095; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
8096; GFX1064_DPP-NEXT:    buffer_gl0_inv
8097; GFX1064_DPP-NEXT:  .LBB17_2:
8098; GFX1064_DPP-NEXT:    s_waitcnt_depctr 0xffe3
8099; GFX1064_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
8100; GFX1064_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
8101; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s3, v0
8102; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, v3
8103; GFX1064_DPP-NEXT:    v_or_b32_e32 v0, s3, v0
8104; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
8105; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
8106; GFX1064_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
8107; GFX1064_DPP-NEXT:    s_endpgm
8108;
8109; GFX1032_DPP-LABEL: or_i32_varying:
8110; GFX1032_DPP:       ; %bb.0: ; %entry
8111; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
8112; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
8113; GFX1032_DPP-NEXT:    v_mov_b32_e32 v3, 0
8114; GFX1032_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
8115; GFX1032_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
8116; GFX1032_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
8117; GFX1032_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
8118; GFX1032_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
8119; GFX1032_DPP-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
8120; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v1, 15
8121; GFX1032_DPP-NEXT:    v_readlane_b32 s2, v1, 31
8122; GFX1032_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
8123; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
8124; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
8125; GFX1032_DPP-NEXT:    v_mov_b32_e32 v4, 0
8126; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
8127; GFX1032_DPP-NEXT:    v_writelane_b32 v3, s1, 16
8128; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
8129; GFX1032_DPP-NEXT:    s_mov_b32 s0, s2
8130; GFX1032_DPP-NEXT:    s_mov_b32 s2, -1
8131; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
8132; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr0
8133; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
8134; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB17_2
8135; GFX1032_DPP-NEXT:  ; %bb.1:
8136; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, s0
8137; GFX1032_DPP-NEXT:    ds_or_rtn_b32 v0, v4, v0
8138; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
8139; GFX1032_DPP-NEXT:    buffer_gl0_inv
8140; GFX1032_DPP-NEXT:  .LBB17_2:
8141; GFX1032_DPP-NEXT:    s_waitcnt_depctr 0xffe3
8142; GFX1032_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s1
8143; GFX1032_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
8144; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s3, v0
8145; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, v3
8146; GFX1032_DPP-NEXT:    v_or_b32_e32 v0, s3, v0
8147; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
8148; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
8149; GFX1032_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
8150; GFX1032_DPP-NEXT:    s_endpgm
8151;
8152; GFX1164_DPP-LABEL: or_i32_varying:
8153; GFX1164_DPP:       ; %bb.0: ; %entry
8154; GFX1164_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
8155; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
8156; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
8157; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
8158; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, 0
8159; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
8160; GFX1164_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
8161; GFX1164_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
8162; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8163; GFX1164_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
8164; GFX1164_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
8165; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8166; GFX1164_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
8167; GFX1164_DPP-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
8168; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8169; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v1, 31
8170; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, s2
8171; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8172; GFX1164_DPP-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
8173; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
8174; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v1, 15
8175; GFX1164_DPP-NEXT:    v_readlane_b32 s3, v1, 31
8176; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
8177; GFX1164_DPP-NEXT:    v_writelane_b32 v3, s2, 16
8178; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
8179; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
8180; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
8181; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v1, 47
8182; GFX1164_DPP-NEXT:    v_readlane_b32 s6, v1, 63
8183; GFX1164_DPP-NEXT:    v_writelane_b32 v3, s3, 32
8184; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
8185; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
8186; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
8187; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, 0
8188; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
8189; GFX1164_DPP-NEXT:    v_writelane_b32 v3, s2, 48
8190; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
8191; GFX1164_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
8192; GFX1164_DPP-NEXT:    s_mov_b32 s2, -1
8193; GFX1164_DPP-NEXT:    ; implicit-def: $vgpr0
8194; GFX1164_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
8195; GFX1164_DPP-NEXT:    s_cbranch_execz .LBB17_2
8196; GFX1164_DPP-NEXT:  ; %bb.1:
8197; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, s6
8198; GFX1164_DPP-NEXT:    s_mov_b32 s3, s6
8199; GFX1164_DPP-NEXT:    ds_or_rtn_b32 v0, v4, v0
8200; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
8201; GFX1164_DPP-NEXT:    buffer_gl0_inv
8202; GFX1164_DPP-NEXT:  .LBB17_2:
8203; GFX1164_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
8204; GFX1164_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
8205; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s3, v0
8206; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, v3
8207; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
8208; GFX1164_DPP-NEXT:    v_or_b32_e32 v0, s3, v0
8209; GFX1164_DPP-NEXT:    s_mov_b32 s3, 0x31016000
8210; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
8211; GFX1164_DPP-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
8212; GFX1164_DPP-NEXT:    s_endpgm
8213;
8214; GFX1132_DPP-LABEL: or_i32_varying:
8215; GFX1132_DPP:       ; %bb.0: ; %entry
8216; GFX1132_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
8217; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
8218; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
8219; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
8220; GFX1132_DPP-NEXT:    v_mov_b32_e32 v3, 0
8221; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
8222; GFX1132_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
8223; GFX1132_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
8224; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8225; GFX1132_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
8226; GFX1132_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
8227; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8228; GFX1132_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
8229; GFX1132_DPP-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
8230; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
8231; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v1, 15
8232; GFX1132_DPP-NEXT:    v_readlane_b32 s2, v1, 31
8233; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
8234; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
8235; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
8236; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, 0
8237; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
8238; GFX1132_DPP-NEXT:    v_writelane_b32 v3, s1, 16
8239; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
8240; GFX1132_DPP-NEXT:    s_mov_b32 s0, s2
8241; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
8242; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
8243; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr0
8244; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
8245; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB17_2
8246; GFX1132_DPP-NEXT:  ; %bb.1:
8247; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, s0
8248; GFX1132_DPP-NEXT:    ds_or_rtn_b32 v0, v4, v0
8249; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
8250; GFX1132_DPP-NEXT:    buffer_gl0_inv
8251; GFX1132_DPP-NEXT:  .LBB17_2:
8252; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s1
8253; GFX1132_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
8254; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s3, v0
8255; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, v3
8256; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
8257; GFX1132_DPP-NEXT:    v_or_b32_e32 v0, s3, v0
8258; GFX1132_DPP-NEXT:    s_mov_b32 s3, 0x31016000
8259; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
8260; GFX1132_DPP-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
8261; GFX1132_DPP-NEXT:    s_endpgm
8262entry:
  ; The atomic operand is the lane's own work-item id, so it is different in
  ; every lane; that per-lane variation is what the "_varying" tests exercise.
8263  %lane = call i32 @llvm.amdgcn.workitem.id.x()
8264  %old = atomicrmw or ptr addrspace(3) @local_var32, i32 %lane acq_rel
  ; The returned old value is used (stored to %out), which is why the checks
  ; above select the returning ds_or_rtn_b32 form of the LDS atomic.
8265  store i32 %old, ptr addrspace(1) %out
8266  ret void
8267}
8268
8269define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
8270; GFX7LESS_ITERATIVE-LABEL: or_i64_varying:
8271; GFX7LESS_ITERATIVE:       ; %bb.0: ; %entry
8272; GFX7LESS_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
8273; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
8274; GFX7LESS_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
8275; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
8276; GFX7LESS_ITERATIVE-NEXT:  .LBB18_1: ; %ComputeLoop
8277; GFX7LESS_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
8278; GFX7LESS_ITERATIVE-NEXT:    s_ff1_i32_b64 s8, s[2:3]
8279; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, s8
8280; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s8
8281; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s8
8282; GFX7LESS_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
8283; GFX7LESS_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
8284; GFX7LESS_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s8
8285; GFX7LESS_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[8:9]
8286; GFX7LESS_ITERATIVE-NEXT:    v_cmp_ne_u64_e64 s[8:9], s[2:3], 0
8287; GFX7LESS_ITERATIVE-NEXT:    s_and_b64 vcc, exec, s[8:9]
8288; GFX7LESS_ITERATIVE-NEXT:    s_or_b64 s[0:1], s[0:1], s[6:7]
8289; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_vccnz .LBB18_1
8290; GFX7LESS_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
8291; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
8292; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
8293; GFX7LESS_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
8294; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
8295; GFX7LESS_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
8296; GFX7LESS_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
8297; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_execz .LBB18_4
8298; GFX7LESS_ITERATIVE-NEXT:  ; %bb.3:
8299; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
8300; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
8301; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
8302; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, -1
8303; GFX7LESS_ITERATIVE-NEXT:    ds_or_rtn_b64 v[3:4], v0, v[3:4]
8304; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
8305; GFX7LESS_ITERATIVE-NEXT:  .LBB18_4:
8306; GFX7LESS_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
8307; GFX7LESS_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
8308; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
8309; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s2, -1
8310; GFX7LESS_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v4
8311; GFX7LESS_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v3
8312; GFX7LESS_ITERATIVE-NEXT:    v_or_b32_e32 v2, s4, v2
8313; GFX7LESS_ITERATIVE-NEXT:    v_or_b32_e32 v1, s5, v1
8314; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
8315; GFX7LESS_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
8316; GFX7LESS_ITERATIVE-NEXT:    s_endpgm
8317;
8318; GFX8_ITERATIVE-LABEL: or_i64_varying:
8319; GFX8_ITERATIVE:       ; %bb.0: ; %entry
8320; GFX8_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
8321; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
8322; GFX8_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
8323; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
8324; GFX8_ITERATIVE-NEXT:  .LBB18_1: ; %ComputeLoop
8325; GFX8_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
8326; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s8, s[2:3]
8327; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s8
8328; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s8
8329; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s8
8330; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s8
8331; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
8332; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
8333; GFX8_ITERATIVE-NEXT:    s_or_b64 s[0:1], s[0:1], s[6:7]
8334; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[8:9]
8335; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
8336; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB18_1
8337; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
8338; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
8339; GFX8_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
8340; GFX8_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
8341; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
8342; GFX8_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
8343; GFX8_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
8344; GFX8_ITERATIVE-NEXT:    s_cbranch_execz .LBB18_4
8345; GFX8_ITERATIVE-NEXT:  ; %bb.3:
8346; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
8347; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
8348; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
8349; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, -1
8350; GFX8_ITERATIVE-NEXT:    ds_or_rtn_b64 v[3:4], v0, v[3:4]
8351; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
8352; GFX8_ITERATIVE-NEXT:  .LBB18_4:
8353; GFX8_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
8354; GFX8_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
8355; GFX8_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v4
8356; GFX8_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v3
8357; GFX8_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
8358; GFX8_ITERATIVE-NEXT:    s_mov_b32 s2, -1
8359; GFX8_ITERATIVE-NEXT:    v_or_b32_e32 v2, s4, v2
8360; GFX8_ITERATIVE-NEXT:    v_or_b32_e32 v1, s5, v1
8361; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
8362; GFX8_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
8363; GFX8_ITERATIVE-NEXT:    s_endpgm
8364;
8365; GFX9_ITERATIVE-LABEL: or_i64_varying:
8366; GFX9_ITERATIVE:       ; %bb.0: ; %entry
8367; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
8368; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
8369; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
8370; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
8371; GFX9_ITERATIVE-NEXT:  .LBB18_1: ; %ComputeLoop
8372; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
8373; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s8, s[2:3]
8374; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s8
8375; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s8
8376; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s8
8377; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s8
8378; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
8379; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
8380; GFX9_ITERATIVE-NEXT:    s_or_b64 s[0:1], s[0:1], s[6:7]
8381; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[8:9]
8382; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
8383; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB18_1
8384; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
8385; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
8386; GFX9_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
8387; GFX9_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
8388; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
8389; GFX9_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
8390; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
8391; GFX9_ITERATIVE-NEXT:    s_cbranch_execz .LBB18_4
8392; GFX9_ITERATIVE-NEXT:  ; %bb.3:
8393; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
8394; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
8395; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
8396; GFX9_ITERATIVE-NEXT:    ds_or_rtn_b64 v[3:4], v0, v[3:4]
8397; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
8398; GFX9_ITERATIVE-NEXT:  .LBB18_4:
8399; GFX9_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
8400; GFX9_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
8401; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v4
8402; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v3
8403; GFX9_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
8404; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, -1
8405; GFX9_ITERATIVE-NEXT:    v_or_b32_e32 v2, s4, v2
8406; GFX9_ITERATIVE-NEXT:    v_or_b32_e32 v1, s5, v1
8407; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
8408; GFX9_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
8409; GFX9_ITERATIVE-NEXT:    s_endpgm
8410;
8411; GFX1064_ITERATIVE-LABEL: or_i64_varying:
8412; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
8413; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
8414; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
8415; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
8416; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
8417; GFX1064_ITERATIVE-NEXT:  .LBB18_1: ; %ComputeLoop
8418; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
8419; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s10, s[2:3]
8420; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s10
8421; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s10
8422; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s10
8423; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s10
8424; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s10
8425; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[8:9]
8426; GFX1064_ITERATIVE-NEXT:    s_or_b64 s[0:1], s[0:1], s[6:7]
8427; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
8428; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB18_1
8429; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
8430; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
8431; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
8432; GFX1064_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
8433; GFX1064_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
8434; GFX1064_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
8435; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
8436; GFX1064_ITERATIVE-NEXT:    s_cbranch_execz .LBB18_4
8437; GFX1064_ITERATIVE-NEXT:  ; %bb.3:
8438; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
8439; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
8440; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
8441; GFX1064_ITERATIVE-NEXT:    ds_or_rtn_b64 v[3:4], v0, v[3:4]
8442; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
8443; GFX1064_ITERATIVE-NEXT:    buffer_gl0_inv
8444; GFX1064_ITERATIVE-NEXT:  .LBB18_4:
8445; GFX1064_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
8446; GFX1064_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
8447; GFX1064_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
8448; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v4
8449; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v3
8450; GFX1064_ITERATIVE-NEXT:    v_or_b32_e32 v2, s2, v2
8451; GFX1064_ITERATIVE-NEXT:    v_or_b32_e32 v1, s3, v1
8452; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
8453; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, -1
8454; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
8455; GFX1064_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
8456; GFX1064_ITERATIVE-NEXT:    s_endpgm
8457;
8458; GFX1032_ITERATIVE-LABEL: or_i64_varying:
8459; GFX1032_ITERATIVE:       ; %bb.0: ; %entry
8460; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
8461; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, exec_lo
8462; GFX1032_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
8463; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
8464; GFX1032_ITERATIVE-NEXT:  .LBB18_1: ; %ComputeLoop
8465; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
8466; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s3, s2
8467; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
8468; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
8469; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s8, 1, s3
8470; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s3
8471; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s3
8472; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s2, s2, s8
8473; GFX1032_ITERATIVE-NEXT:    s_or_b64 s[0:1], s[0:1], s[6:7]
8474; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
8475; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB18_1
8476; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
8477; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
8478; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
8479; GFX1032_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
8480; GFX1032_ITERATIVE-NEXT:    s_and_saveexec_b32 s2, vcc_lo
8481; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s2, exec_lo, s2
8482; GFX1032_ITERATIVE-NEXT:    s_cbranch_execz .LBB18_4
8483; GFX1032_ITERATIVE-NEXT:  ; %bb.3:
8484; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
8485; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
8486; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
8487; GFX1032_ITERATIVE-NEXT:    ds_or_rtn_b64 v[3:4], v0, v[3:4]
8488; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
8489; GFX1032_ITERATIVE-NEXT:    buffer_gl0_inv
8490; GFX1032_ITERATIVE-NEXT:  .LBB18_4:
8491; GFX1032_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
8492; GFX1032_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s2
8493; GFX1032_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
8494; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v4
8495; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v3
8496; GFX1032_ITERATIVE-NEXT:    v_or_b32_e32 v2, s2, v2
8497; GFX1032_ITERATIVE-NEXT:    v_or_b32_e32 v1, s3, v1
8498; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
8499; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, -1
8500; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
8501; GFX1032_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
8502; GFX1032_ITERATIVE-NEXT:    s_endpgm
8503;
8504; GFX1164_ITERATIVE-LABEL: or_i64_varying:
8505; GFX1164_ITERATIVE:       ; %bb.0: ; %entry
8506; GFX1164_ITERATIVE-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
8507; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
8508; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
8509; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
8510; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
8511; GFX1164_ITERATIVE-NEXT:  .LBB18_1: ; %ComputeLoop
8512; GFX1164_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
8513; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s10, s[2:3]
8514; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
8515; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s10
8516; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s10
8517; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s10
8518; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s10
8519; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s10
8520; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[8:9]
8521; GFX1164_ITERATIVE-NEXT:    s_or_b64 s[0:1], s[0:1], s[6:7]
8522; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
8523; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB18_1
8524; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
8525; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
8526; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8527; GFX1164_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
8528; GFX1164_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
8529; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr2_vgpr3
8530; GFX1164_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
8531; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
8532; GFX1164_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
8533; GFX1164_ITERATIVE-NEXT:    s_cbranch_execz .LBB18_4
8534; GFX1164_ITERATIVE-NEXT:  ; %bb.3:
8535; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s1
8536; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v4, 0
8537; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s0
8538; GFX1164_ITERATIVE-NEXT:    ds_or_rtn_b64 v[2:3], v4, v[2:3]
8539; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
8540; GFX1164_ITERATIVE-NEXT:    buffer_gl0_inv
8541; GFX1164_ITERATIVE-NEXT:  .LBB18_4:
8542; GFX1164_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
8543; GFX1164_ITERATIVE-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
8544; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
8545; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v2
8546; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
8547; GFX1164_ITERATIVE-NEXT:    v_or_b32_e32 v1, s2, v1
8548; GFX1164_ITERATIVE-NEXT:    v_or_b32_e32 v0, s3, v0
8549; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
8550; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
8551; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
8552; GFX1164_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
8553; GFX1164_ITERATIVE-NEXT:    s_endpgm
8554;
8555; GFX1132_ITERATIVE-LABEL: or_i64_varying:
8556; GFX1132_ITERATIVE:       ; %bb.0: ; %entry
8557; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0
8558; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, exec_lo
8559; GFX1132_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
8560; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
8561; GFX1132_ITERATIVE-NEXT:  .LBB18_1: ; %ComputeLoop
8562; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
8563; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s3, s2
8564; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
8565; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
8566; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
8567; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s8, 1, s3
8568; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s3
8569; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
8570; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s2, s2, s8
8571; GFX1132_ITERATIVE-NEXT:    s_or_b64 s[0:1], s[0:1], s[6:7]
8572; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
8573; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB18_1
8574; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
8575; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
8576; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
8577; GFX1132_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
8578; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr2_vgpr3
8579; GFX1132_ITERATIVE-NEXT:    s_and_saveexec_b32 s2, vcc_lo
8580; GFX1132_ITERATIVE-NEXT:    s_xor_b32 s2, exec_lo, s2
8581; GFX1132_ITERATIVE-NEXT:    s_cbranch_execz .LBB18_4
8582; GFX1132_ITERATIVE-NEXT:  ; %bb.3:
8583; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1
8584; GFX1132_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s0
8585; GFX1132_ITERATIVE-NEXT:    ds_or_rtn_b64 v[2:3], v4, v[2:3]
8586; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
8587; GFX1132_ITERATIVE-NEXT:    buffer_gl0_inv
8588; GFX1132_ITERATIVE-NEXT:  .LBB18_4:
8589; GFX1132_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s2
8590; GFX1132_ITERATIVE-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
8591; GFX1132_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
8592; GFX1132_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v2
8593; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
8594; GFX1132_ITERATIVE-NEXT:    v_or_b32_e32 v1, s2, v1
8595; GFX1132_ITERATIVE-NEXT:    v_or_b32_e32 v0, s3, v0
8596; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
8597; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
8598; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
8599; GFX1132_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
8600; GFX1132_ITERATIVE-NEXT:    s_endpgm
8601;
8602; GFX7LESS_DPP-LABEL: or_i64_varying:
8603; GFX7LESS_DPP:       ; %bb.0: ; %entry
8604; GFX7LESS_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
8605; GFX7LESS_DPP-NEXT:    v_mov_b32_e32 v1, 0
8606; GFX7LESS_DPP-NEXT:    s_mov_b32 m0, -1
8607; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
8608; GFX7LESS_DPP-NEXT:    ds_or_rtn_b64 v[0:1], v1, v[0:1]
8609; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
8610; GFX7LESS_DPP-NEXT:    s_mov_b32 s3, 0xf000
8611; GFX7LESS_DPP-NEXT:    s_mov_b32 s2, -1
8612; GFX7LESS_DPP-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
8613; GFX7LESS_DPP-NEXT:    s_endpgm
8614;
8615; GFX8_DPP-LABEL: or_i64_varying:
8616; GFX8_DPP:       ; %bb.0: ; %entry
8617; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v5, exec_lo, 0
8618; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, 0
8619; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v5, exec_hi, v5
8620; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
8621; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[0:1]
8622; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[0:1]
8623; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
8624; GFX8_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
8625; GFX8_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
8626; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, 0
8627; GFX8_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
8628; GFX8_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
8629; GFX8_DPP-NEXT:    s_nop 0
8630; GFX8_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
8631; GFX8_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
8632; GFX8_DPP-NEXT:    s_nop 0
8633; GFX8_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
8634; GFX8_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
8635; GFX8_DPP-NEXT:    s_nop 0
8636; GFX8_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
8637; GFX8_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
8638; GFX8_DPP-NEXT:    s_nop 0
8639; GFX8_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
8640; GFX8_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
8641; GFX8_DPP-NEXT:    v_readlane_b32 s3, v1, 63
8642; GFX8_DPP-NEXT:    v_readlane_b32 s2, v2, 63
8643; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
8644; GFX8_DPP-NEXT:    v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
8645; GFX8_DPP-NEXT:    s_mov_b64 exec, s[0:1]
8646; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
8647; GFX8_DPP-NEXT:    ; implicit-def: $vgpr5_vgpr6
8648; GFX8_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
8649; GFX8_DPP-NEXT:    s_cbranch_execz .LBB18_2
8650; GFX8_DPP-NEXT:  ; %bb.1:
8651; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, s3
8652; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, s2
8653; GFX8_DPP-NEXT:    s_mov_b32 m0, -1
8654; GFX8_DPP-NEXT:    ds_or_rtn_b64 v[5:6], v7, v[5:6]
8655; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
8656; GFX8_DPP-NEXT:  .LBB18_2:
8657; GFX8_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
8658; GFX8_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
8659; GFX8_DPP-NEXT:    v_readfirstlane_b32 s4, v6
8660; GFX8_DPP-NEXT:    v_readfirstlane_b32 s5, v5
8661; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v3
8662; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v4
8663; GFX8_DPP-NEXT:    s_mov_b32 s3, 0xf000
8664; GFX8_DPP-NEXT:    s_mov_b32 s2, -1
8665; GFX8_DPP-NEXT:    v_or_b32_e32 v6, s4, v6
8666; GFX8_DPP-NEXT:    v_or_b32_e32 v5, s5, v5
8667; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
8668; GFX8_DPP-NEXT:    buffer_store_dwordx2 v[5:6], off, s[0:3], 0
8669; GFX8_DPP-NEXT:    s_endpgm
8670;
8671; GFX9_DPP-LABEL: or_i64_varying:
8672; GFX9_DPP:       ; %bb.0: ; %entry
8673; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v5, exec_lo, 0
8674; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, 0
8675; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v5, exec_hi, v5
8676; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
8677; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[0:1]
8678; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[0:1]
8679; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
8680; GFX9_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
8681; GFX9_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
8682; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
8683; GFX9_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
8684; GFX9_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
8685; GFX9_DPP-NEXT:    s_nop 0
8686; GFX9_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
8687; GFX9_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
8688; GFX9_DPP-NEXT:    s_nop 0
8689; GFX9_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
8690; GFX9_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
8691; GFX9_DPP-NEXT:    s_nop 0
8692; GFX9_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
8693; GFX9_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
8694; GFX9_DPP-NEXT:    s_nop 0
8695; GFX9_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
8696; GFX9_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
8697; GFX9_DPP-NEXT:    v_readlane_b32 s3, v1, 63
8698; GFX9_DPP-NEXT:    v_readlane_b32 s2, v2, 63
8699; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
8700; GFX9_DPP-NEXT:    v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
8701; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
8702; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
8703; GFX9_DPP-NEXT:    ; implicit-def: $vgpr5_vgpr6
8704; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
8705; GFX9_DPP-NEXT:    s_cbranch_execz .LBB18_2
8706; GFX9_DPP-NEXT:  ; %bb.1:
8707; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, s3
8708; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, s2
8709; GFX9_DPP-NEXT:    ds_or_rtn_b64 v[5:6], v7, v[5:6]
8710; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
8711; GFX9_DPP-NEXT:  .LBB18_2:
8712; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
8713; GFX9_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
8714; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v6
8715; GFX9_DPP-NEXT:    v_readfirstlane_b32 s5, v5
8716; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v3
8717; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v4
8718; GFX9_DPP-NEXT:    s_mov_b32 s3, 0xf000
8719; GFX9_DPP-NEXT:    s_mov_b32 s2, -1
8720; GFX9_DPP-NEXT:    v_or_b32_e32 v6, s4, v6
8721; GFX9_DPP-NEXT:    v_or_b32_e32 v5, s5, v5
8722; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
8723; GFX9_DPP-NEXT:    buffer_store_dwordx2 v[5:6], off, s[0:3], 0
8724; GFX9_DPP-NEXT:    s_endpgm
8725;
8726; GFX1064_DPP-LABEL: or_i64_varying:
8727; GFX1064_DPP:       ; %bb.0: ; %entry
8728; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
8729; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[0:1]
8730; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[0:1]
8731; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, 0
8732; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, 0
8733; GFX1064_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
8734; GFX1064_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
8735; GFX1064_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
8736; GFX1064_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
8737; GFX1064_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
8738; GFX1064_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
8739; GFX1064_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
8740; GFX1064_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
8741; GFX1064_DPP-NEXT:    v_permlanex16_b32 v3, v1, -1, -1
8742; GFX1064_DPP-NEXT:    v_permlanex16_b32 v4, v2, -1, -1
8743; GFX1064_DPP-NEXT:    v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
8744; GFX1064_DPP-NEXT:    v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
8745; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 31
8746; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v2, 31
8747; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, s2
8748; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, s3
8749; GFX1064_DPP-NEXT:    v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
8750; GFX1064_DPP-NEXT:    v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
8751; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
8752; GFX1064_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
8753; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
8754; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
8755; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 15
8756; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
8757; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v2, 15
8758; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v1, 31
8759; GFX1064_DPP-NEXT:    v_readlane_b32 s7, v2, 31
8760; GFX1064_DPP-NEXT:    v_writelane_b32 v6, s2, 16
8761; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v2, 63
8762; GFX1064_DPP-NEXT:    v_writelane_b32 v5, s3, 16
8763; GFX1064_DPP-NEXT:    v_readlane_b32 s8, v1, 47
8764; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v1, 63
8765; GFX1064_DPP-NEXT:    v_readlane_b32 s9, v2, 47
8766; GFX1064_DPP-NEXT:    v_writelane_b32 v6, s6, 32
8767; GFX1064_DPP-NEXT:    v_writelane_b32 v5, s7, 32
8768; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
8769; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v0
8770; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, 0
8771; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
8772; GFX1064_DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
8773; GFX1064_DPP-NEXT:    v_writelane_b32 v6, s8, 48
8774; GFX1064_DPP-NEXT:    v_writelane_b32 v5, s9, 48
8775; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[6:7]
8776; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
8777; GFX1064_DPP-NEXT:    s_mov_b32 s2, -1
8778; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
8779; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[6:7], vcc
8780; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB18_2
8781; GFX1064_DPP-NEXT:  ; %bb.1:
8782; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, s1
8783; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, s0
8784; GFX1064_DPP-NEXT:    ds_or_rtn_b64 v[7:8], v0, v[7:8]
8785; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
8786; GFX1064_DPP-NEXT:    buffer_gl0_inv
8787; GFX1064_DPP-NEXT:  .LBB18_2:
8788; GFX1064_DPP-NEXT:    s_waitcnt_depctr 0xffe3
8789; GFX1064_DPP-NEXT:    s_or_b64 exec, exec, s[6:7]
8790; GFX1064_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
8791; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s3, v8
8792; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, v5
8793; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, v6
8794; GFX1064_DPP-NEXT:    s_mov_b32 null, 0
8795; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s4, v7
8796; GFX1064_DPP-NEXT:    v_or_b32_e32 v9, s3, v9
8797; GFX1064_DPP-NEXT:    v_or_b32_e32 v8, s4, v8
8798; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
8799; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
8800; GFX1064_DPP-NEXT:    buffer_store_dwordx2 v[8:9], off, s[0:3], 0
8801; GFX1064_DPP-NEXT:    s_endpgm
8802;
8803; GFX1032_DPP-LABEL: or_i64_varying:
8804; GFX1032_DPP:       ; %bb.0: ; %entry
8805; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s2, -1
8806; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s2
8807; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s2
8808; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, 0
8809; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, 0
8810; GFX1032_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
8811; GFX1032_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
8812; GFX1032_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
8813; GFX1032_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
8814; GFX1032_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
8815; GFX1032_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
8816; GFX1032_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
8817; GFX1032_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
8818; GFX1032_DPP-NEXT:    v_permlanex16_b32 v3, v1, -1, -1
8819; GFX1032_DPP-NEXT:    v_permlanex16_b32 v4, v2, -1, -1
8820; GFX1032_DPP-NEXT:    v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
8821; GFX1032_DPP-NEXT:    v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
8822; GFX1032_DPP-NEXT:    v_readlane_b32 s3, v1, 15
8823; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v1, 31
8824; GFX1032_DPP-NEXT:    v_readlane_b32 s0, v2, 31
8825; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
8826; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
8827; GFX1032_DPP-NEXT:    v_readlane_b32 s6, v2, 15
8828; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s2
8829; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
8830; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, 0
8831; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s2, -1
8832; GFX1032_DPP-NEXT:    v_writelane_b32 v6, s3, 16
8833; GFX1032_DPP-NEXT:    v_writelane_b32 v5, s6, 16
8834; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s2
8835; GFX1032_DPP-NEXT:    s_mov_b32 s2, -1
8836; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
8837; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
8838; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s3, vcc_lo
8839; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB18_2
8840; GFX1032_DPP-NEXT:  ; %bb.1:
8841; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, s1
8842; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, s0
8843; GFX1032_DPP-NEXT:    ds_or_rtn_b64 v[7:8], v0, v[7:8]
8844; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
8845; GFX1032_DPP-NEXT:    buffer_gl0_inv
8846; GFX1032_DPP-NEXT:  .LBB18_2:
8847; GFX1032_DPP-NEXT:    s_waitcnt_depctr 0xffe3
8848; GFX1032_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s3
8849; GFX1032_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
8850; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s3, v8
8851; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, v5
8852; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, v6
8853; GFX1032_DPP-NEXT:    s_mov_b32 null, 0
8854; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s4, v7
8855; GFX1032_DPP-NEXT:    v_or_b32_e32 v9, s3, v9
8856; GFX1032_DPP-NEXT:    v_or_b32_e32 v8, s4, v8
8857; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
8858; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
8859; GFX1032_DPP-NEXT:    buffer_store_dwordx2 v[8:9], off, s[0:3], 0
8860; GFX1032_DPP-NEXT:    s_endpgm
8861;
8862; GFX1164_DPP-LABEL: or_i64_varying:
8863; GFX1164_DPP:       ; %bb.0: ; %entry
8864; GFX1164_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
8865; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
8866; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
8867; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[0:1]
8868; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[0:1]
8869; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, 0
8870; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, 0
8871; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
8872; GFX1164_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
8873; GFX1164_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
8874; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
8875; GFX1164_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
8876; GFX1164_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
8877; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
8878; GFX1164_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
8879; GFX1164_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
8880; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
8881; GFX1164_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
8882; GFX1164_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
8883; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
8884; GFX1164_DPP-NEXT:    v_permlanex16_b32 v3, v1, -1, -1
8885; GFX1164_DPP-NEXT:    v_permlanex16_b32 v4, v2, -1, -1
8886; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
8887; GFX1164_DPP-NEXT:    v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
8888; GFX1164_DPP-NEXT:    v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
8889; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
8890; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v1, 31
8891; GFX1164_DPP-NEXT:    v_readlane_b32 s3, v2, 31
8892; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
8893; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, s2
8894; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, s3
8895; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
8896; GFX1164_DPP-NEXT:    v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
8897; GFX1164_DPP-NEXT:    v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
8898; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
8899; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v1, 15
8900; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
8901; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
8902; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
8903; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
8904; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
8905; GFX1164_DPP-NEXT:    v_readlane_b32 s3, v2, 15
8906; GFX1164_DPP-NEXT:    v_readlane_b32 s6, v1, 31
8907; GFX1164_DPP-NEXT:    v_writelane_b32 v6, s2, 16
8908; GFX1164_DPP-NEXT:    v_readlane_b32 s7, v2, 31
8909; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v2, 63
8910; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s3, 16
8911; GFX1164_DPP-NEXT:    v_readlane_b32 s8, v1, 47
8912; GFX1164_DPP-NEXT:    v_readlane_b32 s3, v1, 63
8913; GFX1164_DPP-NEXT:    v_writelane_b32 v6, s6, 32
8914; GFX1164_DPP-NEXT:    v_readlane_b32 s9, v2, 47
8915; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s7, 32
8916; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
8917; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
8918; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v0
8919; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, 0
8920; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
8921; GFX1164_DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
8922; GFX1164_DPP-NEXT:    v_writelane_b32 v6, s8, 48
8923; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s9, 48
8924; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[6:7]
8925; GFX1164_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
8926; GFX1164_DPP-NEXT:    s_mov_b32 s2, -1
8927; GFX1164_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
8928; GFX1164_DPP-NEXT:    s_and_saveexec_b64 s[6:7], vcc
8929; GFX1164_DPP-NEXT:    s_cbranch_execz .LBB18_2
8930; GFX1164_DPP-NEXT:  ; %bb.1:
8931; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, s1
8932; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, s0
8933; GFX1164_DPP-NEXT:    ds_or_rtn_b64 v[7:8], v0, v[7:8]
8934; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
8935; GFX1164_DPP-NEXT:    buffer_gl0_inv
8936; GFX1164_DPP-NEXT:  .LBB18_2:
8937; GFX1164_DPP-NEXT:    s_or_b64 exec, exec, s[6:7]
8938; GFX1164_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
8939; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s3, v8
8940; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, v5
8941; GFX1164_DPP-NEXT:    v_mov_b32_e32 v9, v6
8942; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s4, v7
8943; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
8944; GFX1164_DPP-NEXT:    v_or_b32_e32 v9, s3, v9
8945; GFX1164_DPP-NEXT:    v_or_b32_e32 v8, s4, v8
8946; GFX1164_DPP-NEXT:    s_mov_b32 s3, 0x31016000
8947; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
8948; GFX1164_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[0:3], 0
8949; GFX1164_DPP-NEXT:    s_endpgm
8950;
8951; GFX1132_DPP-LABEL: or_i64_varying:
8952; GFX1132_DPP:       ; %bb.0: ; %entry
8953; GFX1132_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
8954; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s2, -1
8955; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
8956; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s2
8957; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s2
8958; GFX1132_DPP-NEXT:    v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v5, 0
8959; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
8960; GFX1132_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
8961; GFX1132_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
8962; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
8963; GFX1132_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
8964; GFX1132_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
8965; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
8966; GFX1132_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
8967; GFX1132_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
8968; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
8969; GFX1132_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
8970; GFX1132_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
8971; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
8972; GFX1132_DPP-NEXT:    v_permlanex16_b32 v3, v1, -1, -1
8973; GFX1132_DPP-NEXT:    v_permlanex16_b32 v4, v2, -1, -1
8974; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
8975; GFX1132_DPP-NEXT:    v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
8976; GFX1132_DPP-NEXT:    v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
8977; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
8978; GFX1132_DPP-NEXT:    v_readlane_b32 s3, v1, 15
8979; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v1, 31
8980; GFX1132_DPP-NEXT:    v_readlane_b32 s0, v2, 31
8981; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
8982; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
8983; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v2, 15
8984; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
8985; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
8986; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
8987; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
8988; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s2, -1
8989; GFX1132_DPP-NEXT:    v_writelane_b32 v6, s3, 16
8990; GFX1132_DPP-NEXT:    v_writelane_b32 v5, s6, 16
8991; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
8992; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
8993; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
8994; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
8995; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s3, vcc_lo
8996; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB18_2
8997; GFX1132_DPP-NEXT:  ; %bb.1:
8998; GFX1132_DPP-NEXT:    v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0
8999; GFX1132_DPP-NEXT:    ds_or_rtn_b64 v[7:8], v0, v[7:8]
9000; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
9001; GFX1132_DPP-NEXT:    buffer_gl0_inv
9002; GFX1132_DPP-NEXT:  .LBB18_2:
9003; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s3
9004; GFX1132_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
9005; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s3, v8
9006; GFX1132_DPP-NEXT:    v_mov_b32_e32 v8, v5
9007; GFX1132_DPP-NEXT:    v_mov_b32_e32 v9, v6
9008; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s4, v7
9009; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
9010; GFX1132_DPP-NEXT:    v_or_b32_e32 v9, s3, v9
9011; GFX1132_DPP-NEXT:    v_or_b32_e32 v8, s4, v8
9012; GFX1132_DPP-NEXT:    s_mov_b32 s3, 0x31016000
9013; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
9014; GFX1132_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[0:3], 0
9015; GFX1132_DPP-NEXT:    s_endpgm
9016entry:
9017  %lane = call i32 @llvm.amdgcn.workitem.id.x()
9018  %lane_ext = zext i32 %lane to i64
9019  %old = atomicrmw or ptr addrspace(3) @local_var32, i64 %lane_ext acq_rel
9020  store i64 %old, ptr addrspace(1) %out
9021  ret void
9022}
9023
9024define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
9025; GFX7LESS_ITERATIVE-LABEL: xor_i32_varying:
9026; GFX7LESS_ITERATIVE:       ; %bb.0: ; %entry
9027; GFX7LESS_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
9028; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s2, 0
9029; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
9030; GFX7LESS_ITERATIVE-NEXT:  .LBB19_1: ; %ComputeLoop
9031; GFX7LESS_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
9032; GFX7LESS_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
9033; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, s3
9034; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
9035; GFX7LESS_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
9036; GFX7LESS_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
9037; GFX7LESS_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
9038; GFX7LESS_ITERATIVE-NEXT:    v_cmp_ne_u64_e64 s[6:7], s[0:1], 0
9039; GFX7LESS_ITERATIVE-NEXT:    s_and_b64 vcc, exec, s[6:7]
9040; GFX7LESS_ITERATIVE-NEXT:    s_xor_b32 s2, s2, s8
9041; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_vccnz .LBB19_1
9042; GFX7LESS_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
9043; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
9044; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
9045; GFX7LESS_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
9046; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
9047; GFX7LESS_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
9048; GFX7LESS_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
9049; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_execz .LBB19_4
9050; GFX7LESS_ITERATIVE-NEXT:  ; %bb.3:
9051; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
9052; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
9053; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, -1
9054; GFX7LESS_ITERATIVE-NEXT:    ds_xor_rtn_b32 v0, v0, v2
9055; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
9056; GFX7LESS_ITERATIVE-NEXT:  .LBB19_4:
9057; GFX7LESS_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
9058; GFX7LESS_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
9059; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
9060; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s2, -1
9061; GFX7LESS_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v0
9062; GFX7LESS_ITERATIVE-NEXT:    v_xor_b32_e32 v0, s4, v1
9063; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
9064; GFX7LESS_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
9065; GFX7LESS_ITERATIVE-NEXT:    s_endpgm
9066;
9067; GFX8_ITERATIVE-LABEL: xor_i32_varying:
9068; GFX8_ITERATIVE:       ; %bb.0: ; %entry
9069; GFX8_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
9070; GFX8_ITERATIVE-NEXT:    s_mov_b32 s2, 0
9071; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
9072; GFX8_ITERATIVE-NEXT:  .LBB19_1: ; %ComputeLoop
9073; GFX8_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
9074; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
9075; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s3
9076; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
9077; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
9078; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
9079; GFX8_ITERATIVE-NEXT:    s_xor_b32 s2, s2, s8
9080; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
9081; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
9082; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB19_1
9083; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
9084; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
9085; GFX8_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
9086; GFX8_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
9087; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
9088; GFX8_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
9089; GFX8_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
9090; GFX8_ITERATIVE-NEXT:    s_cbranch_execz .LBB19_4
9091; GFX8_ITERATIVE-NEXT:  ; %bb.3:
9092; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
9093; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
9094; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, -1
9095; GFX8_ITERATIVE-NEXT:    ds_xor_rtn_b32 v0, v0, v2
9096; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
9097; GFX8_ITERATIVE-NEXT:  .LBB19_4:
9098; GFX8_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
9099; GFX8_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
9100; GFX8_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v0
9101; GFX8_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
9102; GFX8_ITERATIVE-NEXT:    s_mov_b32 s2, -1
9103; GFX8_ITERATIVE-NEXT:    v_xor_b32_e32 v0, s4, v1
9104; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
9105; GFX8_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
9106; GFX8_ITERATIVE-NEXT:    s_endpgm
9107;
9108; GFX9_ITERATIVE-LABEL: xor_i32_varying:
9109; GFX9_ITERATIVE:       ; %bb.0: ; %entry
9110; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
9111; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, 0
9112; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
9113; GFX9_ITERATIVE-NEXT:  .LBB19_1: ; %ComputeLoop
9114; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
9115; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
9116; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s3
9117; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
9118; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
9119; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
9120; GFX9_ITERATIVE-NEXT:    s_xor_b32 s2, s2, s8
9121; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
9122; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
9123; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB19_1
9124; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
9125; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
9126; GFX9_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
9127; GFX9_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
9128; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
9129; GFX9_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
9130; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
9131; GFX9_ITERATIVE-NEXT:    s_cbranch_execz .LBB19_4
9132; GFX9_ITERATIVE-NEXT:  ; %bb.3:
9133; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
9134; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
9135; GFX9_ITERATIVE-NEXT:    ds_xor_rtn_b32 v0, v0, v2
9136; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
9137; GFX9_ITERATIVE-NEXT:  .LBB19_4:
9138; GFX9_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
9139; GFX9_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
9140; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v0
9141; GFX9_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
9142; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, -1
9143; GFX9_ITERATIVE-NEXT:    v_xor_b32_e32 v0, s4, v1
9144; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
9145; GFX9_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
9146; GFX9_ITERATIVE-NEXT:    s_endpgm
9147;
9148; GFX1064_ITERATIVE-LABEL: xor_i32_varying:
9149; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
9150; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
9151; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, 0
9152; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
9153; GFX1064_ITERATIVE-NEXT:  .LBB19_1: ; %ComputeLoop
9154; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
9155; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
9156; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
9157; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
9158; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, s3
9159; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
9160; GFX1064_ITERATIVE-NEXT:    s_xor_b32 s2, s2, s8
9161; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
9162; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB19_1
9163; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
9164; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
9165; GFX1064_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
9166; GFX1064_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
9167; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
9168; GFX1064_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
9169; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
9170; GFX1064_ITERATIVE-NEXT:    s_cbranch_execz .LBB19_4
9171; GFX1064_ITERATIVE-NEXT:  ; %bb.3:
9172; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
9173; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
9174; GFX1064_ITERATIVE-NEXT:    ds_xor_rtn_b32 v0, v0, v2
9175; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
9176; GFX1064_ITERATIVE-NEXT:    buffer_gl0_inv
9177; GFX1064_ITERATIVE-NEXT:  .LBB19_4:
9178; GFX1064_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
9179; GFX1064_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
9180; GFX1064_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
9181; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v0
9182; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
9183; GFX1064_ITERATIVE-NEXT:    v_xor_b32_e32 v0, s2, v1
9184; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, -1
9185; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
9186; GFX1064_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
9187; GFX1064_ITERATIVE-NEXT:    s_endpgm
9188;
9189; GFX1032_ITERATIVE-LABEL: xor_i32_varying:
9190; GFX1032_ITERATIVE:       ; %bb.0: ; %entry
9191; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s1, exec_lo
9192; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s0, 0
9193; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
9194; GFX1032_ITERATIVE-NEXT:  .LBB19_1: ; %ComputeLoop
9195; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
9196; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s2, s1
9197; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
9198; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
9199; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s2
9200; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s6
9201; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s0, s0, s3
9202; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
9203; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB19_1
9204; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
9205; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
9206; GFX1032_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
9207; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
9208; GFX1032_ITERATIVE-NEXT:    s_and_saveexec_b32 s1, vcc_lo
9209; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s1, exec_lo, s1
9210; GFX1032_ITERATIVE-NEXT:    s_cbranch_execz .LBB19_4
9211; GFX1032_ITERATIVE-NEXT:  ; %bb.3:
9212; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
9213; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s0
9214; GFX1032_ITERATIVE-NEXT:    ds_xor_rtn_b32 v0, v0, v2
9215; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
9216; GFX1032_ITERATIVE-NEXT:    buffer_gl0_inv
9217; GFX1032_ITERATIVE-NEXT:  .LBB19_4:
9218; GFX1032_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
9219; GFX1032_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s1
9220; GFX1032_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
9221; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v0
9222; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
9223; GFX1032_ITERATIVE-NEXT:    v_xor_b32_e32 v0, s2, v1
9224; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, -1
9225; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
9226; GFX1032_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
9227; GFX1032_ITERATIVE-NEXT:    s_endpgm
9228;
9229; GFX1164_ITERATIVE-LABEL: xor_i32_varying:
9230; GFX1164_ITERATIVE:       ; %bb.0: ; %entry
9231; GFX1164_ITERATIVE-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
9232; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
9233; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, 0
9234; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
9235; GFX1164_ITERATIVE-NEXT:  .LBB19_1: ; %ComputeLoop
9236; GFX1164_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
9237; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s3, s[0:1]
9238; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
9239; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s8, v1, s3
9240; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
9241; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s2, s3
9242; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
9243; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2)
9244; GFX1164_ITERATIVE-NEXT:    s_xor_b32 s2, s2, s8
9245; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
9246; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB19_1
9247; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
9248; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
9249; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9250; GFX1164_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
9251; GFX1164_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
9252; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
9253; GFX1164_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
9254; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
9255; GFX1164_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
9256; GFX1164_ITERATIVE-NEXT:    s_cbranch_execz .LBB19_4
9257; GFX1164_ITERATIVE-NEXT:  ; %bb.3:
9258; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v1, 0
9259; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
9260; GFX1164_ITERATIVE-NEXT:    ds_xor_rtn_b32 v1, v1, v2
9261; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
9262; GFX1164_ITERATIVE-NEXT:    buffer_gl0_inv
9263; GFX1164_ITERATIVE-NEXT:  .LBB19_4:
9264; GFX1164_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
9265; GFX1164_ITERATIVE-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
9266; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v1
9267; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
9268; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
9269; GFX1164_ITERATIVE-NEXT:    v_xor_b32_e32 v0, s2, v0
9270; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
9271; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
9272; GFX1164_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
9273; GFX1164_ITERATIVE-NEXT:    s_endpgm
9274;
9275; GFX1132_ITERATIVE-LABEL: xor_i32_varying:
9276; GFX1132_ITERATIVE:       ; %bb.0: ; %entry
9277; GFX1132_ITERATIVE-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
9278; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s1, exec_lo
9279; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s0, 0
9280; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
9281; GFX1132_ITERATIVE-NEXT:  .LBB19_1: ; %ComputeLoop
9282; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
9283; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s2, s1
9284; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
9285; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s3, v1, s2
9286; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
9287; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s2
9288; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s6
9289; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2)
9290; GFX1132_ITERATIVE-NEXT:    s_xor_b32 s0, s0, s3
9291; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
9292; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB19_1
9293; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
9294; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
9295; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
9296; GFX1132_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
9297; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
9298; GFX1132_ITERATIVE-NEXT:    s_and_saveexec_b32 s1, vcc_lo
9299; GFX1132_ITERATIVE-NEXT:    s_xor_b32 s1, exec_lo, s1
9300; GFX1132_ITERATIVE-NEXT:    s_cbranch_execz .LBB19_4
9301; GFX1132_ITERATIVE-NEXT:  ; %bb.3:
9302; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
9303; GFX1132_ITERATIVE-NEXT:    ds_xor_rtn_b32 v1, v1, v2
9304; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
9305; GFX1132_ITERATIVE-NEXT:    buffer_gl0_inv
9306; GFX1132_ITERATIVE-NEXT:  .LBB19_4:
9307; GFX1132_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s1
9308; GFX1132_ITERATIVE-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
9309; GFX1132_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v1
9310; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
9311; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
9312; GFX1132_ITERATIVE-NEXT:    v_xor_b32_e32 v0, s2, v0
9313; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
9314; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
9315; GFX1132_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
9316; GFX1132_ITERATIVE-NEXT:    s_endpgm
9317;
9318; GFX7LESS_DPP-LABEL: xor_i32_varying:
9319; GFX7LESS_DPP:       ; %bb.0: ; %entry
9320; GFX7LESS_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
9321; GFX7LESS_DPP-NEXT:    v_mov_b32_e32 v1, 0
9322; GFX7LESS_DPP-NEXT:    s_mov_b32 m0, -1
9323; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
9324; GFX7LESS_DPP-NEXT:    ds_xor_rtn_b32 v0, v1, v0
9325; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
9326; GFX7LESS_DPP-NEXT:    s_mov_b32 s3, 0xf000
9327; GFX7LESS_DPP-NEXT:    s_mov_b32 s2, -1
9328; GFX7LESS_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
9329; GFX7LESS_DPP-NEXT:    s_endpgm
9330;
9331; GFX8_DPP-LABEL: xor_i32_varying:
9332; GFX8_DPP:       ; %bb.0: ; %entry
9333; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, 0
9334; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
9335; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
9336; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
9337; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
9338; GFX8_DPP-NEXT:    v_mov_b32_e32 v2, 0
9339; GFX8_DPP-NEXT:    s_nop 0
9340; GFX8_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
9341; GFX8_DPP-NEXT:    s_nop 1
9342; GFX8_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
9343; GFX8_DPP-NEXT:    s_nop 1
9344; GFX8_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
9345; GFX8_DPP-NEXT:    s_nop 1
9346; GFX8_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
9347; GFX8_DPP-NEXT:    s_nop 1
9348; GFX8_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
9349; GFX8_DPP-NEXT:    s_nop 1
9350; GFX8_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
9351; GFX8_DPP-NEXT:    v_readlane_b32 s2, v1, 63
9352; GFX8_DPP-NEXT:    s_nop 0
9353; GFX8_DPP-NEXT:    v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
9354; GFX8_DPP-NEXT:    s_mov_b64 exec, s[0:1]
9355; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
9356; GFX8_DPP-NEXT:    ; implicit-def: $vgpr0
9357; GFX8_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
9358; GFX8_DPP-NEXT:    s_cbranch_execz .LBB19_2
9359; GFX8_DPP-NEXT:  ; %bb.1:
9360; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, s2
9361; GFX8_DPP-NEXT:    s_mov_b32 m0, -1
9362; GFX8_DPP-NEXT:    ds_xor_rtn_b32 v0, v3, v0
9363; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
9364; GFX8_DPP-NEXT:  .LBB19_2:
9365; GFX8_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
9366; GFX8_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
9367; GFX8_DPP-NEXT:    v_readfirstlane_b32 s4, v0
9368; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, v2
9369; GFX8_DPP-NEXT:    s_mov_b32 s3, 0xf000
9370; GFX8_DPP-NEXT:    s_mov_b32 s2, -1
9371; GFX8_DPP-NEXT:    v_xor_b32_e32 v0, s4, v0
9372; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
9373; GFX8_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
9374; GFX8_DPP-NEXT:    s_endpgm
9375;
9376; GFX9_DPP-LABEL: xor_i32_varying:
9377; GFX9_DPP:       ; %bb.0: ; %entry
9378; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
9379; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
9380; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
9381; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
9382; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
9383; GFX9_DPP-NEXT:    v_mov_b32_e32 v2, 0
9384; GFX9_DPP-NEXT:    s_nop 0
9385; GFX9_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
9386; GFX9_DPP-NEXT:    s_nop 1
9387; GFX9_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
9388; GFX9_DPP-NEXT:    s_nop 1
9389; GFX9_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
9390; GFX9_DPP-NEXT:    s_nop 1
9391; GFX9_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
9392; GFX9_DPP-NEXT:    s_nop 1
9393; GFX9_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
9394; GFX9_DPP-NEXT:    s_nop 1
9395; GFX9_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
9396; GFX9_DPP-NEXT:    v_readlane_b32 s2, v1, 63
9397; GFX9_DPP-NEXT:    s_nop 0
9398; GFX9_DPP-NEXT:    v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
9399; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
9400; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
9401; GFX9_DPP-NEXT:    ; implicit-def: $vgpr0
9402; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
9403; GFX9_DPP-NEXT:    s_cbranch_execz .LBB19_2
9404; GFX9_DPP-NEXT:  ; %bb.1:
9405; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s2
9406; GFX9_DPP-NEXT:    ds_xor_rtn_b32 v0, v3, v0
9407; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
9408; GFX9_DPP-NEXT:  .LBB19_2:
9409; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
9410; GFX9_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
9411; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v0
9412; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, v2
9413; GFX9_DPP-NEXT:    s_mov_b32 s3, 0xf000
9414; GFX9_DPP-NEXT:    s_mov_b32 s2, -1
9415; GFX9_DPP-NEXT:    v_xor_b32_e32 v0, s4, v0
9416; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
9417; GFX9_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
9418; GFX9_DPP-NEXT:    s_endpgm
9419;
9420; GFX1064_DPP-LABEL: xor_i32_varying:
9421; GFX1064_DPP:       ; %bb.0: ; %entry
9422; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
9423; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
9424; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, 0
9425; GFX1064_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
9426; GFX1064_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
9427; GFX1064_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
9428; GFX1064_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
9429; GFX1064_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
9430; GFX1064_DPP-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
9431; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 31
9432; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, s2
9433; GFX1064_DPP-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
9434; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
9435; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 15
9436; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v1, 31
9437; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s2, 16
9438; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
9439; GFX1064_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
9440; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
9441; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 47
9442; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v1, 63
9443; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s3, 32
9444; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
9445; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
9446; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, 0
9447; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
9448; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s2, 48
9449; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
9450; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
9451; GFX1064_DPP-NEXT:    s_mov_b32 s2, -1
9452; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr0
9453; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
9454; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB19_2
9455; GFX1064_DPP-NEXT:  ; %bb.1:
9456; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, s6
9457; GFX1064_DPP-NEXT:    s_mov_b32 s3, s6
9458; GFX1064_DPP-NEXT:    ds_xor_rtn_b32 v0, v4, v0
9459; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
9460; GFX1064_DPP-NEXT:    buffer_gl0_inv
9461; GFX1064_DPP-NEXT:  .LBB19_2:
9462; GFX1064_DPP-NEXT:    s_waitcnt_depctr 0xffe3
9463; GFX1064_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
9464; GFX1064_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
9465; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s3, v0
9466; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, v3
9467; GFX1064_DPP-NEXT:    v_xor_b32_e32 v0, s3, v0
9468; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
9469; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
9470; GFX1064_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
9471; GFX1064_DPP-NEXT:    s_endpgm
9472;
9473; GFX1032_DPP-LABEL: xor_i32_varying:
9474; GFX1032_DPP:       ; %bb.0: ; %entry
9475; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
9476; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
9477; GFX1032_DPP-NEXT:    v_mov_b32_e32 v3, 0
9478; GFX1032_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
9479; GFX1032_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
9480; GFX1032_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
9481; GFX1032_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
9482; GFX1032_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
9483; GFX1032_DPP-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
9484; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v1, 15
9485; GFX1032_DPP-NEXT:    v_readlane_b32 s2, v1, 31
9486; GFX1032_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
9487; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
9488; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
9489; GFX1032_DPP-NEXT:    v_mov_b32_e32 v4, 0
9490; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
9491; GFX1032_DPP-NEXT:    v_writelane_b32 v3, s1, 16
9492; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
9493; GFX1032_DPP-NEXT:    s_mov_b32 s0, s2
9494; GFX1032_DPP-NEXT:    s_mov_b32 s2, -1
9495; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
9496; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr0
9497; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
9498; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB19_2
9499; GFX1032_DPP-NEXT:  ; %bb.1:
9500; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, s0
9501; GFX1032_DPP-NEXT:    ds_xor_rtn_b32 v0, v4, v0
9502; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
9503; GFX1032_DPP-NEXT:    buffer_gl0_inv
9504; GFX1032_DPP-NEXT:  .LBB19_2:
9505; GFX1032_DPP-NEXT:    s_waitcnt_depctr 0xffe3
9506; GFX1032_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s1
9507; GFX1032_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
9508; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s3, v0
9509; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, v3
9510; GFX1032_DPP-NEXT:    v_xor_b32_e32 v0, s3, v0
9511; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
9512; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
9513; GFX1032_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
9514; GFX1032_DPP-NEXT:    s_endpgm
9515;
9516; GFX1164_DPP-LABEL: xor_i32_varying:
9517; GFX1164_DPP:       ; %bb.0: ; %entry
9518; GFX1164_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
9519; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
9520; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
9521; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
9522; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, 0
9523; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
9524; GFX1164_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
9525; GFX1164_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
9526; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9527; GFX1164_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
9528; GFX1164_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
9529; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9530; GFX1164_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
9531; GFX1164_DPP-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
9532; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9533; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v1, 31
9534; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, s2
9535; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9536; GFX1164_DPP-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
9537; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
9538; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v1, 15
9539; GFX1164_DPP-NEXT:    v_readlane_b32 s3, v1, 31
9540; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
9541; GFX1164_DPP-NEXT:    v_writelane_b32 v3, s2, 16
9542; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
9543; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
9544; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
9545; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v1, 47
9546; GFX1164_DPP-NEXT:    v_readlane_b32 s6, v1, 63
9547; GFX1164_DPP-NEXT:    v_writelane_b32 v3, s3, 32
9548; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
9549; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
9550; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
9551; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, 0
9552; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
9553; GFX1164_DPP-NEXT:    v_writelane_b32 v3, s2, 48
9554; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
9555; GFX1164_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
9556; GFX1164_DPP-NEXT:    s_mov_b32 s2, -1
9557; GFX1164_DPP-NEXT:    ; implicit-def: $vgpr0
9558; GFX1164_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
9559; GFX1164_DPP-NEXT:    s_cbranch_execz .LBB19_2
9560; GFX1164_DPP-NEXT:  ; %bb.1:
9561; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, s6
9562; GFX1164_DPP-NEXT:    s_mov_b32 s3, s6
9563; GFX1164_DPP-NEXT:    ds_xor_rtn_b32 v0, v4, v0
9564; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
9565; GFX1164_DPP-NEXT:    buffer_gl0_inv
9566; GFX1164_DPP-NEXT:  .LBB19_2:
9567; GFX1164_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
9568; GFX1164_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
9569; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s3, v0
9570; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, v3
9571; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
9572; GFX1164_DPP-NEXT:    v_xor_b32_e32 v0, s3, v0
9573; GFX1164_DPP-NEXT:    s_mov_b32 s3, 0x31016000
9574; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
9575; GFX1164_DPP-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
9576; GFX1164_DPP-NEXT:    s_endpgm
9577;
9578; GFX1132_DPP-LABEL: xor_i32_varying:
9579; GFX1132_DPP:       ; %bb.0: ; %entry
9580; GFX1132_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
9581; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
9582; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
9583; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
9584; GFX1132_DPP-NEXT:    v_mov_b32_e32 v3, 0
9585; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
9586; GFX1132_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
9587; GFX1132_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
9588; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9589; GFX1132_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
9590; GFX1132_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
9591; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9592; GFX1132_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
9593; GFX1132_DPP-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
9594; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
9595; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v1, 15
9596; GFX1132_DPP-NEXT:    v_readlane_b32 s2, v1, 31
9597; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
9598; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
9599; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
9600; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, 0
9601; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
9602; GFX1132_DPP-NEXT:    v_writelane_b32 v3, s1, 16
9603; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
9604; GFX1132_DPP-NEXT:    s_mov_b32 s0, s2
9605; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
9606; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
9607; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr0
9608; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
9609; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB19_2
9610; GFX1132_DPP-NEXT:  ; %bb.1:
9611; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, s0
9612; GFX1132_DPP-NEXT:    ds_xor_rtn_b32 v0, v4, v0
9613; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
9614; GFX1132_DPP-NEXT:    buffer_gl0_inv
9615; GFX1132_DPP-NEXT:  .LBB19_2:
9616; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s1
9617; GFX1132_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
9618; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s3, v0
9619; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, v3
9620; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
9621; GFX1132_DPP-NEXT:    v_xor_b32_e32 v0, s3, v0
9622; GFX1132_DPP-NEXT:    s_mov_b32 s3, 0x31016000
9623; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
9624; GFX1132_DPP-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
9625; GFX1132_DPP-NEXT:    s_endpgm
9626entry:
9627  %lane = call i32 @llvm.amdgcn.workitem.id.x()
9628  %old = atomicrmw xor ptr addrspace(3) @local_var32, i32 %lane acq_rel
9629  store i32 %old, ptr addrspace(1) %out
9630  ret void
9631}
9632
9633define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
9634; GFX7LESS_ITERATIVE-LABEL: xor_i64_varying:
9635; GFX7LESS_ITERATIVE:       ; %bb.0: ; %entry
9636; GFX7LESS_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
9637; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
9638; GFX7LESS_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
9639; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
9640; GFX7LESS_ITERATIVE-NEXT:  .LBB20_1: ; %ComputeLoop
9641; GFX7LESS_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
9642; GFX7LESS_ITERATIVE-NEXT:    s_ff1_i32_b64 s8, s[2:3]
9643; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, s8
9644; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s8
9645; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s8
9646; GFX7LESS_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
9647; GFX7LESS_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
9648; GFX7LESS_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s8
9649; GFX7LESS_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[8:9]
9650; GFX7LESS_ITERATIVE-NEXT:    v_cmp_ne_u64_e64 s[8:9], s[2:3], 0
9651; GFX7LESS_ITERATIVE-NEXT:    s_and_b64 vcc, exec, s[8:9]
9652; GFX7LESS_ITERATIVE-NEXT:    s_xor_b64 s[0:1], s[0:1], s[6:7]
9653; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_vccnz .LBB20_1
9654; GFX7LESS_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
9655; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
9656; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
9657; GFX7LESS_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
9658; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
9659; GFX7LESS_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
9660; GFX7LESS_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
9661; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_execz .LBB20_4
9662; GFX7LESS_ITERATIVE-NEXT:  ; %bb.3:
9663; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
9664; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
9665; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
9666; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, -1
9667; GFX7LESS_ITERATIVE-NEXT:    ds_xor_rtn_b64 v[3:4], v0, v[3:4]
9668; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
9669; GFX7LESS_ITERATIVE-NEXT:  .LBB20_4:
9670; GFX7LESS_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
9671; GFX7LESS_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
9672; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
9673; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s2, -1
9674; GFX7LESS_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v4
9675; GFX7LESS_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v3
9676; GFX7LESS_ITERATIVE-NEXT:    v_xor_b32_e32 v2, s4, v2
9677; GFX7LESS_ITERATIVE-NEXT:    v_xor_b32_e32 v1, s5, v1
9678; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
9679; GFX7LESS_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
9680; GFX7LESS_ITERATIVE-NEXT:    s_endpgm
9681;
9682; GFX8_ITERATIVE-LABEL: xor_i64_varying:
9683; GFX8_ITERATIVE:       ; %bb.0: ; %entry
9684; GFX8_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
9685; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
9686; GFX8_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
9687; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
9688; GFX8_ITERATIVE-NEXT:  .LBB20_1: ; %ComputeLoop
9689; GFX8_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
9690; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s8, s[2:3]
9691; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s8
9692; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s8
9693; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s8
9694; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s8
9695; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
9696; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
9697; GFX8_ITERATIVE-NEXT:    s_xor_b64 s[0:1], s[0:1], s[6:7]
9698; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[8:9]
9699; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
9700; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB20_1
9701; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
9702; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
9703; GFX8_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
9704; GFX8_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
9705; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
9706; GFX8_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
9707; GFX8_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
9708; GFX8_ITERATIVE-NEXT:    s_cbranch_execz .LBB20_4
9709; GFX8_ITERATIVE-NEXT:  ; %bb.3:
9710; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
9711; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
9712; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
9713; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, -1
9714; GFX8_ITERATIVE-NEXT:    ds_xor_rtn_b64 v[3:4], v0, v[3:4]
9715; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
9716; GFX8_ITERATIVE-NEXT:  .LBB20_4:
9717; GFX8_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
9718; GFX8_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
9719; GFX8_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v4
9720; GFX8_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v3
9721; GFX8_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
9722; GFX8_ITERATIVE-NEXT:    s_mov_b32 s2, -1
9723; GFX8_ITERATIVE-NEXT:    v_xor_b32_e32 v2, s4, v2
9724; GFX8_ITERATIVE-NEXT:    v_xor_b32_e32 v1, s5, v1
9725; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
9726; GFX8_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
9727; GFX8_ITERATIVE-NEXT:    s_endpgm
9728;
9729; GFX9_ITERATIVE-LABEL: xor_i64_varying:
9730; GFX9_ITERATIVE:       ; %bb.0: ; %entry
9731; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
9732; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
9733; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
9734; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
9735; GFX9_ITERATIVE-NEXT:  .LBB20_1: ; %ComputeLoop
9736; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
9737; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s8, s[2:3]
9738; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s8
9739; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s8
9740; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s8
9741; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s8
9742; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
9743; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
9744; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[0:1], s[0:1], s[6:7]
9745; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[8:9]
9746; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
9747; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB20_1
9748; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
9749; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
9750; GFX9_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
9751; GFX9_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
9752; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
9753; GFX9_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
9754; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
9755; GFX9_ITERATIVE-NEXT:    s_cbranch_execz .LBB20_4
9756; GFX9_ITERATIVE-NEXT:  ; %bb.3:
9757; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
9758; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
9759; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
9760; GFX9_ITERATIVE-NEXT:    ds_xor_rtn_b64 v[3:4], v0, v[3:4]
9761; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
9762; GFX9_ITERATIVE-NEXT:  .LBB20_4:
9763; GFX9_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
9764; GFX9_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
9765; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v4
9766; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v3
9767; GFX9_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
9768; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, -1
9769; GFX9_ITERATIVE-NEXT:    v_xor_b32_e32 v2, s4, v2
9770; GFX9_ITERATIVE-NEXT:    v_xor_b32_e32 v1, s5, v1
9771; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
9772; GFX9_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
9773; GFX9_ITERATIVE-NEXT:    s_endpgm
9774;
9775; GFX1064_ITERATIVE-LABEL: xor_i64_varying:
9776; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
9777; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
9778; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
9779; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
9780; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
9781; GFX1064_ITERATIVE-NEXT:  .LBB20_1: ; %ComputeLoop
9782; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
9783; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s10, s[2:3]
9784; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s10
9785; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s10
9786; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s10
9787; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s10
9788; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s10
9789; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[8:9]
9790; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[0:1], s[0:1], s[6:7]
9791; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
9792; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB20_1
9793; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
9794; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
9795; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
9796; GFX1064_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
9797; GFX1064_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
9798; GFX1064_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
9799; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
9800; GFX1064_ITERATIVE-NEXT:    s_cbranch_execz .LBB20_4
9801; GFX1064_ITERATIVE-NEXT:  ; %bb.3:
9802; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
9803; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
9804; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
9805; GFX1064_ITERATIVE-NEXT:    ds_xor_rtn_b64 v[3:4], v0, v[3:4]
9806; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
9807; GFX1064_ITERATIVE-NEXT:    buffer_gl0_inv
9808; GFX1064_ITERATIVE-NEXT:  .LBB20_4:
9809; GFX1064_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
9810; GFX1064_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
9811; GFX1064_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
9812; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v4
9813; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v3
9814; GFX1064_ITERATIVE-NEXT:    v_xor_b32_e32 v2, s2, v2
9815; GFX1064_ITERATIVE-NEXT:    v_xor_b32_e32 v1, s3, v1
9816; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
9817; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, -1
9818; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
9819; GFX1064_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
9820; GFX1064_ITERATIVE-NEXT:    s_endpgm
9821;
9822; GFX1032_ITERATIVE-LABEL: xor_i64_varying:
9823; GFX1032_ITERATIVE:       ; %bb.0: ; %entry
9824; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
9825; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, exec_lo
9826; GFX1032_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
9827; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
9828; GFX1032_ITERATIVE-NEXT:  .LBB20_1: ; %ComputeLoop
9829; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
9830; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s3, s2
9831; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
9832; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
9833; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s8, 1, s3
9834; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s3
9835; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s3
9836; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s2, s2, s8
9837; GFX1032_ITERATIVE-NEXT:    s_xor_b64 s[0:1], s[0:1], s[6:7]
9838; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
9839; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB20_1
9840; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
9841; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
9842; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
9843; GFX1032_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
9844; GFX1032_ITERATIVE-NEXT:    s_and_saveexec_b32 s2, vcc_lo
9845; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s2, exec_lo, s2
9846; GFX1032_ITERATIVE-NEXT:    s_cbranch_execz .LBB20_4
9847; GFX1032_ITERATIVE-NEXT:  ; %bb.3:
9848; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
9849; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
9850; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
9851; GFX1032_ITERATIVE-NEXT:    ds_xor_rtn_b64 v[3:4], v0, v[3:4]
9852; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
9853; GFX1032_ITERATIVE-NEXT:    buffer_gl0_inv
9854; GFX1032_ITERATIVE-NEXT:  .LBB20_4:
9855; GFX1032_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
9856; GFX1032_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s2
9857; GFX1032_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
9858; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v4
9859; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v3
9860; GFX1032_ITERATIVE-NEXT:    v_xor_b32_e32 v2, s2, v2
9861; GFX1032_ITERATIVE-NEXT:    v_xor_b32_e32 v1, s3, v1
9862; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
9863; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, -1
9864; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
9865; GFX1032_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
9866; GFX1032_ITERATIVE-NEXT:    s_endpgm
9867;
9868; GFX1164_ITERATIVE-LABEL: xor_i64_varying:
9869; GFX1164_ITERATIVE:       ; %bb.0: ; %entry
9870; GFX1164_ITERATIVE-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
9871; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
9872; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
9873; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
9874; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
9875; GFX1164_ITERATIVE-NEXT:  .LBB20_1: ; %ComputeLoop
9876; GFX1164_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
9877; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s10, s[2:3]
9878; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
9879; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s10
9880; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s10
9881; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s10
9882; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s10
9883; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s10
9884; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[8:9]
9885; GFX1164_ITERATIVE-NEXT:    s_xor_b64 s[0:1], s[0:1], s[6:7]
9886; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
9887; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB20_1
9888; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
9889; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
9890; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9891; GFX1164_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
9892; GFX1164_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
9893; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr2_vgpr3
9894; GFX1164_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
9895; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
9896; GFX1164_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
9897; GFX1164_ITERATIVE-NEXT:    s_cbranch_execz .LBB20_4
9898; GFX1164_ITERATIVE-NEXT:  ; %bb.3:
9899; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s1
9900; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v4, 0
9901; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s0
9902; GFX1164_ITERATIVE-NEXT:    ds_xor_rtn_b64 v[2:3], v4, v[2:3]
9903; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
9904; GFX1164_ITERATIVE-NEXT:    buffer_gl0_inv
9905; GFX1164_ITERATIVE-NEXT:  .LBB20_4:
9906; GFX1164_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
9907; GFX1164_ITERATIVE-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
9908; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
9909; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v2
9910; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
9911; GFX1164_ITERATIVE-NEXT:    v_xor_b32_e32 v1, s2, v1
9912; GFX1164_ITERATIVE-NEXT:    v_xor_b32_e32 v0, s3, v0
9913; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
9914; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
9915; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
9916; GFX1164_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
9917; GFX1164_ITERATIVE-NEXT:    s_endpgm
9918;
9919; GFX1132_ITERATIVE-LABEL: xor_i64_varying:
9920; GFX1132_ITERATIVE:       ; %bb.0: ; %entry
9921; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0
9922; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, exec_lo
9923; GFX1132_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
9924; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
9925; GFX1132_ITERATIVE-NEXT:  .LBB20_1: ; %ComputeLoop
9926; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
9927; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s3, s2
9928; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
9929; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
9930; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
9931; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s8, 1, s3
9932; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s3
9933; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
9934; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s2, s2, s8
9935; GFX1132_ITERATIVE-NEXT:    s_xor_b64 s[0:1], s[0:1], s[6:7]
9936; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
9937; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB20_1
9938; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
9939; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
9940; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
9941; GFX1132_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
9942; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr2_vgpr3
9943; GFX1132_ITERATIVE-NEXT:    s_and_saveexec_b32 s2, vcc_lo
9944; GFX1132_ITERATIVE-NEXT:    s_xor_b32 s2, exec_lo, s2
9945; GFX1132_ITERATIVE-NEXT:    s_cbranch_execz .LBB20_4
9946; GFX1132_ITERATIVE-NEXT:  ; %bb.3:
9947; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1
9948; GFX1132_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s0
9949; GFX1132_ITERATIVE-NEXT:    ds_xor_rtn_b64 v[2:3], v4, v[2:3]
9950; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
9951; GFX1132_ITERATIVE-NEXT:    buffer_gl0_inv
9952; GFX1132_ITERATIVE-NEXT:  .LBB20_4:
9953; GFX1132_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s2
9954; GFX1132_ITERATIVE-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
9955; GFX1132_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
9956; GFX1132_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v2
9957; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
9958; GFX1132_ITERATIVE-NEXT:    v_xor_b32_e32 v1, s2, v1
9959; GFX1132_ITERATIVE-NEXT:    v_xor_b32_e32 v0, s3, v0
9960; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
9961; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
9962; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
9963; GFX1132_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
9964; GFX1132_ITERATIVE-NEXT:    s_endpgm
9965;
9966; GFX7LESS_DPP-LABEL: xor_i64_varying:
9967; GFX7LESS_DPP:       ; %bb.0: ; %entry
9968; GFX7LESS_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
9969; GFX7LESS_DPP-NEXT:    v_mov_b32_e32 v1, 0
9970; GFX7LESS_DPP-NEXT:    s_mov_b32 m0, -1
9971; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
9972; GFX7LESS_DPP-NEXT:    ds_xor_rtn_b64 v[0:1], v1, v[0:1]
9973; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
9974; GFX7LESS_DPP-NEXT:    s_mov_b32 s3, 0xf000
9975; GFX7LESS_DPP-NEXT:    s_mov_b32 s2, -1
9976; GFX7LESS_DPP-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
9977; GFX7LESS_DPP-NEXT:    s_endpgm
9978;
9979; GFX8_DPP-LABEL: xor_i64_varying:
9980; GFX8_DPP:       ; %bb.0: ; %entry
9981; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v5, exec_lo, 0
9982; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, 0
9983; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v5, exec_hi, v5
9984; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
9985; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[0:1]
9986; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[0:1]
9987; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
9988; GFX8_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
9989; GFX8_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
9990; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, 0
9991; GFX8_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
9992; GFX8_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
9993; GFX8_DPP-NEXT:    s_nop 0
9994; GFX8_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
9995; GFX8_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
9996; GFX8_DPP-NEXT:    s_nop 0
9997; GFX8_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
9998; GFX8_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
9999; GFX8_DPP-NEXT:    s_nop 0
10000; GFX8_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
10001; GFX8_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
10002; GFX8_DPP-NEXT:    s_nop 0
10003; GFX8_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
10004; GFX8_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
10005; GFX8_DPP-NEXT:    v_readlane_b32 s3, v1, 63
10006; GFX8_DPP-NEXT:    v_readlane_b32 s2, v2, 63
10007; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
10008; GFX8_DPP-NEXT:    v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
10009; GFX8_DPP-NEXT:    s_mov_b64 exec, s[0:1]
10010; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
10011; GFX8_DPP-NEXT:    ; implicit-def: $vgpr5_vgpr6
10012; GFX8_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
10013; GFX8_DPP-NEXT:    s_cbranch_execz .LBB20_2
10014; GFX8_DPP-NEXT:  ; %bb.1:
10015; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, s3
10016; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, s2
10017; GFX8_DPP-NEXT:    s_mov_b32 m0, -1
10018; GFX8_DPP-NEXT:    ds_xor_rtn_b64 v[5:6], v7, v[5:6]
10019; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
10020; GFX8_DPP-NEXT:  .LBB20_2:
10021; GFX8_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
10022; GFX8_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
10023; GFX8_DPP-NEXT:    v_readfirstlane_b32 s4, v6
10024; GFX8_DPP-NEXT:    v_readfirstlane_b32 s5, v5
10025; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v3
10026; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v4
10027; GFX8_DPP-NEXT:    s_mov_b32 s3, 0xf000
10028; GFX8_DPP-NEXT:    s_mov_b32 s2, -1
10029; GFX8_DPP-NEXT:    v_xor_b32_e32 v6, s4, v6
10030; GFX8_DPP-NEXT:    v_xor_b32_e32 v5, s5, v5
10031; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
10032; GFX8_DPP-NEXT:    buffer_store_dwordx2 v[5:6], off, s[0:3], 0
10033; GFX8_DPP-NEXT:    s_endpgm
10034;
10035; GFX9_DPP-LABEL: xor_i64_varying:
10036; GFX9_DPP:       ; %bb.0: ; %entry
10037; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v5, exec_lo, 0
10038; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, 0
10039; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v5, exec_hi, v5
10040; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
10041; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[0:1]
10042; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[0:1]
10043; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
10044; GFX9_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
10045; GFX9_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
10046; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
10047; GFX9_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
10048; GFX9_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
10049; GFX9_DPP-NEXT:    s_nop 0
10050; GFX9_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
10051; GFX9_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
10052; GFX9_DPP-NEXT:    s_nop 0
10053; GFX9_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
10054; GFX9_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
10055; GFX9_DPP-NEXT:    s_nop 0
10056; GFX9_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
10057; GFX9_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
10058; GFX9_DPP-NEXT:    s_nop 0
10059; GFX9_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
10060; GFX9_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
10061; GFX9_DPP-NEXT:    v_readlane_b32 s3, v1, 63
10062; GFX9_DPP-NEXT:    v_readlane_b32 s2, v2, 63
10063; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
10064; GFX9_DPP-NEXT:    v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
10065; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
10066; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
10067; GFX9_DPP-NEXT:    ; implicit-def: $vgpr5_vgpr6
10068; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
10069; GFX9_DPP-NEXT:    s_cbranch_execz .LBB20_2
10070; GFX9_DPP-NEXT:  ; %bb.1:
10071; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, s3
10072; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, s2
10073; GFX9_DPP-NEXT:    ds_xor_rtn_b64 v[5:6], v7, v[5:6]
10074; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
10075; GFX9_DPP-NEXT:  .LBB20_2:
10076; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
10077; GFX9_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
10078; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v6
10079; GFX9_DPP-NEXT:    v_readfirstlane_b32 s5, v5
10080; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v3
10081; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v4
10082; GFX9_DPP-NEXT:    s_mov_b32 s3, 0xf000
10083; GFX9_DPP-NEXT:    s_mov_b32 s2, -1
10084; GFX9_DPP-NEXT:    v_xor_b32_e32 v6, s4, v6
10085; GFX9_DPP-NEXT:    v_xor_b32_e32 v5, s5, v5
10086; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
10087; GFX9_DPP-NEXT:    buffer_store_dwordx2 v[5:6], off, s[0:3], 0
10088; GFX9_DPP-NEXT:    s_endpgm
10089;
10090; GFX1064_DPP-LABEL: xor_i64_varying:
10091; GFX1064_DPP:       ; %bb.0: ; %entry
10092; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
10093; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[0:1]
10094; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[0:1]
10095; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, 0
10096; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, 0
10097; GFX1064_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
10098; GFX1064_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
10099; GFX1064_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
10100; GFX1064_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
10101; GFX1064_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
10102; GFX1064_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
10103; GFX1064_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
10104; GFX1064_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
10105; GFX1064_DPP-NEXT:    v_permlanex16_b32 v3, v1, -1, -1
10106; GFX1064_DPP-NEXT:    v_permlanex16_b32 v4, v2, -1, -1
10107; GFX1064_DPP-NEXT:    v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
10108; GFX1064_DPP-NEXT:    v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
10109; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 31
10110; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v2, 31
10111; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, s2
10112; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, s3
10113; GFX1064_DPP-NEXT:    v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
10114; GFX1064_DPP-NEXT:    v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
10115; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
10116; GFX1064_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
10117; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
10118; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
10119; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 15
10120; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
10121; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v2, 15
10122; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v1, 31
10123; GFX1064_DPP-NEXT:    v_readlane_b32 s7, v2, 31
10124; GFX1064_DPP-NEXT:    v_writelane_b32 v6, s2, 16
10125; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v2, 63
10126; GFX1064_DPP-NEXT:    v_writelane_b32 v5, s3, 16
10127; GFX1064_DPP-NEXT:    v_readlane_b32 s8, v1, 47
10128; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v1, 63
10129; GFX1064_DPP-NEXT:    v_readlane_b32 s9, v2, 47
10130; GFX1064_DPP-NEXT:    v_writelane_b32 v6, s6, 32
10131; GFX1064_DPP-NEXT:    v_writelane_b32 v5, s7, 32
10132; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
10133; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v0
10134; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, 0
10135; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
10136; GFX1064_DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
10137; GFX1064_DPP-NEXT:    v_writelane_b32 v6, s8, 48
10138; GFX1064_DPP-NEXT:    v_writelane_b32 v5, s9, 48
10139; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[6:7]
10140; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
10141; GFX1064_DPP-NEXT:    s_mov_b32 s2, -1
10142; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
10143; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[6:7], vcc
10144; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB20_2
10145; GFX1064_DPP-NEXT:  ; %bb.1:
10146; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, s1
10147; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, s0
10148; GFX1064_DPP-NEXT:    ds_xor_rtn_b64 v[7:8], v0, v[7:8]
10149; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
10150; GFX1064_DPP-NEXT:    buffer_gl0_inv
10151; GFX1064_DPP-NEXT:  .LBB20_2:
10152; GFX1064_DPP-NEXT:    s_waitcnt_depctr 0xffe3
10153; GFX1064_DPP-NEXT:    s_or_b64 exec, exec, s[6:7]
10154; GFX1064_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
10155; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s3, v8
10156; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, v5
10157; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, v6
10158; GFX1064_DPP-NEXT:    s_mov_b32 null, 0
10159; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s4, v7
10160; GFX1064_DPP-NEXT:    v_xor_b32_e32 v9, s3, v9
10161; GFX1064_DPP-NEXT:    v_xor_b32_e32 v8, s4, v8
10162; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
10163; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
10164; GFX1064_DPP-NEXT:    buffer_store_dwordx2 v[8:9], off, s[0:3], 0
10165; GFX1064_DPP-NEXT:    s_endpgm
10166;
10167; GFX1032_DPP-LABEL: xor_i64_varying:
10168; GFX1032_DPP:       ; %bb.0: ; %entry
10169; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s2, -1
10170; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s2
10171; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s2
10172; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, 0
10173; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, 0
10174; GFX1032_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
10175; GFX1032_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
10176; GFX1032_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
10177; GFX1032_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
10178; GFX1032_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
10179; GFX1032_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
10180; GFX1032_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
10181; GFX1032_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
10182; GFX1032_DPP-NEXT:    v_permlanex16_b32 v3, v1, -1, -1
10183; GFX1032_DPP-NEXT:    v_permlanex16_b32 v4, v2, -1, -1
10184; GFX1032_DPP-NEXT:    v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
10185; GFX1032_DPP-NEXT:    v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
10186; GFX1032_DPP-NEXT:    v_readlane_b32 s3, v1, 15
10187; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v1, 31
10188; GFX1032_DPP-NEXT:    v_readlane_b32 s0, v2, 31
10189; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
10190; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
10191; GFX1032_DPP-NEXT:    v_readlane_b32 s6, v2, 15
10192; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s2
10193; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
10194; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, 0
10195; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s2, -1
10196; GFX1032_DPP-NEXT:    v_writelane_b32 v6, s3, 16
10197; GFX1032_DPP-NEXT:    v_writelane_b32 v5, s6, 16
10198; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s2
10199; GFX1032_DPP-NEXT:    s_mov_b32 s2, -1
10200; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
10201; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
10202; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s3, vcc_lo
10203; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB20_2
10204; GFX1032_DPP-NEXT:  ; %bb.1:
10205; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, s1
10206; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, s0
10207; GFX1032_DPP-NEXT:    ds_xor_rtn_b64 v[7:8], v0, v[7:8]
10208; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
10209; GFX1032_DPP-NEXT:    buffer_gl0_inv
10210; GFX1032_DPP-NEXT:  .LBB20_2:
10211; GFX1032_DPP-NEXT:    s_waitcnt_depctr 0xffe3
10212; GFX1032_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s3
10213; GFX1032_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
10214; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s3, v8
10215; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, v5
10216; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, v6
10217; GFX1032_DPP-NEXT:    s_mov_b32 null, 0
10218; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s4, v7
10219; GFX1032_DPP-NEXT:    v_xor_b32_e32 v9, s3, v9
10220; GFX1032_DPP-NEXT:    v_xor_b32_e32 v8, s4, v8
10221; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
10222; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
10223; GFX1032_DPP-NEXT:    buffer_store_dwordx2 v[8:9], off, s[0:3], 0
10224; GFX1032_DPP-NEXT:    s_endpgm
10225;
10226; GFX1164_DPP-LABEL: xor_i64_varying:
10227; GFX1164_DPP:       ; %bb.0: ; %entry
10228; GFX1164_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
10229; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
10230; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
10231; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[0:1]
10232; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[0:1]
10233; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, 0
10234; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, 0
10235; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
10236; GFX1164_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
10237; GFX1164_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
10238; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
10239; GFX1164_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
10240; GFX1164_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
10241; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
10242; GFX1164_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
10243; GFX1164_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
10244; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
10245; GFX1164_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
10246; GFX1164_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
10247; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
10248; GFX1164_DPP-NEXT:    v_permlanex16_b32 v3, v1, -1, -1
10249; GFX1164_DPP-NEXT:    v_permlanex16_b32 v4, v2, -1, -1
10250; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
10251; GFX1164_DPP-NEXT:    v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
10252; GFX1164_DPP-NEXT:    v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
10253; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
10254; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v1, 31
10255; GFX1164_DPP-NEXT:    v_readlane_b32 s3, v2, 31
10256; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
10257; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, s2
10258; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, s3
10259; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
10260; GFX1164_DPP-NEXT:    v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
10261; GFX1164_DPP-NEXT:    v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
10262; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
10263; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v1, 15
10264; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
10265; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
10266; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
10267; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
10268; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
10269; GFX1164_DPP-NEXT:    v_readlane_b32 s3, v2, 15
10270; GFX1164_DPP-NEXT:    v_readlane_b32 s6, v1, 31
10271; GFX1164_DPP-NEXT:    v_writelane_b32 v6, s2, 16
10272; GFX1164_DPP-NEXT:    v_readlane_b32 s7, v2, 31
10273; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v2, 63
10274; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s3, 16
10275; GFX1164_DPP-NEXT:    v_readlane_b32 s8, v1, 47
10276; GFX1164_DPP-NEXT:    v_readlane_b32 s3, v1, 63
10277; GFX1164_DPP-NEXT:    v_writelane_b32 v6, s6, 32
10278; GFX1164_DPP-NEXT:    v_readlane_b32 s9, v2, 47
10279; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s7, 32
10280; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
10281; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
10282; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v0
10283; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, 0
10284; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
10285; GFX1164_DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
10286; GFX1164_DPP-NEXT:    v_writelane_b32 v6, s8, 48
10287; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s9, 48
10288; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[6:7]
10289; GFX1164_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
10290; GFX1164_DPP-NEXT:    s_mov_b32 s2, -1
10291; GFX1164_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
10292; GFX1164_DPP-NEXT:    s_and_saveexec_b64 s[6:7], vcc
10293; GFX1164_DPP-NEXT:    s_cbranch_execz .LBB20_2
10294; GFX1164_DPP-NEXT:  ; %bb.1:
10295; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, s1
10296; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, s0
10297; GFX1164_DPP-NEXT:    ds_xor_rtn_b64 v[7:8], v0, v[7:8]
10298; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
10299; GFX1164_DPP-NEXT:    buffer_gl0_inv
10300; GFX1164_DPP-NEXT:  .LBB20_2:
10301; GFX1164_DPP-NEXT:    s_or_b64 exec, exec, s[6:7]
10302; GFX1164_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
10303; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s3, v8
10304; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, v5
10305; GFX1164_DPP-NEXT:    v_mov_b32_e32 v9, v6
10306; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s4, v7
10307; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
10308; GFX1164_DPP-NEXT:    v_xor_b32_e32 v9, s3, v9
10309; GFX1164_DPP-NEXT:    v_xor_b32_e32 v8, s4, v8
10310; GFX1164_DPP-NEXT:    s_mov_b32 s3, 0x31016000
10311; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
10312; GFX1164_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[0:3], 0
10313; GFX1164_DPP-NEXT:    s_endpgm
10314;
10315; GFX1132_DPP-LABEL: xor_i64_varying:
10316; GFX1132_DPP:       ; %bb.0: ; %entry
10317; GFX1132_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
10318; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s2, -1
10319; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
10320; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s2
10321; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s2
10322; GFX1132_DPP-NEXT:    v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v5, 0
10323; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
10324; GFX1132_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
10325; GFX1132_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
10326; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
10327; GFX1132_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
10328; GFX1132_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
10329; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
10330; GFX1132_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
10331; GFX1132_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
10332; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
10333; GFX1132_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
10334; GFX1132_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
10335; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
10336; GFX1132_DPP-NEXT:    v_permlanex16_b32 v3, v1, -1, -1
10337; GFX1132_DPP-NEXT:    v_permlanex16_b32 v4, v2, -1, -1
10338; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
10339; GFX1132_DPP-NEXT:    v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
10340; GFX1132_DPP-NEXT:    v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
10341; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
10342; GFX1132_DPP-NEXT:    v_readlane_b32 s3, v1, 15
10343; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v1, 31
10344; GFX1132_DPP-NEXT:    v_readlane_b32 s0, v2, 31
10345; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
10346; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
10347; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v2, 15
10348; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
10349; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
10350; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
10351; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
10352; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s2, -1
10353; GFX1132_DPP-NEXT:    v_writelane_b32 v6, s3, 16
10354; GFX1132_DPP-NEXT:    v_writelane_b32 v5, s6, 16
10355; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
10356; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
10357; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
10358; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
10359; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s3, vcc_lo
10360; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB20_2
10361; GFX1132_DPP-NEXT:  ; %bb.1:
10362; GFX1132_DPP-NEXT:    v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0
10363; GFX1132_DPP-NEXT:    ds_xor_rtn_b64 v[7:8], v0, v[7:8]
10364; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
10365; GFX1132_DPP-NEXT:    buffer_gl0_inv
10366; GFX1132_DPP-NEXT:  .LBB20_2:
10367; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s3
10368; GFX1132_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
10369; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s3, v8
10370; GFX1132_DPP-NEXT:    v_mov_b32_e32 v8, v5
10371; GFX1132_DPP-NEXT:    v_mov_b32_e32 v9, v6
10372; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s4, v7
10373; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
10374; GFX1132_DPP-NEXT:    v_xor_b32_e32 v9, s3, v9
10375; GFX1132_DPP-NEXT:    v_xor_b32_e32 v8, s4, v8
10376; GFX1132_DPP-NEXT:    s_mov_b32 s3, 0x31016000
10377; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
10378; GFX1132_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[0:3], 0
10379; GFX1132_DPP-NEXT:    s_endpgm
10380entry:
10381  %lane = call i32 @llvm.amdgcn.workitem.id.x()
10382  %lane_ext = zext i32 %lane to i64
10383  %old = atomicrmw xor ptr addrspace(3) @local_var32, i64 %lane_ext acq_rel
10384  store i64 %old, ptr addrspace(1) %out
10385  ret void
10386}
10387
10388define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
10389; GFX7LESS_ITERATIVE-LABEL: max_i32_varying:
10390; GFX7LESS_ITERATIVE:       ; %bb.0: ; %entry
10391; GFX7LESS_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
10392; GFX7LESS_ITERATIVE-NEXT:    s_brev_b32 s2, 1
10393; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
10394; GFX7LESS_ITERATIVE-NEXT:  .LBB21_1: ; %ComputeLoop
10395; GFX7LESS_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
10396; GFX7LESS_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
10397; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, s3
10398; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
10399; GFX7LESS_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
10400; GFX7LESS_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
10401; GFX7LESS_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
10402; GFX7LESS_ITERATIVE-NEXT:    v_cmp_ne_u64_e64 s[6:7], s[0:1], 0
10403; GFX7LESS_ITERATIVE-NEXT:    s_and_b64 vcc, exec, s[6:7]
10404; GFX7LESS_ITERATIVE-NEXT:    s_max_i32 s2, s2, s8
10405; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_vccnz .LBB21_1
10406; GFX7LESS_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
10407; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
10408; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
10409; GFX7LESS_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
10410; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
10411; GFX7LESS_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
10412; GFX7LESS_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
10413; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_execz .LBB21_4
10414; GFX7LESS_ITERATIVE-NEXT:  ; %bb.3:
10415; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
10416; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
10417; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, -1
10418; GFX7LESS_ITERATIVE-NEXT:    ds_max_rtn_i32 v0, v0, v2
10419; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
10420; GFX7LESS_ITERATIVE-NEXT:  .LBB21_4:
10421; GFX7LESS_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
10422; GFX7LESS_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
10423; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
10424; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s2, -1
10425; GFX7LESS_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v0
10426; GFX7LESS_ITERATIVE-NEXT:    v_max_i32_e32 v0, s4, v1
10427; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
10428; GFX7LESS_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
10429; GFX7LESS_ITERATIVE-NEXT:    s_endpgm
10430;
10431; GFX8_ITERATIVE-LABEL: max_i32_varying:
10432; GFX8_ITERATIVE:       ; %bb.0: ; %entry
10433; GFX8_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
10434; GFX8_ITERATIVE-NEXT:    s_brev_b32 s2, 1
10435; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
10436; GFX8_ITERATIVE-NEXT:  .LBB21_1: ; %ComputeLoop
10437; GFX8_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
10438; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
10439; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s3
10440; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
10441; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
10442; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
10443; GFX8_ITERATIVE-NEXT:    s_max_i32 s2, s2, s8
10444; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
10445; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
10446; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB21_1
10447; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
10448; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
10449; GFX8_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
10450; GFX8_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
10451; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
10452; GFX8_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
10453; GFX8_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
10454; GFX8_ITERATIVE-NEXT:    s_cbranch_execz .LBB21_4
10455; GFX8_ITERATIVE-NEXT:  ; %bb.3:
10456; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
10457; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
10458; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, -1
10459; GFX8_ITERATIVE-NEXT:    ds_max_rtn_i32 v0, v0, v2
10460; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
10461; GFX8_ITERATIVE-NEXT:  .LBB21_4:
10462; GFX8_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
10463; GFX8_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
10464; GFX8_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v0
10465; GFX8_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
10466; GFX8_ITERATIVE-NEXT:    s_mov_b32 s2, -1
10467; GFX8_ITERATIVE-NEXT:    v_max_i32_e32 v0, s4, v1
10468; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
10469; GFX8_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
10470; GFX8_ITERATIVE-NEXT:    s_endpgm
10471;
10472; GFX9_ITERATIVE-LABEL: max_i32_varying:
10473; GFX9_ITERATIVE:       ; %bb.0: ; %entry
10474; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
10475; GFX9_ITERATIVE-NEXT:    s_brev_b32 s2, 1
10476; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
10477; GFX9_ITERATIVE-NEXT:  .LBB21_1: ; %ComputeLoop
10478; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
10479; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
10480; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s3
10481; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
10482; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
10483; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
10484; GFX9_ITERATIVE-NEXT:    s_max_i32 s2, s2, s8
10485; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
10486; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
10487; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB21_1
10488; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
10489; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
10490; GFX9_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
10491; GFX9_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
10492; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
10493; GFX9_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
10494; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
10495; GFX9_ITERATIVE-NEXT:    s_cbranch_execz .LBB21_4
10496; GFX9_ITERATIVE-NEXT:  ; %bb.3:
10497; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
10498; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
10499; GFX9_ITERATIVE-NEXT:    ds_max_rtn_i32 v0, v0, v2
10500; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
10501; GFX9_ITERATIVE-NEXT:  .LBB21_4:
10502; GFX9_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
10503; GFX9_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
10504; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v0
10505; GFX9_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
10506; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, -1
10507; GFX9_ITERATIVE-NEXT:    v_max_i32_e32 v0, s4, v1
10508; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
10509; GFX9_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
10510; GFX9_ITERATIVE-NEXT:    s_endpgm
10511;
10512; GFX1064_ITERATIVE-LABEL: max_i32_varying:
10513; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
10514; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
10515; GFX1064_ITERATIVE-NEXT:    s_brev_b32 s2, 1
10516; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
10517; GFX1064_ITERATIVE-NEXT:  .LBB21_1: ; %ComputeLoop
10518; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
10519; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
10520; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
10521; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
10522; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, s3
10523; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
10524; GFX1064_ITERATIVE-NEXT:    s_max_i32 s2, s2, s8
10525; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
10526; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB21_1
10527; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
10528; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
10529; GFX1064_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
10530; GFX1064_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
10531; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
10532; GFX1064_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
10533; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
10534; GFX1064_ITERATIVE-NEXT:    s_cbranch_execz .LBB21_4
10535; GFX1064_ITERATIVE-NEXT:  ; %bb.3:
10536; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
10537; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
10538; GFX1064_ITERATIVE-NEXT:    ds_max_rtn_i32 v0, v0, v2
10539; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
10540; GFX1064_ITERATIVE-NEXT:    buffer_gl0_inv
10541; GFX1064_ITERATIVE-NEXT:  .LBB21_4:
10542; GFX1064_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
10543; GFX1064_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
10544; GFX1064_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
10545; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v0
10546; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
10547; GFX1064_ITERATIVE-NEXT:    v_max_i32_e32 v0, s2, v1
10548; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, -1
10549; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
10550; GFX1064_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
10551; GFX1064_ITERATIVE-NEXT:    s_endpgm
10552;
10553; GFX1032_ITERATIVE-LABEL: max_i32_varying:
10554; GFX1032_ITERATIVE:       ; %bb.0: ; %entry
10555; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s1, exec_lo
10556; GFX1032_ITERATIVE-NEXT:    s_brev_b32 s0, 1
10557; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
10558; GFX1032_ITERATIVE-NEXT:  .LBB21_1: ; %ComputeLoop
10559; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
10560; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s2, s1
10561; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
10562; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
10563; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s2
10564; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s6
10565; GFX1032_ITERATIVE-NEXT:    s_max_i32 s0, s0, s3
10566; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
10567; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB21_1
10568; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
10569; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
10570; GFX1032_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
10571; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
10572; GFX1032_ITERATIVE-NEXT:    s_and_saveexec_b32 s1, vcc_lo
10573; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s1, exec_lo, s1
10574; GFX1032_ITERATIVE-NEXT:    s_cbranch_execz .LBB21_4
10575; GFX1032_ITERATIVE-NEXT:  ; %bb.3:
10576; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
10577; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s0
10578; GFX1032_ITERATIVE-NEXT:    ds_max_rtn_i32 v0, v0, v2
10579; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
10580; GFX1032_ITERATIVE-NEXT:    buffer_gl0_inv
10581; GFX1032_ITERATIVE-NEXT:  .LBB21_4:
10582; GFX1032_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
10583; GFX1032_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s1
10584; GFX1032_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
10585; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v0
10586; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
10587; GFX1032_ITERATIVE-NEXT:    v_max_i32_e32 v0, s2, v1
10588; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, -1
10589; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
10590; GFX1032_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
10591; GFX1032_ITERATIVE-NEXT:    s_endpgm
10592;
10593; GFX1164_ITERATIVE-LABEL: max_i32_varying:
10594; GFX1164_ITERATIVE:       ; %bb.0: ; %entry
10595; GFX1164_ITERATIVE-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
10596; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
10597; GFX1164_ITERATIVE-NEXT:    s_brev_b32 s2, 1
10598; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
10599; GFX1164_ITERATIVE-NEXT:  .LBB21_1: ; %ComputeLoop
10600; GFX1164_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
10601; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s3, s[0:1]
10602; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
10603; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s8, v1, s3
10604; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
10605; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s2, s3
10606; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
10607; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2)
10608; GFX1164_ITERATIVE-NEXT:    s_max_i32 s2, s2, s8
10609; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
10610; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB21_1
10611; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
10612; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
10613; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10614; GFX1164_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
10615; GFX1164_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
10616; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
10617; GFX1164_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
10618; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
10619; GFX1164_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
10620; GFX1164_ITERATIVE-NEXT:    s_cbranch_execz .LBB21_4
10621; GFX1164_ITERATIVE-NEXT:  ; %bb.3:
10622; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v1, 0
10623; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
10624; GFX1164_ITERATIVE-NEXT:    ds_max_rtn_i32 v1, v1, v2
10625; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
10626; GFX1164_ITERATIVE-NEXT:    buffer_gl0_inv
10627; GFX1164_ITERATIVE-NEXT:  .LBB21_4:
10628; GFX1164_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
10629; GFX1164_ITERATIVE-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
10630; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v1
10631; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
10632; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
10633; GFX1164_ITERATIVE-NEXT:    v_max_i32_e32 v0, s2, v0
10634; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
10635; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
10636; GFX1164_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
10637; GFX1164_ITERATIVE-NEXT:    s_endpgm
10638;
10639; GFX1132_ITERATIVE-LABEL: max_i32_varying:
10640; GFX1132_ITERATIVE:       ; %bb.0: ; %entry
10641; GFX1132_ITERATIVE-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
10642; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s1, exec_lo
10643; GFX1132_ITERATIVE-NEXT:    s_brev_b32 s0, 1
10644; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
10645; GFX1132_ITERATIVE-NEXT:  .LBB21_1: ; %ComputeLoop
10646; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
10647; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s2, s1
10648; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
10649; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s3, v1, s2
10650; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
10651; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s2
10652; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s6
10653; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2)
10654; GFX1132_ITERATIVE-NEXT:    s_max_i32 s0, s0, s3
10655; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
10656; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB21_1
10657; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
10658; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
10659; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
10660; GFX1132_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
10661; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
10662; GFX1132_ITERATIVE-NEXT:    s_and_saveexec_b32 s1, vcc_lo
10663; GFX1132_ITERATIVE-NEXT:    s_xor_b32 s1, exec_lo, s1
10664; GFX1132_ITERATIVE-NEXT:    s_cbranch_execz .LBB21_4
10665; GFX1132_ITERATIVE-NEXT:  ; %bb.3:
10666; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
10667; GFX1132_ITERATIVE-NEXT:    ds_max_rtn_i32 v1, v1, v2
10668; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
10669; GFX1132_ITERATIVE-NEXT:    buffer_gl0_inv
10670; GFX1132_ITERATIVE-NEXT:  .LBB21_4:
10671; GFX1132_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s1
10672; GFX1132_ITERATIVE-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
10673; GFX1132_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v1
10674; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
10675; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
10676; GFX1132_ITERATIVE-NEXT:    v_max_i32_e32 v0, s2, v0
10677; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
10678; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
10679; GFX1132_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
10680; GFX1132_ITERATIVE-NEXT:    s_endpgm
10681;
10682; GFX7LESS_DPP-LABEL: max_i32_varying:
10683; GFX7LESS_DPP:       ; %bb.0: ; %entry
10684; GFX7LESS_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
10685; GFX7LESS_DPP-NEXT:    v_mov_b32_e32 v1, 0
10686; GFX7LESS_DPP-NEXT:    s_mov_b32 m0, -1
10687; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
10688; GFX7LESS_DPP-NEXT:    ds_max_rtn_i32 v0, v1, v0
10689; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
10690; GFX7LESS_DPP-NEXT:    s_mov_b32 s3, 0xf000
10691; GFX7LESS_DPP-NEXT:    s_mov_b32 s2, -1
10692; GFX7LESS_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
10693; GFX7LESS_DPP-NEXT:    s_endpgm
10694;
10695; GFX8_DPP-LABEL: max_i32_varying:
10696; GFX8_DPP:       ; %bb.0: ; %entry
10697; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
10698; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
10699; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
10700; GFX8_DPP-NEXT:    v_bfrev_b32_e32 v1, 1
10701; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v2, v1, v0, s[0:1]
10702; GFX8_DPP-NEXT:    s_nop 1
10703; GFX8_DPP-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
10704; GFX8_DPP-NEXT:    s_nop 1
10705; GFX8_DPP-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
10706; GFX8_DPP-NEXT:    s_nop 1
10707; GFX8_DPP-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
10708; GFX8_DPP-NEXT:    s_nop 1
10709; GFX8_DPP-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
10710; GFX8_DPP-NEXT:    s_nop 1
10711; GFX8_DPP-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
10712; GFX8_DPP-NEXT:    s_nop 1
10713; GFX8_DPP-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
10714; GFX8_DPP-NEXT:    v_readlane_b32 s2, v2, 63
10715; GFX8_DPP-NEXT:    s_nop 0
10716; GFX8_DPP-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
10717; GFX8_DPP-NEXT:    s_mov_b64 exec, s[0:1]
10718; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
10719; GFX8_DPP-NEXT:    ; implicit-def: $vgpr0
10720; GFX8_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
10721; GFX8_DPP-NEXT:    s_cbranch_execz .LBB21_2
10722; GFX8_DPP-NEXT:  ; %bb.1:
10723; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, 0
10724; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, s2
10725; GFX8_DPP-NEXT:    s_mov_b32 m0, -1
10726; GFX8_DPP-NEXT:    ds_max_rtn_i32 v0, v0, v3
10727; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
10728; GFX8_DPP-NEXT:  .LBB21_2:
10729; GFX8_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
10730; GFX8_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
10731; GFX8_DPP-NEXT:    v_readfirstlane_b32 s4, v0
10732; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, v1
10733; GFX8_DPP-NEXT:    s_mov_b32 s3, 0xf000
10734; GFX8_DPP-NEXT:    s_mov_b32 s2, -1
10735; GFX8_DPP-NEXT:    v_max_i32_e32 v0, s4, v0
10736; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
10737; GFX8_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
10738; GFX8_DPP-NEXT:    s_endpgm
10739;
10740; GFX9_DPP-LABEL: max_i32_varying:
10741; GFX9_DPP:       ; %bb.0: ; %entry
10742; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
10743; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
10744; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
10745; GFX9_DPP-NEXT:    v_bfrev_b32_e32 v1, 1
10746; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v2, v1, v0, s[0:1]
10747; GFX9_DPP-NEXT:    s_nop 1
10748; GFX9_DPP-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
10749; GFX9_DPP-NEXT:    s_nop 1
10750; GFX9_DPP-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
10751; GFX9_DPP-NEXT:    s_nop 1
10752; GFX9_DPP-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
10753; GFX9_DPP-NEXT:    s_nop 1
10754; GFX9_DPP-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
10755; GFX9_DPP-NEXT:    s_nop 1
10756; GFX9_DPP-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
10757; GFX9_DPP-NEXT:    s_nop 1
10758; GFX9_DPP-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
10759; GFX9_DPP-NEXT:    v_readlane_b32 s2, v2, 63
10760; GFX9_DPP-NEXT:    s_nop 0
10761; GFX9_DPP-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
10762; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
10763; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
10764; GFX9_DPP-NEXT:    ; implicit-def: $vgpr0
10765; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
10766; GFX9_DPP-NEXT:    s_cbranch_execz .LBB21_2
10767; GFX9_DPP-NEXT:  ; %bb.1:
10768; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, 0
10769; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, s2
10770; GFX9_DPP-NEXT:    ds_max_rtn_i32 v0, v0, v3
10771; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
10772; GFX9_DPP-NEXT:  .LBB21_2:
10773; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
10774; GFX9_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
10775; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v0
10776; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, v1
10777; GFX9_DPP-NEXT:    s_mov_b32 s3, 0xf000
10778; GFX9_DPP-NEXT:    s_mov_b32 s2, -1
10779; GFX9_DPP-NEXT:    v_max_i32_e32 v0, s4, v0
10780; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
10781; GFX9_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
10782; GFX9_DPP-NEXT:    s_endpgm
10783;
10784; GFX1064_DPP-LABEL: max_i32_varying:
10785; GFX1064_DPP:       ; %bb.0: ; %entry
10786; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
10787; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v1, 0x80000000, v0, s[0:1]
10788; GFX1064_DPP-NEXT:    v_bfrev_b32_e32 v3, 1
10789; GFX1064_DPP-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
10790; GFX1064_DPP-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
10791; GFX1064_DPP-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
10792; GFX1064_DPP-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
10793; GFX1064_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
10794; GFX1064_DPP-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
10795; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 31
10796; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, s2
10797; GFX1064_DPP-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
10798; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
10799; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 15
10800; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v1, 31
10801; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s2, 16
10802; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
10803; GFX1064_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
10804; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
10805; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 47
10806; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v1, 63
10807; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s3, 32
10808; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
10809; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
10810; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
10811; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s2, 48
10812; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
10813; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
10814; GFX1064_DPP-NEXT:    s_mov_b32 s2, -1
10815; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr0
10816; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
10817; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB21_2
10818; GFX1064_DPP-NEXT:  ; %bb.1:
10819; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, 0
10820; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, s6
10821; GFX1064_DPP-NEXT:    s_mov_b32 s3, s6
10822; GFX1064_DPP-NEXT:    ds_max_rtn_i32 v0, v0, v4
10823; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
10824; GFX1064_DPP-NEXT:    buffer_gl0_inv
10825; GFX1064_DPP-NEXT:  .LBB21_2:
10826; GFX1064_DPP-NEXT:    s_waitcnt_depctr 0xffe3
10827; GFX1064_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
10828; GFX1064_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
10829; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s3, v0
10830; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, v3
10831; GFX1064_DPP-NEXT:    v_max_i32_e32 v0, s3, v0
10832; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
10833; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
10834; GFX1064_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
10835; GFX1064_DPP-NEXT:    s_endpgm
10836;
10837; GFX1032_DPP-LABEL: max_i32_varying:
10838; GFX1032_DPP:       ; %bb.0: ; %entry
10839; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
10840; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v1, 0x80000000, v0, s0
10841; GFX1032_DPP-NEXT:    v_bfrev_b32_e32 v3, 1
10842; GFX1032_DPP-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
10843; GFX1032_DPP-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
10844; GFX1032_DPP-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
10845; GFX1032_DPP-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
10846; GFX1032_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
10847; GFX1032_DPP-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
10848; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v1, 15
10849; GFX1032_DPP-NEXT:    v_readlane_b32 s2, v1, 31
10850; GFX1032_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
10851; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
10852; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
10853; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
10854; GFX1032_DPP-NEXT:    v_writelane_b32 v3, s1, 16
10855; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
10856; GFX1032_DPP-NEXT:    s_mov_b32 s0, s2
10857; GFX1032_DPP-NEXT:    s_mov_b32 s2, -1
10858; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
10859; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr0
10860; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
10861; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB21_2
10862; GFX1032_DPP-NEXT:  ; %bb.1:
10863; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, 0
10864; GFX1032_DPP-NEXT:    v_mov_b32_e32 v4, s0
10865; GFX1032_DPP-NEXT:    ds_max_rtn_i32 v0, v0, v4
10866; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
10867; GFX1032_DPP-NEXT:    buffer_gl0_inv
10868; GFX1032_DPP-NEXT:  .LBB21_2:
10869; GFX1032_DPP-NEXT:    s_waitcnt_depctr 0xffe3
10870; GFX1032_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s1
10871; GFX1032_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
10872; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s3, v0
10873; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, v3
10874; GFX1032_DPP-NEXT:    v_max_i32_e32 v0, s3, v0
10875; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
10876; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
10877; GFX1032_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
10878; GFX1032_DPP-NEXT:    s_endpgm
10879;
10880; GFX1164_DPP-LABEL: max_i32_varying:
10881; GFX1164_DPP:       ; %bb.0: ; %entry
10882; GFX1164_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
10883; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
10884; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
10885; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v1, 0x80000000, v0, s[0:1]
10886; GFX1164_DPP-NEXT:    v_bfrev_b32_e32 v3, 1
10887; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
10888; GFX1164_DPP-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
10889; GFX1164_DPP-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
10890; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10891; GFX1164_DPP-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
10892; GFX1164_DPP-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
10893; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10894; GFX1164_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
10895; GFX1164_DPP-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
10896; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10897; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v1, 31
10898; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, s2
10899; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10900; GFX1164_DPP-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
10901; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
10902; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v1, 15
10903; GFX1164_DPP-NEXT:    v_readlane_b32 s3, v1, 31
10904; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
10905; GFX1164_DPP-NEXT:    v_writelane_b32 v3, s2, 16
10906; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
10907; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
10908; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
10909; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v1, 47
10910; GFX1164_DPP-NEXT:    v_readlane_b32 s6, v1, 63
10911; GFX1164_DPP-NEXT:    v_writelane_b32 v3, s3, 32
10912; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
10913; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
10914; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
10915; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
10916; GFX1164_DPP-NEXT:    v_writelane_b32 v3, s2, 48
10917; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
10918; GFX1164_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
10919; GFX1164_DPP-NEXT:    s_mov_b32 s2, -1
10920; GFX1164_DPP-NEXT:    ; implicit-def: $vgpr0
10921; GFX1164_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
10922; GFX1164_DPP-NEXT:    s_cbranch_execz .LBB21_2
10923; GFX1164_DPP-NEXT:  ; %bb.1:
10924; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, 0
10925; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, s6
10926; GFX1164_DPP-NEXT:    s_mov_b32 s3, s6
10927; GFX1164_DPP-NEXT:    ds_max_rtn_i32 v0, v0, v4
10928; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
10929; GFX1164_DPP-NEXT:    buffer_gl0_inv
10930; GFX1164_DPP-NEXT:  .LBB21_2:
10931; GFX1164_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
10932; GFX1164_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
10933; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s3, v0
10934; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, v3
10935; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
10936; GFX1164_DPP-NEXT:    v_max_i32_e32 v0, s3, v0
10937; GFX1164_DPP-NEXT:    s_mov_b32 s3, 0x31016000
10938; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
10939; GFX1164_DPP-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
10940; GFX1164_DPP-NEXT:    s_endpgm
10941;
10942; GFX1132_DPP-LABEL: max_i32_varying:
10943; GFX1132_DPP:       ; %bb.0: ; %entry
10944; GFX1132_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
10945; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
10946; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
10947; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v1, 0x80000000, v0, s0
10948; GFX1132_DPP-NEXT:    v_bfrev_b32_e32 v3, 1
10949; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
10950; GFX1132_DPP-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
10951; GFX1132_DPP-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
10952; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10953; GFX1132_DPP-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
10954; GFX1132_DPP-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
10955; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10956; GFX1132_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
10957; GFX1132_DPP-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
10958; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
10959; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v1, 15
10960; GFX1132_DPP-NEXT:    v_readlane_b32 s2, v1, 31
10961; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
10962; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
10963; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
10964; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
10965; GFX1132_DPP-NEXT:    v_writelane_b32 v3, s1, 16
10966; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
10967; GFX1132_DPP-NEXT:    s_mov_b32 s0, s2
10968; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
10969; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
10970; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr0
10971; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
10972; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB21_2
10973; GFX1132_DPP-NEXT:  ; %bb.1:
10974; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
10975; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, s0
10976; GFX1132_DPP-NEXT:    ds_max_rtn_i32 v0, v0, v4
10977; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
10978; GFX1132_DPP-NEXT:    buffer_gl0_inv
10979; GFX1132_DPP-NEXT:  .LBB21_2:
10980; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s1
10981; GFX1132_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
10982; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s3, v0
10983; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, v3
10984; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
10985; GFX1132_DPP-NEXT:    v_max_i32_e32 v0, s3, v0
10986; GFX1132_DPP-NEXT:    s_mov_b32 s3, 0x31016000
10987; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
10988; GFX1132_DPP-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
10989; GFX1132_DPP-NEXT:    s_endpgm
10990entry:
10991  %lane = call i32 @llvm.amdgcn.workitem.id.x()
10992  %old = atomicrmw max ptr addrspace(3) @local_var32, i32 %lane acq_rel
10993  store i32 %old, ptr addrspace(1) %out
10994  ret void
10995}
10996
10997define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
10998; GFX7LESS-LABEL: max_i64_constant:
10999; GFX7LESS:       ; %bb.0: ; %entry
11000; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
11001; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
11002; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
11003; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
11004; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
11005; GFX7LESS-NEXT:    s_cbranch_execz .LBB22_2
11006; GFX7LESS-NEXT:  ; %bb.1:
11007; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
11008; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
11009; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
11010; GFX7LESS-NEXT:    s_mov_b32 m0, -1
11011; GFX7LESS-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
11012; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
11013; GFX7LESS-NEXT:  .LBB22_2:
11014; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
11015; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
11016; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
11017; GFX7LESS-NEXT:    s_mov_b32 s2, -1
11018; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
11019; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
11020; GFX7LESS-NEXT:    v_bfrev_b32_e32 v1, 1
11021; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
11022; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
11023; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
11024; GFX7LESS-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
11025; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
11026; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
11027; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
11028; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
11029; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
11030; GFX7LESS-NEXT:    s_endpgm
11031;
11032; GFX8-LABEL: max_i64_constant:
11033; GFX8:       ; %bb.0: ; %entry
11034; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
11035; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
11036; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
11037; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
11038; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
11039; GFX8-NEXT:    s_cbranch_execz .LBB22_2
11040; GFX8-NEXT:  ; %bb.1:
11041; GFX8-NEXT:    v_mov_b32_e32 v0, 5
11042; GFX8-NEXT:    v_mov_b32_e32 v1, 0
11043; GFX8-NEXT:    v_mov_b32_e32 v2, 0
11044; GFX8-NEXT:    s_mov_b32 m0, -1
11045; GFX8-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
11046; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
11047; GFX8-NEXT:  .LBB22_2:
11048; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
11049; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
11050; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
11051; GFX8-NEXT:    v_bfrev_b32_e32 v0, 1
11052; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
11053; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
11054; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
11055; GFX8-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
11056; GFX8-NEXT:    v_mov_b32_e32 v2, s5
11057; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
11058; GFX8-NEXT:    v_mov_b32_e32 v2, s4
11059; GFX8-NEXT:    s_mov_b32 s3, 0xf000
11060; GFX8-NEXT:    s_mov_b32 s2, -1
11061; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
11062; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
11063; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
11064; GFX8-NEXT:    s_endpgm
11065;
11066; GFX9-LABEL: max_i64_constant:
11067; GFX9:       ; %bb.0: ; %entry
11068; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
11069; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
11070; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
11071; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
11072; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
11073; GFX9-NEXT:    s_cbranch_execz .LBB22_2
11074; GFX9-NEXT:  ; %bb.1:
11075; GFX9-NEXT:    v_mov_b32_e32 v0, 5
11076; GFX9-NEXT:    v_mov_b32_e32 v1, 0
11077; GFX9-NEXT:    v_mov_b32_e32 v2, 0
11078; GFX9-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
11079; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
11080; GFX9-NEXT:  .LBB22_2:
11081; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
11082; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
11083; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
11084; GFX9-NEXT:    v_bfrev_b32_e32 v0, 1
11085; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
11086; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
11087; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
11088; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
11089; GFX9-NEXT:    v_mov_b32_e32 v2, s5
11090; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
11091; GFX9-NEXT:    v_mov_b32_e32 v2, s4
11092; GFX9-NEXT:    s_mov_b32 s3, 0xf000
11093; GFX9-NEXT:    s_mov_b32 s2, -1
11094; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
11095; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
11096; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
11097; GFX9-NEXT:    s_endpgm
11098;
11099; GFX1064-LABEL: max_i64_constant:
11100; GFX1064:       ; %bb.0: ; %entry
11101; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
11102; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
11103; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
11104; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
11105; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
11106; GFX1064-NEXT:    s_cbranch_execz .LBB22_2
11107; GFX1064-NEXT:  ; %bb.1:
11108; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
11109; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
11110; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
11111; GFX1064-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
11112; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
11113; GFX1064-NEXT:    buffer_gl0_inv
11114; GFX1064-NEXT:  .LBB22_2:
11115; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
11116; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
11117; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
11118; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
11119; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
11120; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
11121; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
11122; GFX1064-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
11123; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
11124; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
11125; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
11126; GFX1064-NEXT:    s_mov_b32 s2, -1
11127; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
11128; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
11129; GFX1064-NEXT:    s_endpgm
11130;
11131; GFX1032-LABEL: max_i64_constant:
11132; GFX1032:       ; %bb.0: ; %entry
11133; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
11134; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
11135; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
11136; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
11137; GFX1032-NEXT:    s_cbranch_execz .LBB22_2
11138; GFX1032-NEXT:  ; %bb.1:
11139; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
11140; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
11141; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
11142; GFX1032-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
11143; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
11144; GFX1032-NEXT:    buffer_gl0_inv
11145; GFX1032-NEXT:  .LBB22_2:
11146; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
11147; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
11148; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
11149; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
11150; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
11151; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
11152; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
11153; GFX1032-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1]
11154; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
11155; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
11156; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
11157; GFX1032-NEXT:    s_mov_b32 s2, -1
11158; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
11159; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
11160; GFX1032-NEXT:    s_endpgm
11161;
11162; GFX1164-LABEL: max_i64_constant:
11163; GFX1164:       ; %bb.0: ; %entry
11164; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
11165; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11166; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
11167; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
11168; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
11169; GFX1164-NEXT:    s_and_saveexec_b64 s[0:1], vcc
11170; GFX1164-NEXT:    s_cbranch_execz .LBB22_2
11171; GFX1164-NEXT:  ; %bb.1:
11172; GFX1164-NEXT:    v_mov_b32_e32 v0, 5
11173; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
11174; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
11175; GFX1164-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
11176; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
11177; GFX1164-NEXT:    buffer_gl0_inv
11178; GFX1164-NEXT:  .LBB22_2:
11179; GFX1164-NEXT:    s_or_b64 exec, exec, s[0:1]
11180; GFX1164-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
11181; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
11182; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
11183; GFX1164-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
11184; GFX1164-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
11185; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
11186; GFX1164-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
11187; GFX1164-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
11188; GFX1164-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
11189; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
11190; GFX1164-NEXT:    s_mov_b32 s2, -1
11191; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
11192; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
11193; GFX1164-NEXT:    s_endpgm
11194;
11195; GFX1132-LABEL: max_i64_constant:
11196; GFX1132:       ; %bb.0: ; %entry
11197; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
11198; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
11199; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
11200; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
11201; GFX1132-NEXT:    s_and_saveexec_b32 s0, vcc_lo
11202; GFX1132-NEXT:    s_cbranch_execz .LBB22_2
11203; GFX1132-NEXT:  ; %bb.1:
11204; GFX1132-NEXT:    v_mov_b32_e32 v0, 5
11205; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
11206; GFX1132-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
11207; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
11208; GFX1132-NEXT:    buffer_gl0_inv
11209; GFX1132-NEXT:  .LBB22_2:
11210; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s0
11211; GFX1132-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
11212; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
11213; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
11214; GFX1132-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
11215; GFX1132-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
11216; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
11217; GFX1132-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1]
11218; GFX1132-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
11219; GFX1132-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
11220; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
11221; GFX1132-NEXT:    s_mov_b32 s2, -1
11222; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
11223; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
11224; GFX1132-NEXT:    s_endpgm
11225entry:
11226  %old = atomicrmw max ptr addrspace(3) @local_var64, i64 5 acq_rel
11227  store i64 %old, ptr addrspace(1) %out
11228  ret void
11229}
11230
11231define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
11232; GFX7LESS_ITERATIVE-LABEL: max_i64_varying:
11233; GFX7LESS_ITERATIVE:       ; %bb.0: ; %entry
11234; GFX7LESS_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
11235; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
11236; GFX7LESS_ITERATIVE-NEXT:    s_brev_b32 s1, 1
11237; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s0, 0
11238; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
11239; GFX7LESS_ITERATIVE-NEXT:  .LBB23_1: ; %ComputeLoop
11240; GFX7LESS_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
11241; GFX7LESS_ITERATIVE-NEXT:    s_ff1_i32_b64 s8, s[2:3]
11242; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, s8
11243; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s9, v3, s8
11244; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s10, v0, s8
11245; GFX7LESS_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
11246; GFX7LESS_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
11247; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s10
11248; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v5, s9
11249; GFX7LESS_ITERATIVE-NEXT:    v_cmp_gt_i64_e32 vcc, s[0:1], v[4:5]
11250; GFX7LESS_ITERATIVE-NEXT:    s_and_b64 s[6:7], vcc, exec
11251; GFX7LESS_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s9
11252; GFX7LESS_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s10
11253; GFX7LESS_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s8
11254; GFX7LESS_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
11255; GFX7LESS_ITERATIVE-NEXT:    v_cmp_ne_u64_e64 s[6:7], s[2:3], 0
11256; GFX7LESS_ITERATIVE-NEXT:    s_and_b64 vcc, exec, s[6:7]
11257; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_vccnz .LBB23_1
11258; GFX7LESS_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
11259; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
11260; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
11261; GFX7LESS_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
11262; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
11263; GFX7LESS_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
11264; GFX7LESS_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
11265; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_execz .LBB23_4
11266; GFX7LESS_ITERATIVE-NEXT:  ; %bb.3:
11267; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
11268; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
11269; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
11270; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, -1
11271; GFX7LESS_ITERATIVE-NEXT:    ds_max_rtn_i64 v[3:4], v0, v[3:4]
11272; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
11273; GFX7LESS_ITERATIVE-NEXT:  .LBB23_4:
11274; GFX7LESS_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
11275; GFX7LESS_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
11276; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
11277; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s2, -1
11278; GFX7LESS_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v4
11279; GFX7LESS_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v3
11280; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s5
11281; GFX7LESS_ITERATIVE-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[1:2]
11282; GFX7LESS_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
11283; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s4
11284; GFX7LESS_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
11285; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
11286; GFX7LESS_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
11287; GFX7LESS_ITERATIVE-NEXT:    s_endpgm
11288;
11289; GFX8_ITERATIVE-LABEL: max_i64_varying:
11290; GFX8_ITERATIVE:       ; %bb.0: ; %entry
11291; GFX8_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
11292; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
11293; GFX8_ITERATIVE-NEXT:    s_brev_b32 s1, 1
11294; GFX8_ITERATIVE-NEXT:    s_mov_b32 s0, 0
11295; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
11296; GFX8_ITERATIVE-NEXT:  .LBB23_1: ; %ComputeLoop
11297; GFX8_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
11298; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s8, s[2:3]
11299; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s9, v3, s8
11300; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s10, v0, s8
11301; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s10
11302; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v5, s9
11303; GFX8_ITERATIVE-NEXT:    v_cmp_gt_i64_e32 vcc, s[0:1], v[4:5]
11304; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s8
11305; GFX8_ITERATIVE-NEXT:    s_and_b64 s[6:7], vcc, exec
11306; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
11307; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
11308; GFX8_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s9
11309; GFX8_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s10
11310; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s8
11311; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
11312; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
11313; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB23_1
11314; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
11315; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
11316; GFX8_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
11317; GFX8_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
11318; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
11319; GFX8_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
11320; GFX8_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
11321; GFX8_ITERATIVE-NEXT:    s_cbranch_execz .LBB23_4
11322; GFX8_ITERATIVE-NEXT:  ; %bb.3:
11323; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
11324; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
11325; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
11326; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, -1
11327; GFX8_ITERATIVE-NEXT:    ds_max_rtn_i64 v[3:4], v0, v[3:4]
11328; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
11329; GFX8_ITERATIVE-NEXT:  .LBB23_4:
11330; GFX8_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
11331; GFX8_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
11332; GFX8_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v4
11333; GFX8_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v3
11334; GFX8_ITERATIVE-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[1:2]
11335; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s5
11336; GFX8_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
11337; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s4
11338; GFX8_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
11339; GFX8_ITERATIVE-NEXT:    s_mov_b32 s2, -1
11340; GFX8_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
11341; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
11342; GFX8_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
11343; GFX8_ITERATIVE-NEXT:    s_endpgm
11344;
11345; GFX9_ITERATIVE-LABEL: max_i64_varying:
11346; GFX9_ITERATIVE:       ; %bb.0: ; %entry
11347; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
11348; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
11349; GFX9_ITERATIVE-NEXT:    s_brev_b32 s1, 1
11350; GFX9_ITERATIVE-NEXT:    s_mov_b32 s0, 0
11351; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
11352; GFX9_ITERATIVE-NEXT:  .LBB23_1: ; %ComputeLoop
11353; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
11354; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s8, s[2:3]
11355; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s9, v3, s8
11356; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s10, v0, s8
11357; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s10
11358; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v5, s9
11359; GFX9_ITERATIVE-NEXT:    v_cmp_gt_i64_e32 vcc, s[0:1], v[4:5]
11360; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s8
11361; GFX9_ITERATIVE-NEXT:    s_and_b64 s[6:7], vcc, exec
11362; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
11363; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
11364; GFX9_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s9
11365; GFX9_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s10
11366; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s8
11367; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
11368; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
11369; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB23_1
11370; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
11371; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
11372; GFX9_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
11373; GFX9_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
11374; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
11375; GFX9_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
11376; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
11377; GFX9_ITERATIVE-NEXT:    s_cbranch_execz .LBB23_4
11378; GFX9_ITERATIVE-NEXT:  ; %bb.3:
11379; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
11380; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
11381; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
11382; GFX9_ITERATIVE-NEXT:    ds_max_rtn_i64 v[3:4], v0, v[3:4]
11383; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
11384; GFX9_ITERATIVE-NEXT:  .LBB23_4:
11385; GFX9_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
11386; GFX9_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
11387; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v4
11388; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v3
11389; GFX9_ITERATIVE-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[1:2]
11390; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s5
11391; GFX9_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
11392; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s4
11393; GFX9_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
11394; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, -1
11395; GFX9_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
11396; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
11397; GFX9_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
11398; GFX9_ITERATIVE-NEXT:    s_endpgm
11399;
11400; GFX1064_ITERATIVE-LABEL: max_i64_varying:
11401; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
11402; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
11403; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
11404; GFX1064_ITERATIVE-NEXT:    s_brev_b32 s1, 1
11405; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s0, 0
11406; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
11407; GFX1064_ITERATIVE-NEXT:  .LBB23_1: ; %ComputeLoop
11408; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
11409; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s10, s[2:3]
11410; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s10
11411; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s10
11412; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s10
11413; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s10
11414; GFX1064_ITERATIVE-NEXT:    v_cmp_gt_i64_e64 s[8:9], s[0:1], s[6:7]
11415; GFX1064_ITERATIVE-NEXT:    s_and_b64 s[8:9], s[8:9], exec
11416; GFX1064_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s7
11417; GFX1064_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
11418; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s10
11419; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
11420; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
11421; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB23_1
11422; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
11423; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
11424; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
11425; GFX1064_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
11426; GFX1064_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
11427; GFX1064_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
11428; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
11429; GFX1064_ITERATIVE-NEXT:    s_cbranch_execz .LBB23_4
11430; GFX1064_ITERATIVE-NEXT:  ; %bb.3:
11431; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
11432; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
11433; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
11434; GFX1064_ITERATIVE-NEXT:    ds_max_rtn_i64 v[3:4], v0, v[3:4]
11435; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
11436; GFX1064_ITERATIVE-NEXT:    buffer_gl0_inv
11437; GFX1064_ITERATIVE-NEXT:  .LBB23_4:
11438; GFX1064_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
11439; GFX1064_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
11440; GFX1064_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
11441; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v4
11442; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
11443; GFX1064_ITERATIVE-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[1:2]
11444; GFX1064_ITERATIVE-NEXT:    v_cndmask_b32_e64 v2, v2, s3, vcc
11445; GFX1064_ITERATIVE-NEXT:    v_cndmask_b32_e64 v1, v1, s2, vcc
11446; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
11447; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, -1
11448; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
11449; GFX1064_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
11450; GFX1064_ITERATIVE-NEXT:    s_endpgm
11451;
11452; GFX1032_ITERATIVE-LABEL: max_i64_varying:
11453; GFX1032_ITERATIVE:       ; %bb.0: ; %entry
11454; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
11455; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, exec_lo
11456; GFX1032_ITERATIVE-NEXT:    s_brev_b32 s1, 1
11457; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s0, 0
11458; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
11459; GFX1032_ITERATIVE-NEXT:  .LBB23_1: ; %ComputeLoop
11460; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
11461; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s3, s2
11462; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
11463; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
11464; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s3
11465; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s3
11466; GFX1032_ITERATIVE-NEXT:    v_cmp_gt_i64_e64 s8, s[0:1], s[6:7]
11467; GFX1032_ITERATIVE-NEXT:    s_and_b32 s8, s8, exec_lo
11468; GFX1032_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s7
11469; GFX1032_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
11470; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
11471; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s2, s2, s3
11472; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
11473; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB23_1
11474; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
11475; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
11476; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
11477; GFX1032_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
11478; GFX1032_ITERATIVE-NEXT:    s_and_saveexec_b32 s2, vcc_lo
11479; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s2, exec_lo, s2
11480; GFX1032_ITERATIVE-NEXT:    s_cbranch_execz .LBB23_4
11481; GFX1032_ITERATIVE-NEXT:  ; %bb.3:
11482; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
11483; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
11484; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
11485; GFX1032_ITERATIVE-NEXT:    ds_max_rtn_i64 v[3:4], v0, v[3:4]
11486; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
11487; GFX1032_ITERATIVE-NEXT:    buffer_gl0_inv
11488; GFX1032_ITERATIVE-NEXT:  .LBB23_4:
11489; GFX1032_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
11490; GFX1032_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s2
11491; GFX1032_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
11492; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v4
11493; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
11494; GFX1032_ITERATIVE-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[1:2]
11495; GFX1032_ITERATIVE-NEXT:    v_cndmask_b32_e64 v2, v2, s3, vcc_lo
11496; GFX1032_ITERATIVE-NEXT:    v_cndmask_b32_e64 v1, v1, s2, vcc_lo
11497; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
11498; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, -1
11499; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
11500; GFX1032_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
11501; GFX1032_ITERATIVE-NEXT:    s_endpgm
11502;
11503; GFX1164_ITERATIVE-LABEL: max_i64_varying:
11504; GFX1164_ITERATIVE:       ; %bb.0: ; %entry
11505; GFX1164_ITERATIVE-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
11506; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
11507; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
11508; GFX1164_ITERATIVE-NEXT:    s_brev_b32 s1, 1
11509; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s0, 0
11510; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
11511; GFX1164_ITERATIVE-NEXT:    .p2align 6
11512; GFX1164_ITERATIVE-NEXT:  .LBB23_1: ; %ComputeLoop
11513; GFX1164_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
11514; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s10, s[2:3]
11515; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
11516; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s10
11517; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s10
11518; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s10
11519; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s10
11520; GFX1164_ITERATIVE-NEXT:    v_cmp_gt_i64_e64 s[8:9], s[0:1], s[6:7]
11521; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
11522; GFX1164_ITERATIVE-NEXT:    s_and_b64 s[8:9], s[8:9], exec
11523; GFX1164_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s7
11524; GFX1164_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
11525; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s10
11526; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[6:7]
11527; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
11528; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
11529; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB23_1
11530; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
11531; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
11532; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11533; GFX1164_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
11534; GFX1164_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
11535; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr2_vgpr3
11536; GFX1164_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
11537; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
11538; GFX1164_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
11539; GFX1164_ITERATIVE-NEXT:    s_cbranch_execz .LBB23_4
11540; GFX1164_ITERATIVE-NEXT:  ; %bb.3:
11541; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s1
11542; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v4, 0
11543; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s0
11544; GFX1164_ITERATIVE-NEXT:    ds_max_rtn_i64 v[2:3], v4, v[2:3]
11545; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
11546; GFX1164_ITERATIVE-NEXT:    buffer_gl0_inv
11547; GFX1164_ITERATIVE-NEXT:  .LBB23_4:
11548; GFX1164_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
11549; GFX1164_ITERATIVE-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
11550; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v3
11551; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v2
11552; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
11553; GFX1164_ITERATIVE-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
11554; GFX1164_ITERATIVE-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
11555; GFX1164_ITERATIVE-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
11556; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
11557; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
11558; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
11559; GFX1164_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
11560; GFX1164_ITERATIVE-NEXT:    s_endpgm
11561;
11562; GFX1132_ITERATIVE-LABEL: max_i64_varying:
11563; GFX1132_ITERATIVE:       ; %bb.0: ; %entry
11564; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0
11565; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, exec_lo
11566; GFX1132_ITERATIVE-NEXT:    s_brev_b32 s1, 1
11567; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s0, 0
11568; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
11569; GFX1132_ITERATIVE-NEXT:    .p2align 6
11570; GFX1132_ITERATIVE-NEXT:  .LBB23_1: ; %ComputeLoop
11571; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
11572; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s3, s2
11573; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
11574; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
11575; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
11576; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s3
11577; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
11578; GFX1132_ITERATIVE-NEXT:    v_cmp_gt_i64_e64 s8, s[0:1], s[6:7]
11579; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
11580; GFX1132_ITERATIVE-NEXT:    s_and_b32 s8, s8, exec_lo
11581; GFX1132_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s7
11582; GFX1132_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
11583; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
11584; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s2, s2, s3
11585; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
11586; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
11587; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB23_1
11588; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
11589; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
11590; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
11591; GFX1132_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
11592; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr2_vgpr3
11593; GFX1132_ITERATIVE-NEXT:    s_and_saveexec_b32 s2, vcc_lo
11594; GFX1132_ITERATIVE-NEXT:    s_xor_b32 s2, exec_lo, s2
11595; GFX1132_ITERATIVE-NEXT:    s_cbranch_execz .LBB23_4
11596; GFX1132_ITERATIVE-NEXT:  ; %bb.3:
11597; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1
11598; GFX1132_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s0
11599; GFX1132_ITERATIVE-NEXT:    ds_max_rtn_i64 v[2:3], v4, v[2:3]
11600; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
11601; GFX1132_ITERATIVE-NEXT:    buffer_gl0_inv
11602; GFX1132_ITERATIVE-NEXT:  .LBB23_4:
11603; GFX1132_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s2
11604; GFX1132_ITERATIVE-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
11605; GFX1132_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v3
11606; GFX1132_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v2
11607; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
11608; GFX1132_ITERATIVE-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1]
11609; GFX1132_ITERATIVE-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
11610; GFX1132_ITERATIVE-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
11611; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
11612; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
11613; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
11614; GFX1132_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
11615; GFX1132_ITERATIVE-NEXT:    s_endpgm
11616;
11617; GFX7LESS_DPP-LABEL: max_i64_varying:
11618; GFX7LESS_DPP:       ; %bb.0: ; %entry
11619; GFX7LESS_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
11620; GFX7LESS_DPP-NEXT:    v_mov_b32_e32 v1, 0
11621; GFX7LESS_DPP-NEXT:    s_mov_b32 m0, -1
11622; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
11623; GFX7LESS_DPP-NEXT:    ds_max_rtn_i64 v[0:1], v1, v[0:1]
11624; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
11625; GFX7LESS_DPP-NEXT:    s_mov_b32 s3, 0xf000
11626; GFX7LESS_DPP-NEXT:    s_mov_b32 s2, -1
11627; GFX7LESS_DPP-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
11628; GFX7LESS_DPP-NEXT:    s_endpgm
11629;
11630; GFX8_DPP-LABEL: max_i64_varying:
11631; GFX8_DPP:       ; %bb.0: ; %entry
11632; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
11633; GFX8_DPP-NEXT:    v_mov_b32_e32 v9, 0
11634; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v7
11635; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
11636; GFX8_DPP-NEXT:    v_bfrev_b32_e32 v2, 1
11637; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v4, v2, 0, s[0:1]
11638; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s[0:1]
11639; GFX8_DPP-NEXT:    v_bfrev_b32_e32 v6, 1
11640; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, 0
11641; GFX8_DPP-NEXT:    v_mov_b32_e32 v1, 0
11642; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
11643; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
11644; GFX8_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6]
11645; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
11646; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
11647; GFX8_DPP-NEXT:    v_bfrev_b32_e32 v6, 1
11648; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, 0
11649; GFX8_DPP-NEXT:    s_nop 0
11650; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
11651; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
11652; GFX8_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6]
11653; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
11654; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
11655; GFX8_DPP-NEXT:    v_bfrev_b32_e32 v6, 1
11656; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, 0
11657; GFX8_DPP-NEXT:    s_nop 0
11658; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
11659; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
11660; GFX8_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6]
11661; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
11662; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
11663; GFX8_DPP-NEXT:    v_bfrev_b32_e32 v6, 1
11664; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, 0
11665; GFX8_DPP-NEXT:    s_nop 0
11666; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
11667; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
11668; GFX8_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6]
11669; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
11670; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
11671; GFX8_DPP-NEXT:    v_bfrev_b32_e32 v6, 1
11672; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, 0
11673; GFX8_DPP-NEXT:    s_nop 0
11674; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
11675; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
11676; GFX8_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6]
11677; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
11678; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
11679; GFX8_DPP-NEXT:    v_bfrev_b32_e32 v6, 1
11680; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, 0
11681; GFX8_DPP-NEXT:    s_nop 0
11682; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
11683; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
11684; GFX8_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6]
11685; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
11686; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
11687; GFX8_DPP-NEXT:    v_readlane_b32 s3, v4, 63
11688; GFX8_DPP-NEXT:    v_readlane_b32 s2, v3, 63
11689; GFX8_DPP-NEXT:    v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
11690; GFX8_DPP-NEXT:    v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
11691; GFX8_DPP-NEXT:    s_mov_b64 exec, s[0:1]
11692; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
11693; GFX8_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
11694; GFX8_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
11695; GFX8_DPP-NEXT:    s_cbranch_execz .LBB23_2
11696; GFX8_DPP-NEXT:  ; %bb.1:
11697; GFX8_DPP-NEXT:    v_mov_b32_e32 v8, s3
11698; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, s2
11699; GFX8_DPP-NEXT:    s_mov_b32 m0, -1
11700; GFX8_DPP-NEXT:    ds_max_rtn_i64 v[7:8], v9, v[7:8]
11701; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
11702; GFX8_DPP-NEXT:  .LBB23_2:
11703; GFX8_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
11704; GFX8_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
11705; GFX8_DPP-NEXT:    v_readfirstlane_b32 s5, v8
11706; GFX8_DPP-NEXT:    v_readfirstlane_b32 s4, v7
11707; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, v1
11708; GFX8_DPP-NEXT:    v_mov_b32_e32 v8, v2
11709; GFX8_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[7:8]
11710; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, s5
11711; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v8, v8, v0, vcc
11712; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, s4
11713; GFX8_DPP-NEXT:    s_mov_b32 s3, 0xf000
11714; GFX8_DPP-NEXT:    s_mov_b32 s2, -1
11715; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v7, v7, v0, vcc
11716; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
11717; GFX8_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[0:3], 0
11718; GFX8_DPP-NEXT:    s_endpgm
11719;
11720; GFX9_DPP-LABEL: max_i64_varying:
11721; GFX9_DPP:       ; %bb.0: ; %entry
11722; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
11723; GFX9_DPP-NEXT:    v_mov_b32_e32 v9, 0
11724; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v7
11725; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
11726; GFX9_DPP-NEXT:    v_bfrev_b32_e32 v2, 1
11727; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v4, v2, 0, s[0:1]
11728; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s[0:1]
11729; GFX9_DPP-NEXT:    v_bfrev_b32_e32 v6, 1
11730; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, 0
11731; GFX9_DPP-NEXT:    v_mov_b32_e32 v1, 0
11732; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
11733; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
11734; GFX9_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6]
11735; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
11736; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
11737; GFX9_DPP-NEXT:    v_bfrev_b32_e32 v6, 1
11738; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, 0
11739; GFX9_DPP-NEXT:    s_nop 0
11740; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
11741; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
11742; GFX9_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6]
11743; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
11744; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
11745; GFX9_DPP-NEXT:    v_bfrev_b32_e32 v6, 1
11746; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, 0
11747; GFX9_DPP-NEXT:    s_nop 0
11748; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
11749; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
11750; GFX9_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6]
11751; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
11752; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
11753; GFX9_DPP-NEXT:    v_bfrev_b32_e32 v6, 1
11754; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, 0
11755; GFX9_DPP-NEXT:    s_nop 0
11756; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
11757; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
11758; GFX9_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6]
11759; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
11760; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
11761; GFX9_DPP-NEXT:    v_bfrev_b32_e32 v6, 1
11762; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, 0
11763; GFX9_DPP-NEXT:    s_nop 0
11764; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
11765; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
11766; GFX9_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6]
11767; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
11768; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
11769; GFX9_DPP-NEXT:    v_bfrev_b32_e32 v6, 1
11770; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, 0
11771; GFX9_DPP-NEXT:    s_nop 0
11772; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
11773; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
11774; GFX9_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6]
11775; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
11776; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
11777; GFX9_DPP-NEXT:    v_readlane_b32 s3, v4, 63
11778; GFX9_DPP-NEXT:    v_readlane_b32 s2, v3, 63
11779; GFX9_DPP-NEXT:    v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
11780; GFX9_DPP-NEXT:    v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
11781; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
11782; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
11783; GFX9_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
11784; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
11785; GFX9_DPP-NEXT:    s_cbranch_execz .LBB23_2
11786; GFX9_DPP-NEXT:  ; %bb.1:
11787; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, s3
11788; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, s2
11789; GFX9_DPP-NEXT:    ds_max_rtn_i64 v[7:8], v9, v[7:8]
11790; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
11791; GFX9_DPP-NEXT:  .LBB23_2:
11792; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
11793; GFX9_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
11794; GFX9_DPP-NEXT:    v_readfirstlane_b32 s5, v8
11795; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v7
11796; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, v1
11797; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, v2
11798; GFX9_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[7:8]
11799; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s5
11800; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v8, v8, v0, vcc
11801; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s4
11802; GFX9_DPP-NEXT:    s_mov_b32 s3, 0xf000
11803; GFX9_DPP-NEXT:    s_mov_b32 s2, -1
11804; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v7, v7, v0, vcc
11805; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
11806; GFX9_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[0:3], 0
11807; GFX9_DPP-NEXT:    s_endpgm
11808;
11809; GFX1064_DPP-LABEL: max_i64_varying:
11810; GFX1064_DPP:       ; %bb.0: ; %entry
11811; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
11812; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v2, 0x80000000, 0, s[0:1]
11813; GFX1064_DPP-NEXT:    v_bfrev_b32_e32 v4, 1
11814; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, 0
11815; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
11816; GFX1064_DPP-NEXT:    v_bfrev_b32_e32 v6, 1
11817; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, 0
11818; GFX1064_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf
11819; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
11820; GFX1064_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[1:2], v[3:4]
11821; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
11822; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
11823; GFX1064_DPP-NEXT:    v_bfrev_b32_e32 v4, 1
11824; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, 0
11825; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf
11826; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf
11827; GFX1064_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6]
11828; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
11829; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
11830; GFX1064_DPP-NEXT:    v_bfrev_b32_e32 v6, 1
11831; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, 0
11832; GFX1064_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
11833; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf
11834; GFX1064_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[1:2], v[3:4]
11835; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
11836; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
11837; GFX1064_DPP-NEXT:    v_bfrev_b32_e32 v4, 1
11838; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, 0
11839; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf
11840; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf
11841; GFX1064_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6]
11842; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
11843; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
11844; GFX1064_DPP-NEXT:    v_permlanex16_b32 v5, v2, -1, -1
11845; GFX1064_DPP-NEXT:    v_permlanex16_b32 v6, v1, -1, -1
11846; GFX1064_DPP-NEXT:    v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
11847; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
11848; GFX1064_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[1:2], v[3:4]
11849; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
11850; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
11851; GFX1064_DPP-NEXT:    v_bfrev_b32_e32 v4, 1
11852; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, 0
11853; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v2, 31
11854; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v1, 31
11855; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, s2
11856; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, s3
11857; GFX1064_DPP-NEXT:    v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
11858; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
11859; GFX1064_DPP-NEXT:    v_bfrev_b32_e32 v5, 1
11860; GFX1064_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[1:2], v[3:4]
11861; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
11862; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, 0
11863; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
11864; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
11865; GFX1064_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
11866; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
11867; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
11868; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v2, 15
11869; GFX1064_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf
11870; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v1, 15
11871; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v2, 31
11872; GFX1064_DPP-NEXT:    v_readlane_b32 s7, v1, 31
11873; GFX1064_DPP-NEXT:    v_writelane_b32 v5, s2, 16
11874; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 63
11875; GFX1064_DPP-NEXT:    v_writelane_b32 v4, s3, 16
11876; GFX1064_DPP-NEXT:    v_readlane_b32 s8, v2, 47
11877; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v2, 63
11878; GFX1064_DPP-NEXT:    v_readlane_b32 s9, v1, 47
11879; GFX1064_DPP-NEXT:    v_writelane_b32 v5, s6, 32
11880; GFX1064_DPP-NEXT:    v_writelane_b32 v4, s7, 32
11881; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
11882; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v0
11883; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, 0
11884; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
11885; GFX1064_DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
11886; GFX1064_DPP-NEXT:    v_writelane_b32 v5, s8, 48
11887; GFX1064_DPP-NEXT:    v_writelane_b32 v4, s9, 48
11888; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[6:7]
11889; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
11890; GFX1064_DPP-NEXT:    s_mov_b32 s2, -1
11891; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
11892; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[6:7], vcc
11893; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB23_2
11894; GFX1064_DPP-NEXT:  ; %bb.1:
11895; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, s1
11896; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, s0
11897; GFX1064_DPP-NEXT:    ds_max_rtn_i64 v[7:8], v0, v[7:8]
11898; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
11899; GFX1064_DPP-NEXT:    buffer_gl0_inv
11900; GFX1064_DPP-NEXT:  .LBB23_2:
11901; GFX1064_DPP-NEXT:    s_waitcnt_depctr 0xffe3
11902; GFX1064_DPP-NEXT:    s_or_b64 exec, exec, s[6:7]
11903; GFX1064_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
11904; GFX1064_DPP-NEXT:    s_mov_b32 null, 0
11905; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s5, v8
11906; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s4, v7
11907; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, v4
11908; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, v5
11909; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
11910; GFX1064_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[7:8]
11911; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v8, v8, s5, vcc
11912; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v7, v7, s4, vcc
11913; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
11914; GFX1064_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[0:3], 0
11915; GFX1064_DPP-NEXT:    s_endpgm
11916;
11917; GFX1032_DPP-LABEL: max_i64_varying:
11918; GFX1032_DPP:       ; %bb.0: ; %entry
11919; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s2, -1
11920; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v2, 0x80000000, 0, s2
11921; GFX1032_DPP-NEXT:    v_bfrev_b32_e32 v4, 1
11922; GFX1032_DPP-NEXT:    v_mov_b32_e32 v3, 0
11923; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s2
11924; GFX1032_DPP-NEXT:    v_bfrev_b32_e32 v6, 1
11925; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, 0
11926; GFX1032_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf
11927; GFX1032_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
11928; GFX1032_DPP-NEXT:    v_cmp_gt_i64_e32 vcc_lo, v[1:2], v[3:4]
11929; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc_lo
11930; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
11931; GFX1032_DPP-NEXT:    v_bfrev_b32_e32 v4, 1
11932; GFX1032_DPP-NEXT:    v_mov_b32_e32 v3, 0
11933; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf
11934; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf
11935; GFX1032_DPP-NEXT:    v_cmp_gt_i64_e32 vcc_lo, v[1:2], v[5:6]
11936; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc_lo
11937; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
11938; GFX1032_DPP-NEXT:    v_bfrev_b32_e32 v6, 1
11939; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, 0
11940; GFX1032_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
11941; GFX1032_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf
11942; GFX1032_DPP-NEXT:    v_cmp_gt_i64_e32 vcc_lo, v[1:2], v[3:4]
11943; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc_lo
11944; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
11945; GFX1032_DPP-NEXT:    v_bfrev_b32_e32 v4, 1
11946; GFX1032_DPP-NEXT:    v_mov_b32_e32 v3, 0
11947; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf
11948; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf
11949; GFX1032_DPP-NEXT:    v_cmp_gt_i64_e32 vcc_lo, v[1:2], v[5:6]
11950; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc_lo
11951; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
11952; GFX1032_DPP-NEXT:    v_permlanex16_b32 v5, v2, -1, -1
11953; GFX1032_DPP-NEXT:    v_permlanex16_b32 v6, v1, -1, -1
11954; GFX1032_DPP-NEXT:    v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
11955; GFX1032_DPP-NEXT:    v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
11956; GFX1032_DPP-NEXT:    v_bfrev_b32_e32 v5, 1
11957; GFX1032_DPP-NEXT:    v_cmp_gt_i64_e32 vcc_lo, v[1:2], v[3:4]
11958; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc_lo
11959; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
11960; GFX1032_DPP-NEXT:    v_mov_b32_e32 v4, 0
11961; GFX1032_DPP-NEXT:    v_readlane_b32 s3, v2, 15
11962; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v2, 31
11963; GFX1032_DPP-NEXT:    v_readlane_b32 s0, v1, 31
11964; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
11965; GFX1032_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf
11966; GFX1032_DPP-NEXT:    v_readlane_b32 s6, v1, 15
11967; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s2
11968; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
11969; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, 0
11970; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s2, -1
11971; GFX1032_DPP-NEXT:    v_writelane_b32 v5, s3, 16
11972; GFX1032_DPP-NEXT:    v_writelane_b32 v4, s6, 16
11973; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s2
11974; GFX1032_DPP-NEXT:    s_mov_b32 s2, -1
11975; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
11976; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
11977; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s3, vcc_lo
11978; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB23_2
11979; GFX1032_DPP-NEXT:  ; %bb.1:
11980; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, s1
11981; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, s0
11982; GFX1032_DPP-NEXT:    ds_max_rtn_i64 v[7:8], v0, v[7:8]
11983; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
11984; GFX1032_DPP-NEXT:    buffer_gl0_inv
11985; GFX1032_DPP-NEXT:  .LBB23_2:
11986; GFX1032_DPP-NEXT:    s_waitcnt_depctr 0xffe3
11987; GFX1032_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s3
11988; GFX1032_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
11989; GFX1032_DPP-NEXT:    s_mov_b32 null, 0
11990; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s5, v8
11991; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s4, v7
11992; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, v4
11993; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, v5
11994; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
11995; GFX1032_DPP-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[4:5], v[7:8]
11996; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v8, v8, s5, vcc_lo
11997; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v7, v7, s4, vcc_lo
11998; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
11999; GFX1032_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[0:3], 0
12000; GFX1032_DPP-NEXT:    s_endpgm
12001;
12002; GFX1164_DPP-LABEL: max_i64_varying:
12003; GFX1164_DPP:       ; %bb.0: ; %entry
12004; GFX1164_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
12005; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
12006; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
12007; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v2, 0x80000000, 0, s[0:1]
12008; GFX1164_DPP-NEXT:    v_bfrev_b32_e32 v4, 1
12009; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, 0
12010; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
12011; GFX1164_DPP-NEXT:    v_bfrev_b32_e32 v6, 1
12012; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, 0
12013; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf
12014; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
12015; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
12016; GFX1164_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[1:2], v[3:4]
12017; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
12018; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
12019; GFX1164_DPP-NEXT:    v_bfrev_b32_e32 v4, 1
12020; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, 0
12021; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
12022; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf
12023; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf
12024; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
12025; GFX1164_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6]
12026; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
12027; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
12028; GFX1164_DPP-NEXT:    v_bfrev_b32_e32 v6, 1
12029; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, 0
12030; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
12031; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
12032; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf
12033; GFX1164_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[1:2], v[3:4]
12034; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
12035; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
12036; GFX1164_DPP-NEXT:    v_bfrev_b32_e32 v4, 1
12037; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, 0
12038; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
12039; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf
12040; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf
12041; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
12042; GFX1164_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6]
12043; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
12044; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
12045; GFX1164_DPP-NEXT:    v_permlanex16_b32 v5, v2, -1, -1
12046; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
12047; GFX1164_DPP-NEXT:    v_permlanex16_b32 v6, v1, -1, -1
12048; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
12049; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
12050; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
12051; GFX1164_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[1:2], v[3:4]
12052; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
12053; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
12054; GFX1164_DPP-NEXT:    v_bfrev_b32_e32 v4, 1
12055; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, 0
12056; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
12057; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v2, 31
12058; GFX1164_DPP-NEXT:    v_readlane_b32 s3, v1, 31
12059; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
12060; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, s2
12061; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, s3
12062; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
12063; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
12064; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
12065; GFX1164_DPP-NEXT:    v_bfrev_b32_e32 v5, 1
12066; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
12067; GFX1164_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[1:2], v[3:4]
12068; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
12069; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, 0
12070; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
12071; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
12072; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
12073; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
12074; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
12075; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v2, 15
12076; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf
12077; GFX1164_DPP-NEXT:    v_readlane_b32 s3, v1, 15
12078; GFX1164_DPP-NEXT:    v_readlane_b32 s6, v2, 31
12079; GFX1164_DPP-NEXT:    v_readlane_b32 s7, v1, 31
12080; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s2, 16
12081; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v1, 63
12082; GFX1164_DPP-NEXT:    v_writelane_b32 v4, s3, 16
12083; GFX1164_DPP-NEXT:    v_readlane_b32 s8, v2, 47
12084; GFX1164_DPP-NEXT:    v_readlane_b32 s3, v2, 63
12085; GFX1164_DPP-NEXT:    v_readlane_b32 s9, v1, 47
12086; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s6, 32
12087; GFX1164_DPP-NEXT:    v_writelane_b32 v4, s7, 32
12088; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
12089; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
12090; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v0
12091; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, 0
12092; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
12093; GFX1164_DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
12094; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s8, 48
12095; GFX1164_DPP-NEXT:    v_writelane_b32 v4, s9, 48
12096; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[6:7]
12097; GFX1164_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
12098; GFX1164_DPP-NEXT:    s_mov_b32 s2, -1
12099; GFX1164_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
12100; GFX1164_DPP-NEXT:    s_and_saveexec_b64 s[6:7], vcc
12101; GFX1164_DPP-NEXT:    s_cbranch_execz .LBB23_2
12102; GFX1164_DPP-NEXT:  ; %bb.1:
12103; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, s1
12104; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, s0
12105; GFX1164_DPP-NEXT:    ds_max_rtn_i64 v[7:8], v0, v[7:8]
12106; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
12107; GFX1164_DPP-NEXT:    buffer_gl0_inv
12108; GFX1164_DPP-NEXT:  .LBB23_2:
12109; GFX1164_DPP-NEXT:    s_or_b64 exec, exec, s[6:7]
12110; GFX1164_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
12111; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s5, v8
12112; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s4, v7
12113; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, v4
12114; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, v5
12115; GFX1164_DPP-NEXT:    s_mov_b32 s3, 0x31016000
12116; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
12117; GFX1164_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[7:8]
12118; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v8, v8, s5, vcc
12119; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v7, v7, s4, vcc
12120; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
12121; GFX1164_DPP-NEXT:    buffer_store_b64 v[7:8], off, s[0:3], 0
12122; GFX1164_DPP-NEXT:    s_endpgm
12123;
12124; GFX1132_DPP-LABEL: max_i64_varying:
12125; GFX1132_DPP:       ; %bb.0: ; %entry
12126; GFX1132_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
12127; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s2, -1
12128; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
12129; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v2, 0x80000000, 0, s2
12130; GFX1132_DPP-NEXT:    v_bfrev_b32_e32 v4, 1
12131; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s2
12132; GFX1132_DPP-NEXT:    v_bfrev_b32_e32 v6, 1
12133; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
12134; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf
12135; GFX1132_DPP-NEXT:    v_mov_b32_e32 v3, 0
12136; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
12137; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
12138; GFX1132_DPP-NEXT:    v_cmp_gt_i64_e32 vcc_lo, v[1:2], v[3:4]
12139; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v1, v3, v1
12140; GFX1132_DPP-NEXT:    v_bfrev_b32_e32 v4, 1
12141; GFX1132_DPP-NEXT:    v_mov_b32_e32 v3, 0
12142; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf
12143; GFX1132_DPP-NEXT:    v_mov_b32_e32 v5, 0
12144; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12145; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf
12146; GFX1132_DPP-NEXT:    v_cmp_gt_i64_e32 vcc_lo, v[1:2], v[5:6]
12147; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v1, v5, v1
12148; GFX1132_DPP-NEXT:    v_bfrev_b32_e32 v6, 1
12149; GFX1132_DPP-NEXT:    v_mov_b32_e32 v5, 0
12150; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
12151; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
12152; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf
12153; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
12154; GFX1132_DPP-NEXT:    v_cmp_gt_i64_e32 vcc_lo, v[1:2], v[3:4]
12155; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v1, v3, v1
12156; GFX1132_DPP-NEXT:    v_bfrev_b32_e32 v4, 1
12157; GFX1132_DPP-NEXT:    v_mov_b32_e32 v3, 0
12158; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf
12159; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
12160; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf
12161; GFX1132_DPP-NEXT:    v_cmp_gt_i64_e32 vcc_lo, v[1:2], v[5:6]
12162; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v1, v5, v1
12163; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
12164; GFX1132_DPP-NEXT:    v_permlanex16_b32 v5, v2, -1, -1
12165; GFX1132_DPP-NEXT:    v_permlanex16_b32 v6, v1, -1, -1
12166; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
12167; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
12168; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
12169; GFX1132_DPP-NEXT:    v_bfrev_b32_e32 v5, 1
12170; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
12171; GFX1132_DPP-NEXT:    v_cmp_gt_i64_e32 vcc_lo, v[1:2], v[3:4]
12172; GFX1132_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc_lo
12173; GFX1132_DPP-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_cndmask_b32 v1, v3, v1
12174; GFX1132_DPP-NEXT:    v_readlane_b32 s3, v2, 15
12175; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v2, 31
12176; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
12177; GFX1132_DPP-NEXT:    v_readlane_b32 s0, v1, 31
12178; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
12179; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf
12180; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v1, 15
12181; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
12182; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
12183; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
12184; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s2, -1
12185; GFX1132_DPP-NEXT:    v_writelane_b32 v5, s3, 16
12186; GFX1132_DPP-NEXT:    v_writelane_b32 v4, s6, 16
12187; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
12188; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
12189; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
12190; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
12191; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s3, vcc_lo
12192; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB23_2
12193; GFX1132_DPP-NEXT:  ; %bb.1:
12194; GFX1132_DPP-NEXT:    v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0
12195; GFX1132_DPP-NEXT:    ds_max_rtn_i64 v[7:8], v0, v[7:8]
12196; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
12197; GFX1132_DPP-NEXT:    buffer_gl0_inv
12198; GFX1132_DPP-NEXT:  .LBB23_2:
12199; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s3
12200; GFX1132_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
12201; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s5, v8
12202; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s4, v7
12203; GFX1132_DPP-NEXT:    v_mov_b32_e32 v7, v4
12204; GFX1132_DPP-NEXT:    v_mov_b32_e32 v8, v5
12205; GFX1132_DPP-NEXT:    s_mov_b32 s3, 0x31016000
12206; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
12207; GFX1132_DPP-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[4:5], v[7:8]
12208; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v8, v8, s5, vcc_lo
12209; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v7, v7, s4, vcc_lo
12210; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
12211; GFX1132_DPP-NEXT:    buffer_store_b64 v[7:8], off, s[0:3], 0
12212; GFX1132_DPP-NEXT:    s_endpgm
12213entry:
12214  %lane = call i32 @llvm.amdgcn.workitem.id.x()
12215  %lane_ext = zext i32 %lane to i64
12216  %old = atomicrmw max ptr addrspace(3) @local_var32, i64 %lane_ext acq_rel
12217  store i64 %old, ptr addrspace(1) %out
12218  ret void
12219}
12220
12221define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
12222; GFX7LESS_ITERATIVE-LABEL: min_i32_varying:
12223; GFX7LESS_ITERATIVE:       ; %bb.0: ; %entry
12224; GFX7LESS_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
12225; GFX7LESS_ITERATIVE-NEXT:    s_brev_b32 s2, -2
12226; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
12227; GFX7LESS_ITERATIVE-NEXT:  .LBB24_1: ; %ComputeLoop
12228; GFX7LESS_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
12229; GFX7LESS_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
12230; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, s3
12231; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
12232; GFX7LESS_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
12233; GFX7LESS_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
12234; GFX7LESS_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
12235; GFX7LESS_ITERATIVE-NEXT:    v_cmp_ne_u64_e64 s[6:7], s[0:1], 0
12236; GFX7LESS_ITERATIVE-NEXT:    s_and_b64 vcc, exec, s[6:7]
12237; GFX7LESS_ITERATIVE-NEXT:    s_min_i32 s2, s2, s8
12238; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_vccnz .LBB24_1
12239; GFX7LESS_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
12240; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
12241; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
12242; GFX7LESS_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
12243; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
12244; GFX7LESS_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
12245; GFX7LESS_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
12246; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_execz .LBB24_4
12247; GFX7LESS_ITERATIVE-NEXT:  ; %bb.3:
12248; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
12249; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
12250; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, -1
12251; GFX7LESS_ITERATIVE-NEXT:    ds_min_rtn_i32 v0, v0, v2
12252; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
12253; GFX7LESS_ITERATIVE-NEXT:  .LBB24_4:
12254; GFX7LESS_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
12255; GFX7LESS_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
12256; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
12257; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s2, -1
12258; GFX7LESS_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v0
12259; GFX7LESS_ITERATIVE-NEXT:    v_min_i32_e32 v0, s4, v1
12260; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
12261; GFX7LESS_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
12262; GFX7LESS_ITERATIVE-NEXT:    s_endpgm
12263;
12264; GFX8_ITERATIVE-LABEL: min_i32_varying:
12265; GFX8_ITERATIVE:       ; %bb.0: ; %entry
12266; GFX8_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
12267; GFX8_ITERATIVE-NEXT:    s_brev_b32 s2, -2
12268; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
12269; GFX8_ITERATIVE-NEXT:  .LBB24_1: ; %ComputeLoop
12270; GFX8_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
12271; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
12272; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s3
12273; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
12274; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
12275; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
12276; GFX8_ITERATIVE-NEXT:    s_min_i32 s2, s2, s8
12277; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
12278; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
12279; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB24_1
12280; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
12281; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
12282; GFX8_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
12283; GFX8_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
12284; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
12285; GFX8_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
12286; GFX8_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
12287; GFX8_ITERATIVE-NEXT:    s_cbranch_execz .LBB24_4
12288; GFX8_ITERATIVE-NEXT:  ; %bb.3:
12289; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
12290; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
12291; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, -1
12292; GFX8_ITERATIVE-NEXT:    ds_min_rtn_i32 v0, v0, v2
12293; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
12294; GFX8_ITERATIVE-NEXT:  .LBB24_4:
12295; GFX8_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
12296; GFX8_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
12297; GFX8_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v0
12298; GFX8_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
12299; GFX8_ITERATIVE-NEXT:    s_mov_b32 s2, -1
12300; GFX8_ITERATIVE-NEXT:    v_min_i32_e32 v0, s4, v1
12301; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
12302; GFX8_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
12303; GFX8_ITERATIVE-NEXT:    s_endpgm
12304;
12305; GFX9_ITERATIVE-LABEL: min_i32_varying:
12306; GFX9_ITERATIVE:       ; %bb.0: ; %entry
12307; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
12308; GFX9_ITERATIVE-NEXT:    s_brev_b32 s2, -2
12309; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
12310; GFX9_ITERATIVE-NEXT:  .LBB24_1: ; %ComputeLoop
12311; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
12312; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
12313; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s3
12314; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
12315; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
12316; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
12317; GFX9_ITERATIVE-NEXT:    s_min_i32 s2, s2, s8
12318; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
12319; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
12320; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB24_1
12321; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
12322; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
12323; GFX9_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
12324; GFX9_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
12325; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
12326; GFX9_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
12327; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
12328; GFX9_ITERATIVE-NEXT:    s_cbranch_execz .LBB24_4
12329; GFX9_ITERATIVE-NEXT:  ; %bb.3:
12330; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
12331; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
12332; GFX9_ITERATIVE-NEXT:    ds_min_rtn_i32 v0, v0, v2
12333; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
12334; GFX9_ITERATIVE-NEXT:  .LBB24_4:
12335; GFX9_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
12336; GFX9_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
12337; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v0
12338; GFX9_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
12339; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, -1
12340; GFX9_ITERATIVE-NEXT:    v_min_i32_e32 v0, s4, v1
12341; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
12342; GFX9_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
12343; GFX9_ITERATIVE-NEXT:    s_endpgm
12344;
12345; GFX1064_ITERATIVE-LABEL: min_i32_varying:
12346; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
12347; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
12348; GFX1064_ITERATIVE-NEXT:    s_brev_b32 s2, -2
12349; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
12350; GFX1064_ITERATIVE-NEXT:  .LBB24_1: ; %ComputeLoop
12351; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
12352; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
12353; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
12354; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
12355; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, s3
12356; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
12357; GFX1064_ITERATIVE-NEXT:    s_min_i32 s2, s2, s8
12358; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
12359; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB24_1
12360; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
12361; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
12362; GFX1064_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
12363; GFX1064_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
12364; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
12365; GFX1064_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
12366; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
12367; GFX1064_ITERATIVE-NEXT:    s_cbranch_execz .LBB24_4
12368; GFX1064_ITERATIVE-NEXT:  ; %bb.3:
12369; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
12370; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
12371; GFX1064_ITERATIVE-NEXT:    ds_min_rtn_i32 v0, v0, v2
12372; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
12373; GFX1064_ITERATIVE-NEXT:    buffer_gl0_inv
12374; GFX1064_ITERATIVE-NEXT:  .LBB24_4:
12375; GFX1064_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
12376; GFX1064_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
12377; GFX1064_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
12378; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v0
12379; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
12380; GFX1064_ITERATIVE-NEXT:    v_min_i32_e32 v0, s2, v1
12381; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, -1
12382; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
12383; GFX1064_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
12384; GFX1064_ITERATIVE-NEXT:    s_endpgm
12385;
12386; GFX1032_ITERATIVE-LABEL: min_i32_varying:
12387; GFX1032_ITERATIVE:       ; %bb.0: ; %entry
12388; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s1, exec_lo
12389; GFX1032_ITERATIVE-NEXT:    s_brev_b32 s0, -2
12390; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
12391; GFX1032_ITERATIVE-NEXT:  .LBB24_1: ; %ComputeLoop
12392; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
12393; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s2, s1
12394; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
12395; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
12396; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s2
12397; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s6
12398; GFX1032_ITERATIVE-NEXT:    s_min_i32 s0, s0, s3
12399; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
12400; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB24_1
12401; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
12402; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
12403; GFX1032_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
12404; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
12405; GFX1032_ITERATIVE-NEXT:    s_and_saveexec_b32 s1, vcc_lo
12406; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s1, exec_lo, s1
12407; GFX1032_ITERATIVE-NEXT:    s_cbranch_execz .LBB24_4
12408; GFX1032_ITERATIVE-NEXT:  ; %bb.3:
12409; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
12410; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s0
12411; GFX1032_ITERATIVE-NEXT:    ds_min_rtn_i32 v0, v0, v2
12412; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
12413; GFX1032_ITERATIVE-NEXT:    buffer_gl0_inv
12414; GFX1032_ITERATIVE-NEXT:  .LBB24_4:
12415; GFX1032_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
12416; GFX1032_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s1
12417; GFX1032_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
12418; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v0
12419; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
12420; GFX1032_ITERATIVE-NEXT:    v_min_i32_e32 v0, s2, v1
12421; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, -1
12422; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
12423; GFX1032_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
12424; GFX1032_ITERATIVE-NEXT:    s_endpgm
12425;
12426; GFX1164_ITERATIVE-LABEL: min_i32_varying:
12427; GFX1164_ITERATIVE:       ; %bb.0: ; %entry
12428; GFX1164_ITERATIVE-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
12429; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
12430; GFX1164_ITERATIVE-NEXT:    s_brev_b32 s2, -2
12431; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
12432; GFX1164_ITERATIVE-NEXT:  .LBB24_1: ; %ComputeLoop
12433; GFX1164_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
12434; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s3, s[0:1]
12435; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
12436; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s8, v1, s3
12437; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
12438; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s2, s3
12439; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
12440; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2)
12441; GFX1164_ITERATIVE-NEXT:    s_min_i32 s2, s2, s8
12442; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
12443; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB24_1
12444; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
12445; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
12446; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12447; GFX1164_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
12448; GFX1164_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
12449; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
12450; GFX1164_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
12451; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
12452; GFX1164_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
12453; GFX1164_ITERATIVE-NEXT:    s_cbranch_execz .LBB24_4
12454; GFX1164_ITERATIVE-NEXT:  ; %bb.3:
12455; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v1, 0
12456; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
12457; GFX1164_ITERATIVE-NEXT:    ds_min_rtn_i32 v1, v1, v2
12458; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
12459; GFX1164_ITERATIVE-NEXT:    buffer_gl0_inv
12460; GFX1164_ITERATIVE-NEXT:  .LBB24_4:
12461; GFX1164_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
12462; GFX1164_ITERATIVE-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
12463; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v1
12464; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
12465; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
12466; GFX1164_ITERATIVE-NEXT:    v_min_i32_e32 v0, s2, v0
12467; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
12468; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
12469; GFX1164_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
12470; GFX1164_ITERATIVE-NEXT:    s_endpgm
12471;
12472; GFX1132_ITERATIVE-LABEL: min_i32_varying:
12473; GFX1132_ITERATIVE:       ; %bb.0: ; %entry
12474; GFX1132_ITERATIVE-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
12475; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s1, exec_lo
12476; GFX1132_ITERATIVE-NEXT:    s_brev_b32 s0, -2
12477; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
12478; GFX1132_ITERATIVE-NEXT:  .LBB24_1: ; %ComputeLoop
12479; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
12480; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s2, s1
12481; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
12482; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s3, v1, s2
12483; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
12484; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s2
12485; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s6
12486; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2)
12487; GFX1132_ITERATIVE-NEXT:    s_min_i32 s0, s0, s3
12488; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
12489; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB24_1
12490; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
12491; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
12492; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
12493; GFX1132_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
12494; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
12495; GFX1132_ITERATIVE-NEXT:    s_and_saveexec_b32 s1, vcc_lo
12496; GFX1132_ITERATIVE-NEXT:    s_xor_b32 s1, exec_lo, s1
12497; GFX1132_ITERATIVE-NEXT:    s_cbranch_execz .LBB24_4
12498; GFX1132_ITERATIVE-NEXT:  ; %bb.3:
12499; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
12500; GFX1132_ITERATIVE-NEXT:    ds_min_rtn_i32 v1, v1, v2
12501; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
12502; GFX1132_ITERATIVE-NEXT:    buffer_gl0_inv
12503; GFX1132_ITERATIVE-NEXT:  .LBB24_4:
12504; GFX1132_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s1
12505; GFX1132_ITERATIVE-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
12506; GFX1132_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v1
12507; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
12508; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
12509; GFX1132_ITERATIVE-NEXT:    v_min_i32_e32 v0, s2, v0
12510; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
12511; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
12512; GFX1132_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
12513; GFX1132_ITERATIVE-NEXT:    s_endpgm
12514;
12515; GFX7LESS_DPP-LABEL: min_i32_varying:
12516; GFX7LESS_DPP:       ; %bb.0: ; %entry
12517; GFX7LESS_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
12518; GFX7LESS_DPP-NEXT:    v_mov_b32_e32 v1, 0
12519; GFX7LESS_DPP-NEXT:    s_mov_b32 m0, -1
12520; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
12521; GFX7LESS_DPP-NEXT:    ds_min_rtn_i32 v0, v1, v0
12522; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
12523; GFX7LESS_DPP-NEXT:    s_mov_b32 s3, 0xf000
12524; GFX7LESS_DPP-NEXT:    s_mov_b32 s2, -1
12525; GFX7LESS_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
12526; GFX7LESS_DPP-NEXT:    s_endpgm
12527;
12528; GFX8_DPP-LABEL: min_i32_varying:
12529; GFX8_DPP:       ; %bb.0: ; %entry
12530; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
12531; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
12532; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
12533; GFX8_DPP-NEXT:    v_bfrev_b32_e32 v1, -2
12534; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v2, v1, v0, s[0:1]
12535; GFX8_DPP-NEXT:    s_nop 1
12536; GFX8_DPP-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
12537; GFX8_DPP-NEXT:    s_nop 1
12538; GFX8_DPP-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
12539; GFX8_DPP-NEXT:    s_nop 1
12540; GFX8_DPP-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
12541; GFX8_DPP-NEXT:    s_nop 1
12542; GFX8_DPP-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
12543; GFX8_DPP-NEXT:    s_nop 1
12544; GFX8_DPP-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
12545; GFX8_DPP-NEXT:    s_nop 1
12546; GFX8_DPP-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
12547; GFX8_DPP-NEXT:    v_readlane_b32 s2, v2, 63
12548; GFX8_DPP-NEXT:    s_nop 0
12549; GFX8_DPP-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
12550; GFX8_DPP-NEXT:    s_mov_b64 exec, s[0:1]
12551; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
12552; GFX8_DPP-NEXT:    ; implicit-def: $vgpr0
12553; GFX8_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
12554; GFX8_DPP-NEXT:    s_cbranch_execz .LBB24_2
12555; GFX8_DPP-NEXT:  ; %bb.1:
12556; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, 0
12557; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, s2
12558; GFX8_DPP-NEXT:    s_mov_b32 m0, -1
12559; GFX8_DPP-NEXT:    ds_min_rtn_i32 v0, v0, v3
12560; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
12561; GFX8_DPP-NEXT:  .LBB24_2:
12562; GFX8_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
12563; GFX8_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
12564; GFX8_DPP-NEXT:    v_readfirstlane_b32 s4, v0
12565; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, v1
12566; GFX8_DPP-NEXT:    s_mov_b32 s3, 0xf000
12567; GFX8_DPP-NEXT:    s_mov_b32 s2, -1
12568; GFX8_DPP-NEXT:    v_min_i32_e32 v0, s4, v0
12569; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
12570; GFX8_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
12571; GFX8_DPP-NEXT:    s_endpgm
12572;
12573; GFX9_DPP-LABEL: min_i32_varying:
12574; GFX9_DPP:       ; %bb.0: ; %entry
12575; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
12576; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
12577; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
12578; GFX9_DPP-NEXT:    v_bfrev_b32_e32 v1, -2
12579; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v2, v1, v0, s[0:1]
12580; GFX9_DPP-NEXT:    s_nop 1
12581; GFX9_DPP-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
12582; GFX9_DPP-NEXT:    s_nop 1
12583; GFX9_DPP-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
12584; GFX9_DPP-NEXT:    s_nop 1
12585; GFX9_DPP-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
12586; GFX9_DPP-NEXT:    s_nop 1
12587; GFX9_DPP-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
12588; GFX9_DPP-NEXT:    s_nop 1
12589; GFX9_DPP-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
12590; GFX9_DPP-NEXT:    s_nop 1
12591; GFX9_DPP-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
12592; GFX9_DPP-NEXT:    v_readlane_b32 s2, v2, 63
12593; GFX9_DPP-NEXT:    s_nop 0
12594; GFX9_DPP-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
12595; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
12596; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
12597; GFX9_DPP-NEXT:    ; implicit-def: $vgpr0
12598; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
12599; GFX9_DPP-NEXT:    s_cbranch_execz .LBB24_2
12600; GFX9_DPP-NEXT:  ; %bb.1:
12601; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, 0
12602; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, s2
12603; GFX9_DPP-NEXT:    ds_min_rtn_i32 v0, v0, v3
12604; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
12605; GFX9_DPP-NEXT:  .LBB24_2:
12606; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
12607; GFX9_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
12608; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v0
12609; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, v1
12610; GFX9_DPP-NEXT:    s_mov_b32 s3, 0xf000
12611; GFX9_DPP-NEXT:    s_mov_b32 s2, -1
12612; GFX9_DPP-NEXT:    v_min_i32_e32 v0, s4, v0
12613; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
12614; GFX9_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
12615; GFX9_DPP-NEXT:    s_endpgm
12616;
12617; GFX1064_DPP-LABEL: min_i32_varying:
12618; GFX1064_DPP:       ; %bb.0: ; %entry
12619; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
12620; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v1, 0x7fffffff, v0, s[0:1]
12621; GFX1064_DPP-NEXT:    v_bfrev_b32_e32 v3, -2
12622; GFX1064_DPP-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
12623; GFX1064_DPP-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
12624; GFX1064_DPP-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
12625; GFX1064_DPP-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
12626; GFX1064_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
12627; GFX1064_DPP-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
12628; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 31
12629; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, s2
12630; GFX1064_DPP-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
12631; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
12632; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 15
12633; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v1, 31
12634; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s2, 16
12635; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
12636; GFX1064_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
12637; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
12638; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 47
12639; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v1, 63
12640; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s3, 32
12641; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
12642; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
12643; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
12644; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s2, 48
12645; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
12646; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
12647; GFX1064_DPP-NEXT:    s_mov_b32 s2, -1
12648; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr0
12649; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
12650; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB24_2
12651; GFX1064_DPP-NEXT:  ; %bb.1:
12652; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, 0
12653; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, s6
12654; GFX1064_DPP-NEXT:    s_mov_b32 s3, s6
12655; GFX1064_DPP-NEXT:    ds_min_rtn_i32 v0, v0, v4
12656; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
12657; GFX1064_DPP-NEXT:    buffer_gl0_inv
12658; GFX1064_DPP-NEXT:  .LBB24_2:
12659; GFX1064_DPP-NEXT:    s_waitcnt_depctr 0xffe3
12660; GFX1064_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
12661; GFX1064_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
12662; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s3, v0
12663; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, v3
12664; GFX1064_DPP-NEXT:    v_min_i32_e32 v0, s3, v0
12665; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
12666; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
12667; GFX1064_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
12668; GFX1064_DPP-NEXT:    s_endpgm
12669;
12670; GFX1032_DPP-LABEL: min_i32_varying:
12671; GFX1032_DPP:       ; %bb.0: ; %entry
12672; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
12673; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v1, 0x7fffffff, v0, s0
12674; GFX1032_DPP-NEXT:    v_bfrev_b32_e32 v3, -2
12675; GFX1032_DPP-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
12676; GFX1032_DPP-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
12677; GFX1032_DPP-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
12678; GFX1032_DPP-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
12679; GFX1032_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
12680; GFX1032_DPP-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
12681; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v1, 15
12682; GFX1032_DPP-NEXT:    v_readlane_b32 s2, v1, 31
12683; GFX1032_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
12684; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
12685; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
12686; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
12687; GFX1032_DPP-NEXT:    v_writelane_b32 v3, s1, 16
12688; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
12689; GFX1032_DPP-NEXT:    s_mov_b32 s0, s2
12690; GFX1032_DPP-NEXT:    s_mov_b32 s2, -1
12691; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
12692; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr0
12693; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
12694; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB24_2
12695; GFX1032_DPP-NEXT:  ; %bb.1:
12696; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, 0
12697; GFX1032_DPP-NEXT:    v_mov_b32_e32 v4, s0
12698; GFX1032_DPP-NEXT:    ds_min_rtn_i32 v0, v0, v4
12699; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
12700; GFX1032_DPP-NEXT:    buffer_gl0_inv
12701; GFX1032_DPP-NEXT:  .LBB24_2:
12702; GFX1032_DPP-NEXT:    s_waitcnt_depctr 0xffe3
12703; GFX1032_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s1
12704; GFX1032_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
12705; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s3, v0
12706; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, v3
12707; GFX1032_DPP-NEXT:    v_min_i32_e32 v0, s3, v0
12708; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
12709; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
12710; GFX1032_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
12711; GFX1032_DPP-NEXT:    s_endpgm
12712;
12713; GFX1164_DPP-LABEL: min_i32_varying:
12714; GFX1164_DPP:       ; %bb.0: ; %entry
12715; GFX1164_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
12716; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
12717; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
12718; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v1, 0x7fffffff, v0, s[0:1]
12719; GFX1164_DPP-NEXT:    v_bfrev_b32_e32 v3, -2
12720; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
12721; GFX1164_DPP-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
12722; GFX1164_DPP-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
12723; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12724; GFX1164_DPP-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
12725; GFX1164_DPP-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
12726; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12727; GFX1164_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
12728; GFX1164_DPP-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
12729; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12730; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v1, 31
12731; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, s2
12732; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12733; GFX1164_DPP-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
12734; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
12735; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v1, 15
12736; GFX1164_DPP-NEXT:    v_readlane_b32 s3, v1, 31
12737; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
12738; GFX1164_DPP-NEXT:    v_writelane_b32 v3, s2, 16
12739; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
12740; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
12741; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
12742; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v1, 47
12743; GFX1164_DPP-NEXT:    v_readlane_b32 s6, v1, 63
12744; GFX1164_DPP-NEXT:    v_writelane_b32 v3, s3, 32
12745; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
12746; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
12747; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
12748; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
12749; GFX1164_DPP-NEXT:    v_writelane_b32 v3, s2, 48
12750; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
12751; GFX1164_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
12752; GFX1164_DPP-NEXT:    s_mov_b32 s2, -1
12753; GFX1164_DPP-NEXT:    ; implicit-def: $vgpr0
12754; GFX1164_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
12755; GFX1164_DPP-NEXT:    s_cbranch_execz .LBB24_2
12756; GFX1164_DPP-NEXT:  ; %bb.1:
12757; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, 0
12758; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, s6
12759; GFX1164_DPP-NEXT:    s_mov_b32 s3, s6
12760; GFX1164_DPP-NEXT:    ds_min_rtn_i32 v0, v0, v4
12761; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
12762; GFX1164_DPP-NEXT:    buffer_gl0_inv
12763; GFX1164_DPP-NEXT:  .LBB24_2:
12764; GFX1164_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
12765; GFX1164_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
12766; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s3, v0
12767; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, v3
12768; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
12769; GFX1164_DPP-NEXT:    v_min_i32_e32 v0, s3, v0
12770; GFX1164_DPP-NEXT:    s_mov_b32 s3, 0x31016000
12771; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
12772; GFX1164_DPP-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
12773; GFX1164_DPP-NEXT:    s_endpgm
12774;
12775; GFX1132_DPP-LABEL: min_i32_varying:
12776; GFX1132_DPP:       ; %bb.0: ; %entry
12777; GFX1132_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
12778; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
12779; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
12780; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v1, 0x7fffffff, v0, s0
12781; GFX1132_DPP-NEXT:    v_bfrev_b32_e32 v3, -2
12782; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
12783; GFX1132_DPP-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
12784; GFX1132_DPP-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
12785; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12786; GFX1132_DPP-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
12787; GFX1132_DPP-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
12788; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12789; GFX1132_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
12790; GFX1132_DPP-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
12791; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
12792; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v1, 15
12793; GFX1132_DPP-NEXT:    v_readlane_b32 s2, v1, 31
12794; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
12795; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
12796; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
12797; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
12798; GFX1132_DPP-NEXT:    v_writelane_b32 v3, s1, 16
12799; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
12800; GFX1132_DPP-NEXT:    s_mov_b32 s0, s2
12801; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
12802; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
12803; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr0
12804; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
12805; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB24_2
12806; GFX1132_DPP-NEXT:  ; %bb.1:
12807; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
12808; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, s0
12809; GFX1132_DPP-NEXT:    ds_min_rtn_i32 v0, v0, v4
12810; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
12811; GFX1132_DPP-NEXT:    buffer_gl0_inv
12812; GFX1132_DPP-NEXT:  .LBB24_2:
12813; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s1
12814; GFX1132_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
12815; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s3, v0
12816; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, v3
12817; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
12818; GFX1132_DPP-NEXT:    v_min_i32_e32 v0, s3, v0
12819; GFX1132_DPP-NEXT:    s_mov_b32 s3, 0x31016000
12820; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
12821; GFX1132_DPP-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
12822; GFX1132_DPP-NEXT:    s_endpgm
12823entry:
12824  %lane = call i32 @llvm.amdgcn.workitem.id.x()
12825  %old = atomicrmw min ptr addrspace(3) @local_var32, i32 %lane acq_rel
12826  store i32 %old, ptr addrspace(1) %out
12827  ret void
12828}
12829
; Signed 64-bit atomic min with a constant operand: the kernel performs an
; acq_rel `atomicrmw min` of the value i64 5 on @local_var64 in addrspace(3)
; and stores the returned old value through %out.
;
; Codegen captured by the assertions below: only the lane whose mbcnt result
; is 0 executes a single ds_min_rtn_i64 with the value 5. Once exec has been
; restored, the returned value is read back with v_readfirstlane and each lane
; stores the signed minimum of that value and a per-lane select, which is
; 0x7fffffffffffffff in the lane that issued the atomic and 5 in the others.
12830define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
12831; GFX7LESS-LABEL: min_i64_constant:
12832; GFX7LESS:       ; %bb.0: ; %entry
12833; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
12834; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
12835; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
12836; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
12837; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
12838; GFX7LESS-NEXT:    s_cbranch_execz .LBB25_2
12839; GFX7LESS-NEXT:  ; %bb.1:
12840; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
12841; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
12842; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
12843; GFX7LESS-NEXT:    s_mov_b32 m0, -1
12844; GFX7LESS-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
12845; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
12846; GFX7LESS-NEXT:  .LBB25_2:
12847; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
12848; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
12849; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
12850; GFX7LESS-NEXT:    s_mov_b32 s2, -1
12851; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
12852; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
12853; GFX7LESS-NEXT:    v_bfrev_b32_e32 v1, -2
12854; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
12855; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
12856; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
12857; GFX7LESS-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
12858; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
12859; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
12860; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
12861; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
12862; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
12863; GFX7LESS-NEXT:    s_endpgm
12864;
12865; GFX8-LABEL: min_i64_constant:
12866; GFX8:       ; %bb.0: ; %entry
12867; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
12868; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
12869; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
12870; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
12871; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
12872; GFX8-NEXT:    s_cbranch_execz .LBB25_2
12873; GFX8-NEXT:  ; %bb.1:
12874; GFX8-NEXT:    v_mov_b32_e32 v0, 5
12875; GFX8-NEXT:    v_mov_b32_e32 v1, 0
12876; GFX8-NEXT:    v_mov_b32_e32 v2, 0
12877; GFX8-NEXT:    s_mov_b32 m0, -1
12878; GFX8-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
12879; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
12880; GFX8-NEXT:  .LBB25_2:
12881; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
12882; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
12883; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
12884; GFX8-NEXT:    v_bfrev_b32_e32 v0, -2
12885; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
12886; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
12887; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
12888; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
12889; GFX8-NEXT:    v_mov_b32_e32 v2, s5
12890; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
12891; GFX8-NEXT:    v_mov_b32_e32 v2, s4
12892; GFX8-NEXT:    s_mov_b32 s3, 0xf000
12893; GFX8-NEXT:    s_mov_b32 s2, -1
12894; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
12895; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
12896; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
12897; GFX8-NEXT:    s_endpgm
12898;
12899; GFX9-LABEL: min_i64_constant:
12900; GFX9:       ; %bb.0: ; %entry
12901; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
12902; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
12903; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
12904; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
12905; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
12906; GFX9-NEXT:    s_cbranch_execz .LBB25_2
12907; GFX9-NEXT:  ; %bb.1:
12908; GFX9-NEXT:    v_mov_b32_e32 v0, 5
12909; GFX9-NEXT:    v_mov_b32_e32 v1, 0
12910; GFX9-NEXT:    v_mov_b32_e32 v2, 0
12911; GFX9-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
12912; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
12913; GFX9-NEXT:  .LBB25_2:
12914; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
12915; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
12916; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
12917; GFX9-NEXT:    v_bfrev_b32_e32 v0, -2
12918; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
12919; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
12920; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
12921; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
12922; GFX9-NEXT:    v_mov_b32_e32 v2, s5
12923; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
12924; GFX9-NEXT:    v_mov_b32_e32 v2, s4
12925; GFX9-NEXT:    s_mov_b32 s3, 0xf000
12926; GFX9-NEXT:    s_mov_b32 s2, -1
12927; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
12928; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
12929; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
12930; GFX9-NEXT:    s_endpgm
12931;
12932; GFX1064-LABEL: min_i64_constant:
12933; GFX1064:       ; %bb.0: ; %entry
12934; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
12935; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
12936; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
12937; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
12938; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
12939; GFX1064-NEXT:    s_cbranch_execz .LBB25_2
12940; GFX1064-NEXT:  ; %bb.1:
12941; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
12942; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
12943; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
12944; GFX1064-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
12945; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
12946; GFX1064-NEXT:    buffer_gl0_inv
12947; GFX1064-NEXT:  .LBB25_2:
12948; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
12949; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
12950; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
12951; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
12952; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
12953; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
12954; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
12955; GFX1064-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
12956; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
12957; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
12958; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
12959; GFX1064-NEXT:    s_mov_b32 s2, -1
12960; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
12961; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
12962; GFX1064-NEXT:    s_endpgm
12963;
12964; GFX1032-LABEL: min_i64_constant:
12965; GFX1032:       ; %bb.0: ; %entry
12966; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
12967; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
12968; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
12969; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
12970; GFX1032-NEXT:    s_cbranch_execz .LBB25_2
12971; GFX1032-NEXT:  ; %bb.1:
12972; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
12973; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
12974; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
12975; GFX1032-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
12976; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
12977; GFX1032-NEXT:    buffer_gl0_inv
12978; GFX1032-NEXT:  .LBB25_2:
12979; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
12980; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
12981; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
12982; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
12983; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
12984; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
12985; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
12986; GFX1032-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
12987; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
12988; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
12989; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
12990; GFX1032-NEXT:    s_mov_b32 s2, -1
12991; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
12992; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
12993; GFX1032-NEXT:    s_endpgm
12994;
12995; GFX1164-LABEL: min_i64_constant:
12996; GFX1164:       ; %bb.0: ; %entry
12997; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
12998; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12999; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
13000; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
13001; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
13002; GFX1164-NEXT:    s_and_saveexec_b64 s[0:1], vcc
13003; GFX1164-NEXT:    s_cbranch_execz .LBB25_2
13004; GFX1164-NEXT:  ; %bb.1:
13005; GFX1164-NEXT:    v_mov_b32_e32 v0, 5
13006; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
13007; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
13008; GFX1164-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
13009; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
13010; GFX1164-NEXT:    buffer_gl0_inv
13011; GFX1164-NEXT:  .LBB25_2:
13012; GFX1164-NEXT:    s_or_b64 exec, exec, s[0:1]
13013; GFX1164-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
13014; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
13015; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
13016; GFX1164-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
13017; GFX1164-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
13018; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
13019; GFX1164-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
13020; GFX1164-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
13021; GFX1164-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
13022; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
13023; GFX1164-NEXT:    s_mov_b32 s2, -1
13024; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
13025; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
13026; GFX1164-NEXT:    s_endpgm
13027;
13028; GFX1132-LABEL: min_i64_constant:
13029; GFX1132:       ; %bb.0: ; %entry
13030; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
13031; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
13032; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
13033; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
13034; GFX1132-NEXT:    s_and_saveexec_b32 s0, vcc_lo
13035; GFX1132-NEXT:    s_cbranch_execz .LBB25_2
13036; GFX1132-NEXT:  ; %bb.1:
13037; GFX1132-NEXT:    v_mov_b32_e32 v0, 5
13038; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
13039; GFX1132-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
13040; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
13041; GFX1132-NEXT:    buffer_gl0_inv
13042; GFX1132-NEXT:  .LBB25_2:
13043; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s0
13044; GFX1132-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
13045; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
13046; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
13047; GFX1132-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
13048; GFX1132-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
13049; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
13050; GFX1132-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
13051; GFX1132-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
13052; GFX1132-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
13053; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
13054; GFX1132-NEXT:    s_mov_b32 s2, -1
13055; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
13056; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
13057; GFX1132-NEXT:    s_endpgm
13058entry:
13059  %old = atomicrmw min ptr addrspace(3) @local_var64, i64 5 acq_rel
13060  store i64 %old, ptr addrspace(1) %out
13061  ret void
13062}
13063
13064define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
13065; GFX7LESS_ITERATIVE-LABEL: min_i64_varying:
13066; GFX7LESS_ITERATIVE:       ; %bb.0: ; %entry
13067; GFX7LESS_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
13068; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
13069; GFX7LESS_ITERATIVE-NEXT:    s_brev_b32 s1, -2
13070; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s0, -1
13071; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
13072; GFX7LESS_ITERATIVE-NEXT:  .LBB26_1: ; %ComputeLoop
13073; GFX7LESS_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
13074; GFX7LESS_ITERATIVE-NEXT:    s_ff1_i32_b64 s8, s[2:3]
13075; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, s8
13076; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s9, v3, s8
13077; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s10, v0, s8
13078; GFX7LESS_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
13079; GFX7LESS_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
13080; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s10
13081; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v5, s9
13082; GFX7LESS_ITERATIVE-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5]
13083; GFX7LESS_ITERATIVE-NEXT:    s_and_b64 s[6:7], vcc, exec
13084; GFX7LESS_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s9
13085; GFX7LESS_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s10
13086; GFX7LESS_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s8
13087; GFX7LESS_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
13088; GFX7LESS_ITERATIVE-NEXT:    v_cmp_ne_u64_e64 s[6:7], s[2:3], 0
13089; GFX7LESS_ITERATIVE-NEXT:    s_and_b64 vcc, exec, s[6:7]
13090; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_vccnz .LBB26_1
13091; GFX7LESS_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
13092; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
13093; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
13094; GFX7LESS_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
13095; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
13096; GFX7LESS_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
13097; GFX7LESS_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
13098; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_execz .LBB26_4
13099; GFX7LESS_ITERATIVE-NEXT:  ; %bb.3:
13100; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
13101; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
13102; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
13103; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, -1
13104; GFX7LESS_ITERATIVE-NEXT:    ds_min_rtn_i64 v[3:4], v0, v[3:4]
13105; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
13106; GFX7LESS_ITERATIVE-NEXT:  .LBB26_4:
13107; GFX7LESS_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
13108; GFX7LESS_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
13109; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
13110; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s2, -1
13111; GFX7LESS_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v4
13112; GFX7LESS_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v3
13113; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s5
13114; GFX7LESS_ITERATIVE-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[1:2]
13115; GFX7LESS_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
13116; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s4
13117; GFX7LESS_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
13118; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
13119; GFX7LESS_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
13120; GFX7LESS_ITERATIVE-NEXT:    s_endpgm
13121;
13122; GFX8_ITERATIVE-LABEL: min_i64_varying:
13123; GFX8_ITERATIVE:       ; %bb.0: ; %entry
13124; GFX8_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
13125; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
13126; GFX8_ITERATIVE-NEXT:    s_brev_b32 s1, -2
13127; GFX8_ITERATIVE-NEXT:    s_mov_b32 s0, -1
13128; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
13129; GFX8_ITERATIVE-NEXT:  .LBB26_1: ; %ComputeLoop
13130; GFX8_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
13131; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s8, s[2:3]
13132; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s9, v3, s8
13133; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s10, v0, s8
13134; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s10
13135; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v5, s9
13136; GFX8_ITERATIVE-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5]
13137; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s8
13138; GFX8_ITERATIVE-NEXT:    s_and_b64 s[6:7], vcc, exec
13139; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
13140; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
13141; GFX8_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s9
13142; GFX8_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s10
13143; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s8
13144; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
13145; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
13146; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB26_1
13147; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
13148; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
13149; GFX8_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
13150; GFX8_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
13151; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
13152; GFX8_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
13153; GFX8_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
13154; GFX8_ITERATIVE-NEXT:    s_cbranch_execz .LBB26_4
13155; GFX8_ITERATIVE-NEXT:  ; %bb.3:
13156; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
13157; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
13158; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
13159; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, -1
13160; GFX8_ITERATIVE-NEXT:    ds_min_rtn_i64 v[3:4], v0, v[3:4]
13161; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
13162; GFX8_ITERATIVE-NEXT:  .LBB26_4:
13163; GFX8_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
13164; GFX8_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
13165; GFX8_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v4
13166; GFX8_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v3
13167; GFX8_ITERATIVE-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[1:2]
13168; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s5
13169; GFX8_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
13170; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s4
13171; GFX8_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
13172; GFX8_ITERATIVE-NEXT:    s_mov_b32 s2, -1
13173; GFX8_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
13174; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
13175; GFX8_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
13176; GFX8_ITERATIVE-NEXT:    s_endpgm
13177;
13178; GFX9_ITERATIVE-LABEL: min_i64_varying:
13179; GFX9_ITERATIVE:       ; %bb.0: ; %entry
13180; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
13181; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
13182; GFX9_ITERATIVE-NEXT:    s_brev_b32 s1, -2
13183; GFX9_ITERATIVE-NEXT:    s_mov_b32 s0, -1
13184; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
13185; GFX9_ITERATIVE-NEXT:  .LBB26_1: ; %ComputeLoop
13186; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
13187; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s8, s[2:3]
13188; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s9, v3, s8
13189; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s10, v0, s8
13190; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s10
13191; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v5, s9
13192; GFX9_ITERATIVE-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5]
13193; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s8
13194; GFX9_ITERATIVE-NEXT:    s_and_b64 s[6:7], vcc, exec
13195; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
13196; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
13197; GFX9_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s9
13198; GFX9_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s10
13199; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s8
13200; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
13201; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
13202; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB26_1
13203; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
13204; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
13205; GFX9_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
13206; GFX9_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
13207; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
13208; GFX9_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
13209; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
13210; GFX9_ITERATIVE-NEXT:    s_cbranch_execz .LBB26_4
13211; GFX9_ITERATIVE-NEXT:  ; %bb.3:
13212; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
13213; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
13214; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
13215; GFX9_ITERATIVE-NEXT:    ds_min_rtn_i64 v[3:4], v0, v[3:4]
13216; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
13217; GFX9_ITERATIVE-NEXT:  .LBB26_4:
13218; GFX9_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
13219; GFX9_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
13220; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v4
13221; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v3
13222; GFX9_ITERATIVE-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[1:2]
13223; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s5
13224; GFX9_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
13225; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s4
13226; GFX9_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
13227; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, -1
13228; GFX9_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
13229; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
13230; GFX9_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
13231; GFX9_ITERATIVE-NEXT:    s_endpgm
13232;
13233; GFX1064_ITERATIVE-LABEL: min_i64_varying:
13234; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
13235; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
13236; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
13237; GFX1064_ITERATIVE-NEXT:    s_brev_b32 s1, -2
13238; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s0, -1
13239; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
13240; GFX1064_ITERATIVE-NEXT:  .LBB26_1: ; %ComputeLoop
13241; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
13242; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s10, s[2:3]
13243; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s10
13244; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s10
13245; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s10
13246; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s10
13247; GFX1064_ITERATIVE-NEXT:    v_cmp_lt_i64_e64 s[8:9], s[0:1], s[6:7]
13248; GFX1064_ITERATIVE-NEXT:    s_and_b64 s[8:9], s[8:9], exec
13249; GFX1064_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s7
13250; GFX1064_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
13251; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s10
13252; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
13253; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
13254; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB26_1
13255; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
13256; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
13257; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
13258; GFX1064_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
13259; GFX1064_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
13260; GFX1064_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
13261; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
13262; GFX1064_ITERATIVE-NEXT:    s_cbranch_execz .LBB26_4
13263; GFX1064_ITERATIVE-NEXT:  ; %bb.3:
13264; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
13265; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
13266; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
13267; GFX1064_ITERATIVE-NEXT:    ds_min_rtn_i64 v[3:4], v0, v[3:4]
13268; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
13269; GFX1064_ITERATIVE-NEXT:    buffer_gl0_inv
13270; GFX1064_ITERATIVE-NEXT:  .LBB26_4:
13271; GFX1064_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
13272; GFX1064_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
13273; GFX1064_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
13274; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v4
13275; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
13276; GFX1064_ITERATIVE-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2]
13277; GFX1064_ITERATIVE-NEXT:    v_cndmask_b32_e64 v2, v2, s3, vcc
13278; GFX1064_ITERATIVE-NEXT:    v_cndmask_b32_e64 v1, v1, s2, vcc
13279; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
13280; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, -1
13281; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
13282; GFX1064_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
13283; GFX1064_ITERATIVE-NEXT:    s_endpgm
13284;
13285; GFX1032_ITERATIVE-LABEL: min_i64_varying:
13286; GFX1032_ITERATIVE:       ; %bb.0: ; %entry
13287; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
13288; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, exec_lo
13289; GFX1032_ITERATIVE-NEXT:    s_brev_b32 s1, -2
13290; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s0, -1
13291; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
13292; GFX1032_ITERATIVE-NEXT:  .LBB26_1: ; %ComputeLoop
13293; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
13294; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s3, s2
13295; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
13296; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
13297; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s3
13298; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s3
13299; GFX1032_ITERATIVE-NEXT:    v_cmp_lt_i64_e64 s8, s[0:1], s[6:7]
13300; GFX1032_ITERATIVE-NEXT:    s_and_b32 s8, s8, exec_lo
13301; GFX1032_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s7
13302; GFX1032_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
13303; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
13304; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s2, s2, s3
13305; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
13306; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB26_1
13307; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
13308; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
13309; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
13310; GFX1032_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
13311; GFX1032_ITERATIVE-NEXT:    s_and_saveexec_b32 s2, vcc_lo
13312; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s2, exec_lo, s2
13313; GFX1032_ITERATIVE-NEXT:    s_cbranch_execz .LBB26_4
13314; GFX1032_ITERATIVE-NEXT:  ; %bb.3:
13315; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
13316; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
13317; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
13318; GFX1032_ITERATIVE-NEXT:    ds_min_rtn_i64 v[3:4], v0, v[3:4]
13319; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
13320; GFX1032_ITERATIVE-NEXT:    buffer_gl0_inv
13321; GFX1032_ITERATIVE-NEXT:  .LBB26_4:
13322; GFX1032_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
13323; GFX1032_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s2
13324; GFX1032_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
13325; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v4
13326; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
13327; GFX1032_ITERATIVE-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[1:2]
13328; GFX1032_ITERATIVE-NEXT:    v_cndmask_b32_e64 v2, v2, s3, vcc_lo
13329; GFX1032_ITERATIVE-NEXT:    v_cndmask_b32_e64 v1, v1, s2, vcc_lo
13330; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
13331; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, -1
13332; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
13333; GFX1032_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
13334; GFX1032_ITERATIVE-NEXT:    s_endpgm
13335;
; Expected assembly for min_i64_varying from the RUN line that uses
; -mcpu=gfx1100, +wavefrontsize64 and the Iterative optimizer strategy.
; These directives are autogenerated; the notes below only describe what the
; listed instructions show.
13336; GFX1164_ITERATIVE-LABEL: min_i64_varying:
13337; GFX1164_ITERATIVE:       ; %bb.0: ; %entry
13338; GFX1164_ITERATIVE-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
13339; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
13340; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
13341; GFX1164_ITERATIVE-NEXT:    s_brev_b32 s1, -2
13342; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s0, -1
13343; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
13344; GFX1164_ITERATIVE-NEXT:    .p2align 6
; Loop state on entry: s[2:3] is a copy of exec, s[0:1] is
; 0x7fffffff:0xffffffff (s_brev_b32 of -2 yields 0x7fffffff), and v[2:3] is
; v0 masked with 0x3ff plus a zero high half.
; Each pass picks one set bit of s[2:3], reads v[2:3] at that lane, writes the
; current s[0:1] into v[0:1] at that lane, keeps the signed-smaller value in
; s[0:1], and clears the bit; the loop repeats while s[2:3] is non-zero.
13345; GFX1164_ITERATIVE-NEXT:  .LBB26_1: ; %ComputeLoop
13346; GFX1164_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
13347; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s10, s[2:3]
13348; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
13349; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s10
13350; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s10
13351; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s10
13352; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s10
13353; GFX1164_ITERATIVE-NEXT:    v_cmp_lt_i64_e64 s[8:9], s[0:1], s[6:7]
13354; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
13355; GFX1164_ITERATIVE-NEXT:    s_and_b64 s[8:9], s[8:9], exec
13356; GFX1164_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s7
13357; GFX1164_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
13358; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s10
13359; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[6:7]
13360; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
13361; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
13362; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB26_1
; After the loop, only lanes whose mbcnt_lo/mbcnt_hi result equals 0 enter
; %bb.3, which issues the single ds_min_rtn_i64 using s[0:1] as the operand.
13363; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
13364; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
13365; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
13366; GFX1164_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
13367; GFX1164_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
13368; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr2_vgpr3
13369; GFX1164_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
13370; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
13371; GFX1164_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
13372; GFX1164_ITERATIVE-NEXT:    s_cbranch_execz .LBB26_4
13373; GFX1164_ITERATIVE-NEXT:  ; %bb.3:
13374; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s1
13375; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v4, 0
13376; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s0
13377; GFX1164_ITERATIVE-NEXT:    ds_min_rtn_i64 v[2:3], v4, v[2:3]
13378; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
13379; GFX1164_ITERATIVE-NEXT:    buffer_gl0_inv
; Join point: exec is OR-ed back with s[2:3], the first-lane copy of the
; returned v[2:3] is compared (signed 64-bit less-than) against each lane's
; v[0:1], the smaller value is selected, and the result is stored through the
; buffer descriptor built in s[0:3].
13380; GFX1164_ITERATIVE-NEXT:  .LBB26_4:
13381; GFX1164_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
13382; GFX1164_ITERATIVE-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
13383; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v3
13384; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v2
13385; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
13386; GFX1164_ITERATIVE-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
13387; GFX1164_ITERATIVE-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
13388; GFX1164_ITERATIVE-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
13389; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
13390; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
13391; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
13392; GFX1164_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
13393; GFX1164_ITERATIVE-NEXT:    s_endpgm
13394;
; Expected assembly for min_i64_varying from the RUN line that uses
; -mcpu=gfx1100, +wavefrontsize32 and the Iterative optimizer strategy.
; Same shape as the wave64 listing, but the lane mask lives in a single
; 32-bit SGPR (s2) and the 32-bit scalar forms of the mask ops are used.
13395; GFX1132_ITERATIVE-LABEL: min_i64_varying:
13396; GFX1132_ITERATIVE:       ; %bb.0: ; %entry
13397; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0
13398; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, exec_lo
13399; GFX1132_ITERATIVE-NEXT:    s_brev_b32 s1, -2
13400; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s0, -1
13401; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
13402; GFX1132_ITERATIVE-NEXT:    .p2align 6
; Loop state on entry: s2 is a copy of exec_lo and s[0:1] is
; 0x7fffffff:0xffffffff. Each pass handles the lane indexed by
; s_ctz_i32_b32 of s2: read v[2:3] at that lane, write the current s[0:1]
; into v[0:1] at that lane, keep the signed-smaller value in s[0:1], clear
; the lane's bit in s2, and repeat while s2 is non-zero.
13403; GFX1132_ITERATIVE-NEXT:  .LBB26_1: ; %ComputeLoop
13404; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
13405; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s3, s2
13406; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
13407; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
13408; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
13409; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s3
13410; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
13411; GFX1132_ITERATIVE-NEXT:    v_cmp_lt_i64_e64 s8, s[0:1], s[6:7]
13412; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
13413; GFX1132_ITERATIVE-NEXT:    s_and_b32 s8, s8, exec_lo
13414; GFX1132_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s7
13415; GFX1132_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
13416; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
13417; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s2, s2, s3
13418; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
13419; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
13420; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB26_1
; Only lanes whose mbcnt_lo result equals 0 enter %bb.3 and issue the single
; ds_min_rtn_i64 with s[0:1] as the operand.
13421; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
13422; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
13423; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
13424; GFX1132_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
13425; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr2_vgpr3
13426; GFX1132_ITERATIVE-NEXT:    s_and_saveexec_b32 s2, vcc_lo
13427; GFX1132_ITERATIVE-NEXT:    s_xor_b32 s2, exec_lo, s2
13428; GFX1132_ITERATIVE-NEXT:    s_cbranch_execz .LBB26_4
13429; GFX1132_ITERATIVE-NEXT:  ; %bb.3:
13430; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1
13431; GFX1132_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s0
13432; GFX1132_ITERATIVE-NEXT:    ds_min_rtn_i64 v[2:3], v4, v[2:3]
13433; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
13434; GFX1132_ITERATIVE-NEXT:    buffer_gl0_inv
; Join point: exec_lo is OR-ed back with s2, the first-lane copy of the
; returned v[2:3] is compared (signed 64-bit less-than) against each lane's
; v[0:1], the smaller value is selected, and the result is stored.
13435; GFX1132_ITERATIVE-NEXT:  .LBB26_4:
13436; GFX1132_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s2
13437; GFX1132_ITERATIVE-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
13438; GFX1132_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v3
13439; GFX1132_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v2
13440; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
13441; GFX1132_ITERATIVE-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
13442; GFX1132_ITERATIVE-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
13443; GFX1132_ITERATIVE-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
13444; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
13445; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
13446; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
13447; GFX1132_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
13448; GFX1132_ITERATIVE-NEXT:    s_endpgm
13449;
; Expected assembly for min_i64_varying from the RUN line that passes only
; -mtriple=amdgcn (no -mcpu) with the DPP optimizer strategy.
; The listing contains no cross-lane reduction: every active lane issues
; ds_min_rtn_i64 directly (after m0 is set to -1) and stores the returned
; 64-bit value.
13450; GFX7LESS_DPP-LABEL: min_i64_varying:
13451; GFX7LESS_DPP:       ; %bb.0: ; %entry
13452; GFX7LESS_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
13453; GFX7LESS_DPP-NEXT:    v_mov_b32_e32 v1, 0
13454; GFX7LESS_DPP-NEXT:    s_mov_b32 m0, -1
13455; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
13456; GFX7LESS_DPP-NEXT:    ds_min_rtn_i64 v[0:1], v1, v[0:1]
13457; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
13458; GFX7LESS_DPP-NEXT:    s_mov_b32 s3, 0xf000
13459; GFX7LESS_DPP-NEXT:    s_mov_b32 s2, -1
13460; GFX7LESS_DPP-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
13461; GFX7LESS_DPP-NEXT:    s_endpgm
13462;
; Expected assembly for min_i64_varying from the RUN line that uses
; -mcpu=tonga with the DPP optimizer strategy.
; These directives are autogenerated; the notes below only describe what the
; listed instructions show.
13463; GFX8_DPP-LABEL: min_i64_varying:
13464; GFX8_DPP:       ; %bb.0: ; %entry
13465; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
13466; GFX8_DPP-NEXT:    v_mov_b32_e32 v9, 0
13467; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v7
; All lanes are enabled here (s_or_saveexec_b64 with -1; the previous exec is
; kept in s[0:1]). v[3:4] becomes v0 with a zero high half in lanes whose
; s[0:1] bit is set, and 0x7fffffff:0xffffffff in the remaining lanes.
13468; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
13469; GFX8_DPP-NEXT:    v_bfrev_b32_e32 v2, -2
13470; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v4, v2, 0, s[0:1]
13471; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v3, -1, v0, s[0:1]
13472; GFX8_DPP-NEXT:    v_bfrev_b32_e32 v6, -2
13473; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, -1
13474; GFX8_DPP-NEXT:    v_mov_b32_e32 v1, -1
; Six DPP steps follow (row_shr 1, 2, 4, 8, then row_bcast 15 and 31). Each
; step presets v[5:6] to 0x7fffffff:0xffffffff, moves v[3:4] from another
; lane into v[5:6], and keeps the signed-smaller 64-bit value in v[3:4].
13475; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
13476; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
13477; GFX8_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6]
13478; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
13479; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
13480; GFX8_DPP-NEXT:    v_bfrev_b32_e32 v6, -2
13481; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, -1
13482; GFX8_DPP-NEXT:    s_nop 0
13483; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
13484; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
13485; GFX8_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6]
13486; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
13487; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
13488; GFX8_DPP-NEXT:    v_bfrev_b32_e32 v6, -2
13489; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, -1
13490; GFX8_DPP-NEXT:    s_nop 0
13491; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
13492; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
13493; GFX8_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6]
13494; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
13495; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
13496; GFX8_DPP-NEXT:    v_bfrev_b32_e32 v6, -2
13497; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, -1
13498; GFX8_DPP-NEXT:    s_nop 0
13499; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
13500; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
13501; GFX8_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6]
13502; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
13503; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
13504; GFX8_DPP-NEXT:    v_bfrev_b32_e32 v6, -2
13505; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, -1
13506; GFX8_DPP-NEXT:    s_nop 0
13507; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
13508; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
13509; GFX8_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6]
13510; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
13511; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
13512; GFX8_DPP-NEXT:    v_bfrev_b32_e32 v6, -2
13513; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, -1
13514; GFX8_DPP-NEXT:    s_nop 0
13515; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
13516; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
13517; GFX8_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6]
13518; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
13519; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
; Lane 63 of v[3:4] is read into s[2:3]; wave_shr:1 then copies v[3:4]
; shifted by one lane into v[1:2] (preset to 0x7fffffff:0xffffffff above).
13520; GFX8_DPP-NEXT:    v_readlane_b32 s3, v4, 63
13521; GFX8_DPP-NEXT:    v_readlane_b32 s2, v3, 63
13522; GFX8_DPP-NEXT:    v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
13523; GFX8_DPP-NEXT:    v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
; The saved exec is restored; only lanes whose mbcnt result (v7) is 0 enter
; %bb.1 and issue the single ds_min_rtn_i64 with s[2:3] as the operand.
13524; GFX8_DPP-NEXT:    s_mov_b64 exec, s[0:1]
13525; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
13526; GFX8_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
13527; GFX8_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
13528; GFX8_DPP-NEXT:    s_cbranch_execz .LBB26_2
13529; GFX8_DPP-NEXT:  ; %bb.1:
13530; GFX8_DPP-NEXT:    v_mov_b32_e32 v8, s3
13531; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, s2
13532; GFX8_DPP-NEXT:    s_mov_b32 m0, -1
13533; GFX8_DPP-NEXT:    ds_min_rtn_i64 v[7:8], v9, v[7:8]
13534; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
; Join point: the first-lane copy of the returned v[7:8] (in s[4:5]) is
; compared, signed 64-bit less-than, against each lane's v[1:2]; the smaller
; value is selected into v[7:8] and stored.
13535; GFX8_DPP-NEXT:  .LBB26_2:
13536; GFX8_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
13537; GFX8_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
13538; GFX8_DPP-NEXT:    v_readfirstlane_b32 s5, v8
13539; GFX8_DPP-NEXT:    v_readfirstlane_b32 s4, v7
13540; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, v1
13541; GFX8_DPP-NEXT:    v_mov_b32_e32 v8, v2
13542; GFX8_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[7:8]
13543; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, s5
13544; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v8, v8, v0, vcc
13545; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, s4
13546; GFX8_DPP-NEXT:    s_mov_b32 s3, 0xf000
13547; GFX8_DPP-NEXT:    s_mov_b32 s2, -1
13548; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v7, v7, v0, vcc
13549; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
13550; GFX8_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[0:3], 0
13551; GFX8_DPP-NEXT:    s_endpgm
13552;
; Expected assembly for min_i64_varying from the RUN line that uses
; -mcpu=gfx900 with the DPP optimizer strategy.
; The listing matches the tonga one instruction for instruction, except that
; no write to m0 appears before the ds_min_rtn_i64 in %bb.1.
13553; GFX9_DPP-LABEL: min_i64_varying:
13554; GFX9_DPP:       ; %bb.0: ; %entry
13555; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
13556; GFX9_DPP-NEXT:    v_mov_b32_e32 v9, 0
13557; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v7
; All lanes are enabled here (s_or_saveexec_b64 with -1; the previous exec is
; kept in s[0:1]). v[3:4] becomes v0 with a zero high half in lanes whose
; s[0:1] bit is set, and 0x7fffffff:0xffffffff in the remaining lanes.
13558; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
13559; GFX9_DPP-NEXT:    v_bfrev_b32_e32 v2, -2
13560; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v4, v2, 0, s[0:1]
13561; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v3, -1, v0, s[0:1]
13562; GFX9_DPP-NEXT:    v_bfrev_b32_e32 v6, -2
13563; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, -1
13564; GFX9_DPP-NEXT:    v_mov_b32_e32 v1, -1
; Six DPP steps follow (row_shr 1, 2, 4, 8, then row_bcast 15 and 31). Each
; step presets v[5:6] to 0x7fffffff:0xffffffff, moves v[3:4] from another
; lane into v[5:6], and keeps the signed-smaller 64-bit value in v[3:4].
13565; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
13566; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
13567; GFX9_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6]
13568; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
13569; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
13570; GFX9_DPP-NEXT:    v_bfrev_b32_e32 v6, -2
13571; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, -1
13572; GFX9_DPP-NEXT:    s_nop 0
13573; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
13574; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
13575; GFX9_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6]
13576; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
13577; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
13578; GFX9_DPP-NEXT:    v_bfrev_b32_e32 v6, -2
13579; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, -1
13580; GFX9_DPP-NEXT:    s_nop 0
13581; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
13582; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
13583; GFX9_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6]
13584; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
13585; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
13586; GFX9_DPP-NEXT:    v_bfrev_b32_e32 v6, -2
13587; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, -1
13588; GFX9_DPP-NEXT:    s_nop 0
13589; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
13590; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
13591; GFX9_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6]
13592; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
13593; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
13594; GFX9_DPP-NEXT:    v_bfrev_b32_e32 v6, -2
13595; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, -1
13596; GFX9_DPP-NEXT:    s_nop 0
13597; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
13598; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
13599; GFX9_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6]
13600; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
13601; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
13602; GFX9_DPP-NEXT:    v_bfrev_b32_e32 v6, -2
13603; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, -1
13604; GFX9_DPP-NEXT:    s_nop 0
13605; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
13606; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
13607; GFX9_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6]
13608; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
13609; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
; Lane 63 of v[3:4] is read into s[2:3]; wave_shr:1 then copies v[3:4]
; shifted by one lane into v[1:2] (preset to 0x7fffffff:0xffffffff above).
13610; GFX9_DPP-NEXT:    v_readlane_b32 s3, v4, 63
13611; GFX9_DPP-NEXT:    v_readlane_b32 s2, v3, 63
13612; GFX9_DPP-NEXT:    v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
13613; GFX9_DPP-NEXT:    v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
; The saved exec is restored; only lanes whose mbcnt result (v7) is 0 enter
; %bb.1 and issue the single ds_min_rtn_i64 with s[2:3] as the operand.
13614; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
13615; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
13616; GFX9_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
13617; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
13618; GFX9_DPP-NEXT:    s_cbranch_execz .LBB26_2
13619; GFX9_DPP-NEXT:  ; %bb.1:
13620; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, s3
13621; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, s2
13622; GFX9_DPP-NEXT:    ds_min_rtn_i64 v[7:8], v9, v[7:8]
13623; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
; Join point: the first-lane copy of the returned v[7:8] (in s[4:5]) is
; compared, signed 64-bit less-than, against each lane's v[1:2]; the smaller
; value is selected into v[7:8] and stored.
13624; GFX9_DPP-NEXT:  .LBB26_2:
13625; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
13626; GFX9_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
13627; GFX9_DPP-NEXT:    v_readfirstlane_b32 s5, v8
13628; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v7
13629; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, v1
13630; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, v2
13631; GFX9_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[7:8]
13632; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s5
13633; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v8, v8, v0, vcc
13634; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s4
13635; GFX9_DPP-NEXT:    s_mov_b32 s3, 0xf000
13636; GFX9_DPP-NEXT:    s_mov_b32 s2, -1
13637; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v7, v7, v0, vcc
13638; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
13639; GFX9_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[0:3], 0
13640; GFX9_DPP-NEXT:    s_endpgm
13641;
; Expected assembly for min_i64_varying under the wave64 gfx10 DPP check
; prefix. NOTE(review): the matching RUN line is not among the visible RUN
; lines; presumably -mcpu=gfx1010 +wavefrontsize64 with the DPP strategy --
; confirm against the file header.
13642; GFX1064_DPP-LABEL: min_i64_varying:
13643; GFX1064_DPP:       ; %bb.0: ; %entry
; All lanes are enabled (s_or_saveexec_b64 with -1; previous exec kept in
; s[0:1]). v[1:2] becomes v0 with a zero high half in lanes whose s[0:1] bit
; is set, and 0x7fffffff:0xffffffff in the remaining lanes.
13644; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
13645; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v2, 0x7fffffff, 0, s[0:1]
13646; GFX1064_DPP-NEXT:    v_bfrev_b32_e32 v4, -2
13647; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, -1
13648; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[0:1]
13649; GFX1064_DPP-NEXT:    v_bfrev_b32_e32 v6, -2
13650; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, -1
; Four row_shr steps (1, 2, 4, 8) alternate between v[3:4] and v[5:6] as the
; DPP destination; each keeps the signed-smaller 64-bit value in v[1:2].
13651; GFX1064_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf
13652; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
13653; GFX1064_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[1:2], v[3:4]
13654; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
13655; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
13656; GFX1064_DPP-NEXT:    v_bfrev_b32_e32 v4, -2
13657; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, -1
13658; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf
13659; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf
13660; GFX1064_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6]
13661; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
13662; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
13663; GFX1064_DPP-NEXT:    v_bfrev_b32_e32 v6, -2
13664; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, -1
13665; GFX1064_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
13666; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf
13667; GFX1064_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[1:2], v[3:4]
13668; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
13669; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
13670; GFX1064_DPP-NEXT:    v_bfrev_b32_e32 v4, -2
13671; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, -1
13672; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf
13673; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf
13674; GFX1064_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6]
13675; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
13676; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
; Next step: v_permlanex16_b32 results are merged under row_mask 0xa and
; folded into v[1:2] with the same compare/select pattern.
13677; GFX1064_DPP-NEXT:    v_permlanex16_b32 v5, v2, -1, -1
13678; GFX1064_DPP-NEXT:    v_permlanex16_b32 v6, v1, -1, -1
13679; GFX1064_DPP-NEXT:    v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
13680; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
13681; GFX1064_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[1:2], v[3:4]
13682; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
13683; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
13684; GFX1064_DPP-NEXT:    v_bfrev_b32_e32 v4, -2
13685; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, -1
; Final step: lane 31 of v[1:2] is read into SGPRs, copied to VGPRs, merged
; under row_mask 0xc, and folded into v[1:2].
13686; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v2, 31
13687; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v1, 31
13688; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, s2
13689; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, s3
13690; GFX1064_DPP-NEXT:    v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
13691; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
13692; GFX1064_DPP-NEXT:    v_bfrev_b32_e32 v5, -2
13693; GFX1064_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[1:2], v[3:4]
13694; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
13695; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, -1
13696; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
; Shift phase: v[4:5] receives v[1:2] via row_shr:1; lanes 16, 32 and 48 of
; v[4:5] are then overwritten with the values read from lanes 15, 31 and 47
; of v[1:2], and lane 63 of v[1:2] is kept in s[2:3] (later copied to
; s[0:1]). The mbcnt instructions run with the original exec restored in
; between.
13697; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
13698; GFX1064_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
13699; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
13700; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
13701; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v2, 15
13702; GFX1064_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf
13703; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v1, 15
13704; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v2, 31
13705; GFX1064_DPP-NEXT:    v_readlane_b32 s7, v1, 31
13706; GFX1064_DPP-NEXT:    v_writelane_b32 v5, s2, 16
13707; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 63
13708; GFX1064_DPP-NEXT:    v_writelane_b32 v4, s3, 16
13709; GFX1064_DPP-NEXT:    v_readlane_b32 s8, v2, 47
13710; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v2, 63
13711; GFX1064_DPP-NEXT:    v_readlane_b32 s9, v1, 47
13712; GFX1064_DPP-NEXT:    v_writelane_b32 v5, s6, 32
13713; GFX1064_DPP-NEXT:    v_writelane_b32 v4, s7, 32
13714; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
13715; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v0
13716; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, 0
13717; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
13718; GFX1064_DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
13719; GFX1064_DPP-NEXT:    v_writelane_b32 v5, s8, 48
13720; GFX1064_DPP-NEXT:    v_writelane_b32 v4, s9, 48
13721; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[6:7]
; Only lanes whose mbcnt result (v7) is 0 enter %bb.1 and issue the single
; ds_min_rtn_i64 with s[0:1] as the operand.
13722; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
13723; GFX1064_DPP-NEXT:    s_mov_b32 s2, -1
13724; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
13725; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[6:7], vcc
13726; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB26_2
13727; GFX1064_DPP-NEXT:  ; %bb.1:
13728; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, s1
13729; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, s0
13730; GFX1064_DPP-NEXT:    ds_min_rtn_i64 v[7:8], v0, v[7:8]
13731; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
13732; GFX1064_DPP-NEXT:    buffer_gl0_inv
; Join point: the first-lane copy of the returned v[7:8] (in s[4:5]) is
; compared, signed 64-bit less-than, against each lane's v[4:5]; the smaller
; value is selected into v[7:8] and stored.
13733; GFX1064_DPP-NEXT:  .LBB26_2:
13734; GFX1064_DPP-NEXT:    s_waitcnt_depctr 0xffe3
13735; GFX1064_DPP-NEXT:    s_or_b64 exec, exec, s[6:7]
13736; GFX1064_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
13737; GFX1064_DPP-NEXT:    s_mov_b32 null, 0
13738; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s5, v8
13739; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s4, v7
13740; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, v4
13741; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, v5
13742; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
13743; GFX1064_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[7:8]
13744; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v8, v8, s5, vcc
13745; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v7, v7, s4, vcc
13746; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
13747; GFX1064_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[0:3], 0
13748; GFX1064_DPP-NEXT:    s_endpgm
13749;
; Expected assembly for min_i64_varying under the wave32 gfx10 DPP check
; prefix. NOTE(review): the matching RUN line is not among the visible RUN
; lines; presumably -mcpu=gfx1010 +wavefrontsize32 with the DPP strategy --
; confirm against the file header.
13750; GFX1032_DPP-LABEL: min_i64_varying:
13751; GFX1032_DPP:       ; %bb.0: ; %entry
; All lanes are enabled (s_or_saveexec_b32 with -1; previous exec_lo kept in
; s2). v[1:2] becomes v0 with a zero high half in lanes whose s2 bit is set,
; and 0x7fffffff:0xffffffff in the remaining lanes.
13752; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s2, -1
13753; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v2, 0x7fffffff, 0, s2
13754; GFX1032_DPP-NEXT:    v_bfrev_b32_e32 v4, -2
13755; GFX1032_DPP-NEXT:    v_mov_b32_e32 v3, -1
13756; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s2
13757; GFX1032_DPP-NEXT:    v_bfrev_b32_e32 v6, -2
13758; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, -1
; Four row_shr steps (1, 2, 4, 8) alternate between v[3:4] and v[5:6] as the
; DPP destination; each keeps the signed-smaller 64-bit value in v[1:2].
13759; GFX1032_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf
13760; GFX1032_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
13761; GFX1032_DPP-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[1:2], v[3:4]
13762; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc_lo
13763; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
13764; GFX1032_DPP-NEXT:    v_bfrev_b32_e32 v4, -2
13765; GFX1032_DPP-NEXT:    v_mov_b32_e32 v3, -1
13766; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf
13767; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf
13768; GFX1032_DPP-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[1:2], v[5:6]
13769; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc_lo
13770; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
13771; GFX1032_DPP-NEXT:    v_bfrev_b32_e32 v6, -2
13772; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, -1
13773; GFX1032_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
13774; GFX1032_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf
13775; GFX1032_DPP-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[1:2], v[3:4]
13776; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc_lo
13777; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
13778; GFX1032_DPP-NEXT:    v_bfrev_b32_e32 v4, -2
13779; GFX1032_DPP-NEXT:    v_mov_b32_e32 v3, -1
13780; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf
13781; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf
13782; GFX1032_DPP-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[1:2], v[5:6]
13783; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc_lo
13784; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
; Last combine step: v_permlanex16_b32 results are merged under row_mask 0xa
; and folded into v[1:2] with the same compare/select pattern.
13785; GFX1032_DPP-NEXT:    v_permlanex16_b32 v5, v2, -1, -1
13786; GFX1032_DPP-NEXT:    v_permlanex16_b32 v6, v1, -1, -1
13787; GFX1032_DPP-NEXT:    v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
13788; GFX1032_DPP-NEXT:    v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
13789; GFX1032_DPP-NEXT:    v_bfrev_b32_e32 v5, -2
13790; GFX1032_DPP-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[1:2], v[3:4]
13791; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc_lo
13792; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
13793; GFX1032_DPP-NEXT:    v_mov_b32_e32 v4, -1
; Shift phase: lane 31 of v[1:2] is read into s[0:1]; v[4:5] receives v[1:2]
; via row_shr:1, and lane 16 of v[4:5] is then overwritten with the value
; read from lane 15 of v[1:2]. The mbcnt instruction runs with the original
; exec_lo restored in between.
13794; GFX1032_DPP-NEXT:    v_readlane_b32 s3, v2, 15
13795; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v2, 31
13796; GFX1032_DPP-NEXT:    v_readlane_b32 s0, v1, 31
13797; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
13798; GFX1032_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf
13799; GFX1032_DPP-NEXT:    v_readlane_b32 s6, v1, 15
13800; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s2
13801; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
13802; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, 0
13803; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s2, -1
13804; GFX1032_DPP-NEXT:    v_writelane_b32 v5, s3, 16
13805; GFX1032_DPP-NEXT:    v_writelane_b32 v4, s6, 16
13806; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s2
13807; GFX1032_DPP-NEXT:    s_mov_b32 s2, -1
; Only lanes whose mbcnt result (v7) is 0 enter %bb.1 and issue the single
; ds_min_rtn_i64 with s[0:1] as the operand.
13808; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
13809; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
13810; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s3, vcc_lo
13811; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB26_2
13812; GFX1032_DPP-NEXT:  ; %bb.1:
13813; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, s1
13814; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, s0
13815; GFX1032_DPP-NEXT:    ds_min_rtn_i64 v[7:8], v0, v[7:8]
13816; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
13817; GFX1032_DPP-NEXT:    buffer_gl0_inv
; Join point: the first-lane copy of the returned v[7:8] (in s[4:5]) is
; compared, signed 64-bit less-than, against each lane's v[4:5]; the smaller
; value is selected into v[7:8] and stored.
13818; GFX1032_DPP-NEXT:  .LBB26_2:
13819; GFX1032_DPP-NEXT:    s_waitcnt_depctr 0xffe3
13820; GFX1032_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s3
13821; GFX1032_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
13822; GFX1032_DPP-NEXT:    s_mov_b32 null, 0
13823; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s5, v8
13824; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s4, v7
13825; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, v4
13826; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, v5
13827; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
13828; GFX1032_DPP-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[4:5], v[7:8]
13829; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v8, v8, s5, vcc_lo
13830; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v7, v7, s4, vcc_lo
13831; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
13832; GFX1032_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[0:3], 0
13833; GFX1032_DPP-NEXT:    s_endpgm
13834;
13835; GFX1164_DPP-LABEL: min_i64_varying:
13836; GFX1164_DPP:       ; %bb.0: ; %entry
13837; GFX1164_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
13838; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
13839; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
13840; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v2, 0x7fffffff, 0, s[0:1]
13841; GFX1164_DPP-NEXT:    v_bfrev_b32_e32 v4, -2
13842; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, -1
13843; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[0:1]
13844; GFX1164_DPP-NEXT:    v_bfrev_b32_e32 v6, -2
13845; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, -1
13846; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf
13847; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
13848; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
13849; GFX1164_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[1:2], v[3:4]
13850; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
13851; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
13852; GFX1164_DPP-NEXT:    v_bfrev_b32_e32 v4, -2
13853; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, -1
13854; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
13855; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf
13856; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf
13857; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
13858; GFX1164_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6]
13859; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
13860; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
13861; GFX1164_DPP-NEXT:    v_bfrev_b32_e32 v6, -2
13862; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, -1
13863; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
13864; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
13865; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf
13866; GFX1164_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[1:2], v[3:4]
13867; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
13868; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
13869; GFX1164_DPP-NEXT:    v_bfrev_b32_e32 v4, -2
13870; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, -1
13871; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
13872; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf
13873; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf
13874; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
13875; GFX1164_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6]
13876; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
13877; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
13878; GFX1164_DPP-NEXT:    v_permlanex16_b32 v5, v2, -1, -1
13879; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
13880; GFX1164_DPP-NEXT:    v_permlanex16_b32 v6, v1, -1, -1
13881; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
13882; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
13883; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
13884; GFX1164_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[1:2], v[3:4]
13885; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
13886; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
13887; GFX1164_DPP-NEXT:    v_bfrev_b32_e32 v4, -2
13888; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, -1
13889; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
13890; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v2, 31
13891; GFX1164_DPP-NEXT:    v_readlane_b32 s3, v1, 31
13892; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
13893; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, s2
13894; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, s3
13895; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
13896; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
13897; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
13898; GFX1164_DPP-NEXT:    v_bfrev_b32_e32 v5, -2
13899; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
13900; GFX1164_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[1:2], v[3:4]
13901; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
13902; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, -1
13903; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
13904; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
13905; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
13906; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
13907; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
13908; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v2, 15
13909; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf
13910; GFX1164_DPP-NEXT:    v_readlane_b32 s3, v1, 15
13911; GFX1164_DPP-NEXT:    v_readlane_b32 s6, v2, 31
13912; GFX1164_DPP-NEXT:    v_readlane_b32 s7, v1, 31
13913; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s2, 16
13914; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v1, 63
13915; GFX1164_DPP-NEXT:    v_writelane_b32 v4, s3, 16
13916; GFX1164_DPP-NEXT:    v_readlane_b32 s8, v2, 47
13917; GFX1164_DPP-NEXT:    v_readlane_b32 s3, v2, 63
13918; GFX1164_DPP-NEXT:    v_readlane_b32 s9, v1, 47
13919; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s6, 32
13920; GFX1164_DPP-NEXT:    v_writelane_b32 v4, s7, 32
13921; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
13922; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
13923; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v0
13924; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, 0
13925; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
13926; GFX1164_DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
13927; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s8, 48
13928; GFX1164_DPP-NEXT:    v_writelane_b32 v4, s9, 48
13929; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[6:7]
13930; GFX1164_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
13931; GFX1164_DPP-NEXT:    s_mov_b32 s2, -1
13932; GFX1164_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
13933; GFX1164_DPP-NEXT:    s_and_saveexec_b64 s[6:7], vcc
13934; GFX1164_DPP-NEXT:    s_cbranch_execz .LBB26_2
13935; GFX1164_DPP-NEXT:  ; %bb.1:
13936; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, s1
13937; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, s0
13938; GFX1164_DPP-NEXT:    ds_min_rtn_i64 v[7:8], v0, v[7:8]
13939; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
13940; GFX1164_DPP-NEXT:    buffer_gl0_inv
13941; GFX1164_DPP-NEXT:  .LBB26_2:
13942; GFX1164_DPP-NEXT:    s_or_b64 exec, exec, s[6:7]
13943; GFX1164_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
13944; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s5, v8
13945; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s4, v7
13946; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, v4
13947; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, v5
13948; GFX1164_DPP-NEXT:    s_mov_b32 s3, 0x31016000
13949; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
13950; GFX1164_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[7:8]
13951; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v8, v8, s5, vcc
13952; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v7, v7, s4, vcc
13953; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
13954; GFX1164_DPP-NEXT:    buffer_store_b64 v[7:8], off, s[0:3], 0
13955; GFX1164_DPP-NEXT:    s_endpgm
13956;
13957; GFX1132_DPP-LABEL: min_i64_varying:
13958; GFX1132_DPP:       ; %bb.0: ; %entry
13959; GFX1132_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
13960; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s2, -1
13961; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
13962; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v2, 0x7fffffff, 0, s2
13963; GFX1132_DPP-NEXT:    v_bfrev_b32_e32 v4, -2
13964; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s2
13965; GFX1132_DPP-NEXT:    v_bfrev_b32_e32 v6, -2
13966; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
13967; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf
13968; GFX1132_DPP-NEXT:    v_mov_b32_e32 v3, -1
13969; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
13970; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
13971; GFX1132_DPP-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[1:2], v[3:4]
13972; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v1, v3, v1
13973; GFX1132_DPP-NEXT:    v_bfrev_b32_e32 v4, -2
13974; GFX1132_DPP-NEXT:    v_mov_b32_e32 v3, -1
13975; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf
13976; GFX1132_DPP-NEXT:    v_mov_b32_e32 v5, -1
13977; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
13978; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf
13979; GFX1132_DPP-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[1:2], v[5:6]
13980; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v1, v5, v1
13981; GFX1132_DPP-NEXT:    v_bfrev_b32_e32 v6, -2
13982; GFX1132_DPP-NEXT:    v_mov_b32_e32 v5, -1
13983; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
13984; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
13985; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf
13986; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
13987; GFX1132_DPP-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[1:2], v[3:4]
13988; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v1, v3, v1
13989; GFX1132_DPP-NEXT:    v_bfrev_b32_e32 v4, -2
13990; GFX1132_DPP-NEXT:    v_mov_b32_e32 v3, -1
13991; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf
13992; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
13993; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf
13994; GFX1132_DPP-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[1:2], v[5:6]
13995; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v1, v5, v1
13996; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
13997; GFX1132_DPP-NEXT:    v_permlanex16_b32 v5, v2, -1, -1
13998; GFX1132_DPP-NEXT:    v_permlanex16_b32 v6, v1, -1, -1
13999; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
14000; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
14001; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
14002; GFX1132_DPP-NEXT:    v_bfrev_b32_e32 v5, -2
14003; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
14004; GFX1132_DPP-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[1:2], v[3:4]
14005; GFX1132_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc_lo
14006; GFX1132_DPP-NEXT:    v_dual_mov_b32 v4, -1 :: v_dual_cndmask_b32 v1, v3, v1
14007; GFX1132_DPP-NEXT:    v_readlane_b32 s3, v2, 15
14008; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v2, 31
14009; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
14010; GFX1132_DPP-NEXT:    v_readlane_b32 s0, v1, 31
14011; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
14012; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf
14013; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v1, 15
14014; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
14015; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
14016; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
14017; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s2, -1
14018; GFX1132_DPP-NEXT:    v_writelane_b32 v5, s3, 16
14019; GFX1132_DPP-NEXT:    v_writelane_b32 v4, s6, 16
14020; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
14021; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
14022; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
14023; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
14024; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s3, vcc_lo
14025; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB26_2
14026; GFX1132_DPP-NEXT:  ; %bb.1:
14027; GFX1132_DPP-NEXT:    v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0
14028; GFX1132_DPP-NEXT:    ds_min_rtn_i64 v[7:8], v0, v[7:8]
14029; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
14030; GFX1132_DPP-NEXT:    buffer_gl0_inv
14031; GFX1132_DPP-NEXT:  .LBB26_2:
14032; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s3
14033; GFX1132_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
14034; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s5, v8
14035; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s4, v7
14036; GFX1132_DPP-NEXT:    v_mov_b32_e32 v7, v4
14037; GFX1132_DPP-NEXT:    v_mov_b32_e32 v8, v5
14038; GFX1132_DPP-NEXT:    s_mov_b32 s3, 0x31016000
14039; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
14040; GFX1132_DPP-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[4:5], v[7:8]
14041; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v8, v8, s5, vcc_lo
14042; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v7, v7, s4, vcc_lo
14043; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
14044; GFX1132_DPP-NEXT:    buffer_store_b64 v[7:8], off, s[0:3], 0
14045; GFX1132_DPP-NEXT:    s_endpgm
14046entry:
14047  %lane = call i32 @llvm.amdgcn.workitem.id.x()
14048  %lane_ext = zext i32 %lane to i64
14049  %old = atomicrmw min ptr addrspace(3) @local_var32, i64 %lane_ext acq_rel
14050  store i64 %old, ptr addrspace(1) %out
14051  ret void
14052}
14053
14054define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
14055; GFX7LESS_ITERATIVE-LABEL: umax_i32_varying:
14056; GFX7LESS_ITERATIVE:       ; %bb.0: ; %entry
14057; GFX7LESS_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
14058; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s2, 0
14059; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
14060; GFX7LESS_ITERATIVE-NEXT:  .LBB27_1: ; %ComputeLoop
14061; GFX7LESS_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
14062; GFX7LESS_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
14063; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, s3
14064; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
14065; GFX7LESS_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
14066; GFX7LESS_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
14067; GFX7LESS_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
14068; GFX7LESS_ITERATIVE-NEXT:    v_cmp_ne_u64_e64 s[6:7], s[0:1], 0
14069; GFX7LESS_ITERATIVE-NEXT:    s_and_b64 vcc, exec, s[6:7]
14070; GFX7LESS_ITERATIVE-NEXT:    s_max_u32 s2, s2, s8
14071; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_vccnz .LBB27_1
14072; GFX7LESS_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
14073; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
14074; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
14075; GFX7LESS_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
14076; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
14077; GFX7LESS_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
14078; GFX7LESS_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
14079; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_execz .LBB27_4
14080; GFX7LESS_ITERATIVE-NEXT:  ; %bb.3:
14081; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
14082; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
14083; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, -1
14084; GFX7LESS_ITERATIVE-NEXT:    ds_max_rtn_u32 v0, v0, v2
14085; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
14086; GFX7LESS_ITERATIVE-NEXT:  .LBB27_4:
14087; GFX7LESS_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
14088; GFX7LESS_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
14089; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
14090; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s2, -1
14091; GFX7LESS_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v0
14092; GFX7LESS_ITERATIVE-NEXT:    v_max_u32_e32 v0, s4, v1
14093; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
14094; GFX7LESS_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
14095; GFX7LESS_ITERATIVE-NEXT:    s_endpgm
14096;
14097; GFX8_ITERATIVE-LABEL: umax_i32_varying:
14098; GFX8_ITERATIVE:       ; %bb.0: ; %entry
14099; GFX8_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
14100; GFX8_ITERATIVE-NEXT:    s_mov_b32 s2, 0
14101; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
14102; GFX8_ITERATIVE-NEXT:  .LBB27_1: ; %ComputeLoop
14103; GFX8_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
14104; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
14105; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s3
14106; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
14107; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
14108; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
14109; GFX8_ITERATIVE-NEXT:    s_max_u32 s2, s2, s8
14110; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
14111; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
14112; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB27_1
14113; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
14114; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
14115; GFX8_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
14116; GFX8_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
14117; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
14118; GFX8_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
14119; GFX8_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
14120; GFX8_ITERATIVE-NEXT:    s_cbranch_execz .LBB27_4
14121; GFX8_ITERATIVE-NEXT:  ; %bb.3:
14122; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
14123; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
14124; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, -1
14125; GFX8_ITERATIVE-NEXT:    ds_max_rtn_u32 v0, v0, v2
14126; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
14127; GFX8_ITERATIVE-NEXT:  .LBB27_4:
14128; GFX8_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
14129; GFX8_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
14130; GFX8_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v0
14131; GFX8_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
14132; GFX8_ITERATIVE-NEXT:    s_mov_b32 s2, -1
14133; GFX8_ITERATIVE-NEXT:    v_max_u32_e32 v0, s4, v1
14134; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
14135; GFX8_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
14136; GFX8_ITERATIVE-NEXT:    s_endpgm
14137;
14138; GFX9_ITERATIVE-LABEL: umax_i32_varying:
14139; GFX9_ITERATIVE:       ; %bb.0: ; %entry
14140; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
14141; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, 0
14142; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
14143; GFX9_ITERATIVE-NEXT:  .LBB27_1: ; %ComputeLoop
14144; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
14145; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
14146; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s3
14147; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
14148; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
14149; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
14150; GFX9_ITERATIVE-NEXT:    s_max_u32 s2, s2, s8
14151; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
14152; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
14153; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB27_1
14154; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
14155; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
14156; GFX9_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
14157; GFX9_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
14158; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
14159; GFX9_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
14160; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
14161; GFX9_ITERATIVE-NEXT:    s_cbranch_execz .LBB27_4
14162; GFX9_ITERATIVE-NEXT:  ; %bb.3:
14163; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
14164; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
14165; GFX9_ITERATIVE-NEXT:    ds_max_rtn_u32 v0, v0, v2
14166; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
14167; GFX9_ITERATIVE-NEXT:  .LBB27_4:
14168; GFX9_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
14169; GFX9_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
14170; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v0
14171; GFX9_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
14172; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, -1
14173; GFX9_ITERATIVE-NEXT:    v_max_u32_e32 v0, s4, v1
14174; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
14175; GFX9_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
14176; GFX9_ITERATIVE-NEXT:    s_endpgm
14177;
14178; GFX1064_ITERATIVE-LABEL: umax_i32_varying:
14179; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
14180; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
14181; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, 0
14182; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
14183; GFX1064_ITERATIVE-NEXT:  .LBB27_1: ; %ComputeLoop
14184; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
14185; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
14186; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
14187; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
14188; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, s3
14189; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
14190; GFX1064_ITERATIVE-NEXT:    s_max_u32 s2, s2, s8
14191; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
14192; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB27_1
14193; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
14194; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
14195; GFX1064_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
14196; GFX1064_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
14197; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
14198; GFX1064_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
14199; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
14200; GFX1064_ITERATIVE-NEXT:    s_cbranch_execz .LBB27_4
14201; GFX1064_ITERATIVE-NEXT:  ; %bb.3:
14202; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
14203; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
14204; GFX1064_ITERATIVE-NEXT:    ds_max_rtn_u32 v0, v0, v2
14205; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
14206; GFX1064_ITERATIVE-NEXT:    buffer_gl0_inv
14207; GFX1064_ITERATIVE-NEXT:  .LBB27_4:
14208; GFX1064_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
14209; GFX1064_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
14210; GFX1064_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
14211; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v0
14212; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
14213; GFX1064_ITERATIVE-NEXT:    v_max_u32_e32 v0, s2, v1
14214; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, -1
14215; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
14216; GFX1064_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
14217; GFX1064_ITERATIVE-NEXT:    s_endpgm
14218;
14219; GFX1032_ITERATIVE-LABEL: umax_i32_varying:
14220; GFX1032_ITERATIVE:       ; %bb.0: ; %entry
14221; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s1, exec_lo
14222; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s0, 0
14223; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
14224; GFX1032_ITERATIVE-NEXT:  .LBB27_1: ; %ComputeLoop
14225; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
14226; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s2, s1
14227; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
14228; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
14229; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s2
14230; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s6
14231; GFX1032_ITERATIVE-NEXT:    s_max_u32 s0, s0, s3
14232; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
14233; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB27_1
14234; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
14235; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
14236; GFX1032_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
14237; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
14238; GFX1032_ITERATIVE-NEXT:    s_and_saveexec_b32 s1, vcc_lo
14239; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s1, exec_lo, s1
14240; GFX1032_ITERATIVE-NEXT:    s_cbranch_execz .LBB27_4
14241; GFX1032_ITERATIVE-NEXT:  ; %bb.3:
14242; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
14243; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s0
14244; GFX1032_ITERATIVE-NEXT:    ds_max_rtn_u32 v0, v0, v2
14245; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
14246; GFX1032_ITERATIVE-NEXT:    buffer_gl0_inv
14247; GFX1032_ITERATIVE-NEXT:  .LBB27_4:
14248; GFX1032_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
14249; GFX1032_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s1
14250; GFX1032_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
14251; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v0
14252; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
14253; GFX1032_ITERATIVE-NEXT:    v_max_u32_e32 v0, s2, v1
14254; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, -1
14255; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
14256; GFX1032_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
14257; GFX1032_ITERATIVE-NEXT:    s_endpgm
14258;
14259; GFX1164_ITERATIVE-LABEL: umax_i32_varying:
14260; GFX1164_ITERATIVE:       ; %bb.0: ; %entry
14261; GFX1164_ITERATIVE-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
14262; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
14263; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, 0
14264; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
14265; GFX1164_ITERATIVE-NEXT:  .LBB27_1: ; %ComputeLoop
14266; GFX1164_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
14267; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s3, s[0:1]
14268; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
14269; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s8, v1, s3
14270; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
14271; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s2, s3
14272; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
14273; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2)
14274; GFX1164_ITERATIVE-NEXT:    s_max_u32 s2, s2, s8
14275; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
14276; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB27_1
14277; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
14278; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
14279; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
14280; GFX1164_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
14281; GFX1164_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
14282; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
14283; GFX1164_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
14284; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
14285; GFX1164_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
14286; GFX1164_ITERATIVE-NEXT:    s_cbranch_execz .LBB27_4
14287; GFX1164_ITERATIVE-NEXT:  ; %bb.3:
14288; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v1, 0
14289; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
14290; GFX1164_ITERATIVE-NEXT:    ds_max_rtn_u32 v1, v1, v2
14291; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
14292; GFX1164_ITERATIVE-NEXT:    buffer_gl0_inv
14293; GFX1164_ITERATIVE-NEXT:  .LBB27_4:
14294; GFX1164_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
14295; GFX1164_ITERATIVE-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
14296; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v1
14297; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
14298; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
14299; GFX1164_ITERATIVE-NEXT:    v_max_u32_e32 v0, s2, v0
14300; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
14301; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
14302; GFX1164_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
14303; GFX1164_ITERATIVE-NEXT:    s_endpgm
14304;
14305; GFX1132_ITERATIVE-LABEL: umax_i32_varying:
14306; GFX1132_ITERATIVE:       ; %bb.0: ; %entry
14307; GFX1132_ITERATIVE-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
14308; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s1, exec_lo
14309; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s0, 0
14310; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
14311; GFX1132_ITERATIVE-NEXT:  .LBB27_1: ; %ComputeLoop
14312; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
14313; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s2, s1
14314; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
14315; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s3, v1, s2
14316; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
14317; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s2
14318; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s6
14319; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2)
14320; GFX1132_ITERATIVE-NEXT:    s_max_u32 s0, s0, s3
14321; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
14322; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB27_1
14323; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
14324; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
14325; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
14326; GFX1132_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
14327; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
14328; GFX1132_ITERATIVE-NEXT:    s_and_saveexec_b32 s1, vcc_lo
14329; GFX1132_ITERATIVE-NEXT:    s_xor_b32 s1, exec_lo, s1
14330; GFX1132_ITERATIVE-NEXT:    s_cbranch_execz .LBB27_4
14331; GFX1132_ITERATIVE-NEXT:  ; %bb.3:
14332; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
14333; GFX1132_ITERATIVE-NEXT:    ds_max_rtn_u32 v1, v1, v2
14334; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
14335; GFX1132_ITERATIVE-NEXT:    buffer_gl0_inv
14336; GFX1132_ITERATIVE-NEXT:  .LBB27_4:
14337; GFX1132_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s1
14338; GFX1132_ITERATIVE-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
14339; GFX1132_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v1
14340; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
14341; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
14342; GFX1132_ITERATIVE-NEXT:    v_max_u32_e32 v0, s2, v0
14343; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
14344; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
14345; GFX1132_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
14346; GFX1132_ITERATIVE-NEXT:    s_endpgm
14347;
14348; GFX7LESS_DPP-LABEL: umax_i32_varying:
14349; GFX7LESS_DPP:       ; %bb.0: ; %entry
14350; GFX7LESS_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
14351; GFX7LESS_DPP-NEXT:    v_mov_b32_e32 v1, 0
14352; GFX7LESS_DPP-NEXT:    s_mov_b32 m0, -1
14353; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
14354; GFX7LESS_DPP-NEXT:    ds_max_rtn_u32 v0, v1, v0
14355; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
14356; GFX7LESS_DPP-NEXT:    s_mov_b32 s3, 0xf000
14357; GFX7LESS_DPP-NEXT:    s_mov_b32 s2, -1
14358; GFX7LESS_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
14359; GFX7LESS_DPP-NEXT:    s_endpgm
14360;
14361; GFX8_DPP-LABEL: umax_i32_varying:
14362; GFX8_DPP:       ; %bb.0: ; %entry
14363; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, 0
14364; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
14365; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
14366; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
14367; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
14368; GFX8_DPP-NEXT:    v_mov_b32_e32 v2, 0
14369; GFX8_DPP-NEXT:    s_nop 0
14370; GFX8_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
14371; GFX8_DPP-NEXT:    s_nop 1
14372; GFX8_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
14373; GFX8_DPP-NEXT:    s_nop 1
14374; GFX8_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
14375; GFX8_DPP-NEXT:    s_nop 1
14376; GFX8_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
14377; GFX8_DPP-NEXT:    s_nop 1
14378; GFX8_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
14379; GFX8_DPP-NEXT:    s_nop 1
14380; GFX8_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
14381; GFX8_DPP-NEXT:    v_readlane_b32 s2, v1, 63
14382; GFX8_DPP-NEXT:    s_nop 0
14383; GFX8_DPP-NEXT:    v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
14384; GFX8_DPP-NEXT:    s_mov_b64 exec, s[0:1]
14385; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
14386; GFX8_DPP-NEXT:    ; implicit-def: $vgpr0
14387; GFX8_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
14388; GFX8_DPP-NEXT:    s_cbranch_execz .LBB27_2
14389; GFX8_DPP-NEXT:  ; %bb.1:
14390; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, s2
14391; GFX8_DPP-NEXT:    s_mov_b32 m0, -1
14392; GFX8_DPP-NEXT:    ds_max_rtn_u32 v0, v3, v0
14393; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
14394; GFX8_DPP-NEXT:  .LBB27_2:
14395; GFX8_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
14396; GFX8_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
14397; GFX8_DPP-NEXT:    v_readfirstlane_b32 s4, v0
14398; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, v2
14399; GFX8_DPP-NEXT:    s_mov_b32 s3, 0xf000
14400; GFX8_DPP-NEXT:    s_mov_b32 s2, -1
14401; GFX8_DPP-NEXT:    v_max_u32_e32 v0, s4, v0
14402; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
14403; GFX8_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
14404; GFX8_DPP-NEXT:    s_endpgm
14405;
14406; GFX9_DPP-LABEL: umax_i32_varying:
14407; GFX9_DPP:       ; %bb.0: ; %entry
14408; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
14409; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
14410; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
14411; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
14412; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
14413; GFX9_DPP-NEXT:    v_mov_b32_e32 v2, 0
14414; GFX9_DPP-NEXT:    s_nop 0
14415; GFX9_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
14416; GFX9_DPP-NEXT:    s_nop 1
14417; GFX9_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
14418; GFX9_DPP-NEXT:    s_nop 1
14419; GFX9_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
14420; GFX9_DPP-NEXT:    s_nop 1
14421; GFX9_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
14422; GFX9_DPP-NEXT:    s_nop 1
14423; GFX9_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
14424; GFX9_DPP-NEXT:    s_nop 1
14425; GFX9_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
14426; GFX9_DPP-NEXT:    v_readlane_b32 s2, v1, 63
14427; GFX9_DPP-NEXT:    s_nop 0
14428; GFX9_DPP-NEXT:    v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
14429; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
14430; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
14431; GFX9_DPP-NEXT:    ; implicit-def: $vgpr0
14432; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
14433; GFX9_DPP-NEXT:    s_cbranch_execz .LBB27_2
14434; GFX9_DPP-NEXT:  ; %bb.1:
14435; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s2
14436; GFX9_DPP-NEXT:    ds_max_rtn_u32 v0, v3, v0
14437; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
14438; GFX9_DPP-NEXT:  .LBB27_2:
14439; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
14440; GFX9_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
14441; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v0
14442; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, v2
14443; GFX9_DPP-NEXT:    s_mov_b32 s3, 0xf000
14444; GFX9_DPP-NEXT:    s_mov_b32 s2, -1
14445; GFX9_DPP-NEXT:    v_max_u32_e32 v0, s4, v0
14446; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
14447; GFX9_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
14448; GFX9_DPP-NEXT:    s_endpgm
14449;
14450; GFX1064_DPP-LABEL: umax_i32_varying:
14451; GFX1064_DPP:       ; %bb.0: ; %entry
14452; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
14453; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
14454; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, 0
14455; GFX1064_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
14456; GFX1064_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
14457; GFX1064_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
14458; GFX1064_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
14459; GFX1064_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
14460; GFX1064_DPP-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
14461; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 31
14462; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, s2
14463; GFX1064_DPP-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
14464; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
14465; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 15
14466; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v1, 31
14467; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s2, 16
14468; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
14469; GFX1064_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
14470; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
14471; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 47
14472; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v1, 63
14473; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s3, 32
14474; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
14475; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
14476; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, 0
14477; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
14478; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s2, 48
14479; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
14480; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
14481; GFX1064_DPP-NEXT:    s_mov_b32 s2, -1
14482; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr0
14483; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
14484; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB27_2
14485; GFX1064_DPP-NEXT:  ; %bb.1:
14486; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, s6
14487; GFX1064_DPP-NEXT:    s_mov_b32 s3, s6
14488; GFX1064_DPP-NEXT:    ds_max_rtn_u32 v0, v4, v0
14489; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
14490; GFX1064_DPP-NEXT:    buffer_gl0_inv
14491; GFX1064_DPP-NEXT:  .LBB27_2:
14492; GFX1064_DPP-NEXT:    s_waitcnt_depctr 0xffe3
14493; GFX1064_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
14494; GFX1064_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
14495; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s3, v0
14496; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, v3
14497; GFX1064_DPP-NEXT:    v_max_u32_e32 v0, s3, v0
14498; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
14499; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
14500; GFX1064_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
14501; GFX1064_DPP-NEXT:    s_endpgm
14502;
14503; GFX1032_DPP-LABEL: umax_i32_varying:
14504; GFX1032_DPP:       ; %bb.0: ; %entry
14505; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
14506; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
14507; GFX1032_DPP-NEXT:    v_mov_b32_e32 v3, 0
14508; GFX1032_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
14509; GFX1032_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
14510; GFX1032_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
14511; GFX1032_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
14512; GFX1032_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
14513; GFX1032_DPP-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
14514; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v1, 15
14515; GFX1032_DPP-NEXT:    v_readlane_b32 s2, v1, 31
14516; GFX1032_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
14517; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
14518; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
14519; GFX1032_DPP-NEXT:    v_mov_b32_e32 v4, 0
14520; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
14521; GFX1032_DPP-NEXT:    v_writelane_b32 v3, s1, 16
14522; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
14523; GFX1032_DPP-NEXT:    s_mov_b32 s0, s2
14524; GFX1032_DPP-NEXT:    s_mov_b32 s2, -1
14525; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
14526; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr0
14527; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
14528; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB27_2
14529; GFX1032_DPP-NEXT:  ; %bb.1:
14530; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, s0
14531; GFX1032_DPP-NEXT:    ds_max_rtn_u32 v0, v4, v0
14532; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
14533; GFX1032_DPP-NEXT:    buffer_gl0_inv
14534; GFX1032_DPP-NEXT:  .LBB27_2:
14535; GFX1032_DPP-NEXT:    s_waitcnt_depctr 0xffe3
14536; GFX1032_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s1
14537; GFX1032_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
14538; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s3, v0
14539; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, v3
14540; GFX1032_DPP-NEXT:    v_max_u32_e32 v0, s3, v0
14541; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
14542; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
14543; GFX1032_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
14544; GFX1032_DPP-NEXT:    s_endpgm
14545;
14546; GFX1164_DPP-LABEL: umax_i32_varying:
14547; GFX1164_DPP:       ; %bb.0: ; %entry
14548; GFX1164_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
14549; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
14550; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
14551; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
14552; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, 0
14553; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
14554; GFX1164_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
14555; GFX1164_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
14556; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
14557; GFX1164_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
14558; GFX1164_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
14559; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
14560; GFX1164_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
14561; GFX1164_DPP-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
14562; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
14563; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v1, 31
14564; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, s2
14565; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
14566; GFX1164_DPP-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
14567; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
14568; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v1, 15
14569; GFX1164_DPP-NEXT:    v_readlane_b32 s3, v1, 31
14570; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
14571; GFX1164_DPP-NEXT:    v_writelane_b32 v3, s2, 16
14572; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
14573; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
14574; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
14575; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v1, 47
14576; GFX1164_DPP-NEXT:    v_readlane_b32 s6, v1, 63
14577; GFX1164_DPP-NEXT:    v_writelane_b32 v3, s3, 32
14578; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
14579; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
14580; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
14581; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, 0
14582; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
14583; GFX1164_DPP-NEXT:    v_writelane_b32 v3, s2, 48
14584; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
14585; GFX1164_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
14586; GFX1164_DPP-NEXT:    s_mov_b32 s2, -1
14587; GFX1164_DPP-NEXT:    ; implicit-def: $vgpr0
14588; GFX1164_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
14589; GFX1164_DPP-NEXT:    s_cbranch_execz .LBB27_2
14590; GFX1164_DPP-NEXT:  ; %bb.1:
14591; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, s6
14592; GFX1164_DPP-NEXT:    s_mov_b32 s3, s6
14593; GFX1164_DPP-NEXT:    ds_max_rtn_u32 v0, v4, v0
14594; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
14595; GFX1164_DPP-NEXT:    buffer_gl0_inv
14596; GFX1164_DPP-NEXT:  .LBB27_2:
14597; GFX1164_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
14598; GFX1164_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
14599; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s3, v0
14600; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, v3
14601; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
14602; GFX1164_DPP-NEXT:    v_max_u32_e32 v0, s3, v0
14603; GFX1164_DPP-NEXT:    s_mov_b32 s3, 0x31016000
14604; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
14605; GFX1164_DPP-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
14606; GFX1164_DPP-NEXT:    s_endpgm
14607;
14608; GFX1132_DPP-LABEL: umax_i32_varying:
14609; GFX1132_DPP:       ; %bb.0: ; %entry
14610; GFX1132_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
14611; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
14612; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
14613; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s0
14614; GFX1132_DPP-NEXT:    v_mov_b32_e32 v3, 0
14615; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
14616; GFX1132_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
14617; GFX1132_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
14618; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
14619; GFX1132_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
14620; GFX1132_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
14621; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
14622; GFX1132_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
14623; GFX1132_DPP-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
14624; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
14625; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v1, 15
14626; GFX1132_DPP-NEXT:    v_readlane_b32 s2, v1, 31
14627; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
14628; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
14629; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
14630; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, 0
14631; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
14632; GFX1132_DPP-NEXT:    v_writelane_b32 v3, s1, 16
14633; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
14634; GFX1132_DPP-NEXT:    s_mov_b32 s0, s2
14635; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
14636; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
14637; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr0
14638; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
14639; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB27_2
14640; GFX1132_DPP-NEXT:  ; %bb.1:
14641; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, s0
14642; GFX1132_DPP-NEXT:    ds_max_rtn_u32 v0, v4, v0
14643; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
14644; GFX1132_DPP-NEXT:    buffer_gl0_inv
14645; GFX1132_DPP-NEXT:  .LBB27_2:
14646; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s1
14647; GFX1132_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
14648; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s3, v0
14649; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, v3
14650; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
14651; GFX1132_DPP-NEXT:    v_max_u32_e32 v0, s3, v0
14652; GFX1132_DPP-NEXT:    s_mov_b32 s3, 0x31016000
14653; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
14654; GFX1132_DPP-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
14655; GFX1132_DPP-NEXT:    s_endpgm
14656entry:
14657  %lane = call i32 @llvm.amdgcn.workitem.id.x()
14658  %old = atomicrmw umax ptr addrspace(3) @local_var32, i32 %lane acq_rel
14659  store i32 %old, ptr addrspace(1) %out
14660  ret void
14661}
14662
14663define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; Uniform 64-bit unsigned-max atomic on LDS: every lane contributes the same
; constant (5) to @local_var64, and the check lines below contain no
; cross-lane reduction sequence.
; Per those check lines, a single lane (the one whose mbcnt result is 0)
; issues one ds_max_rtn_u64; the returned value is then read back with
; v_readfirstlane and each lane stores umax(old, 0) if it was the issuing
; lane, or umax(old, 5) otherwise (v_cndmask 5/0 on vcc, then v_cmp_gt_u64).
; The Iterative and DPP configurations share the unsuffixed prefixes here,
; i.e. both strategies are expected to produce the same code for this kernel.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
14664; GFX7LESS-LABEL: umax_i64_constant:
14665; GFX7LESS:       ; %bb.0: ; %entry
14666; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
14667; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
14668; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
14669; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
14670; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
14671; GFX7LESS-NEXT:    s_cbranch_execz .LBB28_2
14672; GFX7LESS-NEXT:  ; %bb.1:
14673; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
14674; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
14675; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
14676; GFX7LESS-NEXT:    s_mov_b32 m0, -1
14677; GFX7LESS-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
14678; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
14679; GFX7LESS-NEXT:  .LBB28_2:
14680; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
14681; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
14682; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
14683; GFX7LESS-NEXT:    s_mov_b32 s2, -1
14684; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
14685; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
14686; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
14687; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
14688; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
14689; GFX7LESS-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
14690; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
14691; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s5
14692; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
14693; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
14694; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
14695; GFX7LESS-NEXT:    s_endpgm
14696;
14697; GFX8-LABEL: umax_i64_constant:
14698; GFX8:       ; %bb.0: ; %entry
14699; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
14700; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
14701; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
14702; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
14703; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
14704; GFX8-NEXT:    s_cbranch_execz .LBB28_2
14705; GFX8-NEXT:  ; %bb.1:
14706; GFX8-NEXT:    v_mov_b32_e32 v0, 5
14707; GFX8-NEXT:    v_mov_b32_e32 v1, 0
14708; GFX8-NEXT:    v_mov_b32_e32 v2, 0
14709; GFX8-NEXT:    s_mov_b32 m0, -1
14710; GFX8-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
14711; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
14712; GFX8-NEXT:  .LBB28_2:
14713; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
14714; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
14715; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
14716; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
14717; GFX8-NEXT:    v_mov_b32_e32 v1, 0
14718; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
14719; GFX8-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
14720; GFX8-NEXT:    v_mov_b32_e32 v2, s4
14721; GFX8-NEXT:    v_mov_b32_e32 v1, s5
14722; GFX8-NEXT:    s_mov_b32 s3, 0xf000
14723; GFX8-NEXT:    s_mov_b32 s2, -1
14724; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
14725; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
14726; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
14727; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
14728; GFX8-NEXT:    s_endpgm
14729;
14730; GFX9-LABEL: umax_i64_constant:
14731; GFX9:       ; %bb.0: ; %entry
14732; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
14733; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
14734; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
14735; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
14736; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
14737; GFX9-NEXT:    s_cbranch_execz .LBB28_2
14738; GFX9-NEXT:  ; %bb.1:
14739; GFX9-NEXT:    v_mov_b32_e32 v0, 5
14740; GFX9-NEXT:    v_mov_b32_e32 v1, 0
14741; GFX9-NEXT:    v_mov_b32_e32 v2, 0
14742; GFX9-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
14743; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
14744; GFX9-NEXT:  .LBB28_2:
14745; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
14746; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
14747; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
14748; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
14749; GFX9-NEXT:    v_mov_b32_e32 v1, 0
14750; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
14751; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
14752; GFX9-NEXT:    v_mov_b32_e32 v2, s4
14753; GFX9-NEXT:    v_mov_b32_e32 v1, s5
14754; GFX9-NEXT:    s_mov_b32 s3, 0xf000
14755; GFX9-NEXT:    s_mov_b32 s2, -1
14756; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
14757; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
14758; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
14759; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
14760; GFX9-NEXT:    s_endpgm
14761;
14762; GFX1064-LABEL: umax_i64_constant:
14763; GFX1064:       ; %bb.0: ; %entry
14764; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
14765; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
14766; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
14767; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
14768; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
14769; GFX1064-NEXT:    s_cbranch_execz .LBB28_2
14770; GFX1064-NEXT:  ; %bb.1:
14771; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
14772; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
14773; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
14774; GFX1064-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
14775; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
14776; GFX1064-NEXT:    buffer_gl0_inv
14777; GFX1064-NEXT:  .LBB28_2:
14778; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
14779; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
14780; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
14781; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
14782; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
14783; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
14784; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
14785; GFX1064-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
14786; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
14787; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc
14788; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
14789; GFX1064-NEXT:    s_mov_b32 s2, -1
14790; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
14791; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
14792; GFX1064-NEXT:    s_endpgm
14793;
14794; GFX1032-LABEL: umax_i64_constant:
14795; GFX1032:       ; %bb.0: ; %entry
14796; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
14797; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
14798; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
14799; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
14800; GFX1032-NEXT:    s_cbranch_execz .LBB28_2
14801; GFX1032-NEXT:  ; %bb.1:
14802; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
14803; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
14804; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
14805; GFX1032-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
14806; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
14807; GFX1032-NEXT:    buffer_gl0_inv
14808; GFX1032-NEXT:  .LBB28_2:
14809; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
14810; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
14811; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
14812; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
14813; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
14814; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
14815; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
14816; GFX1032-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
14817; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
14818; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc_lo
14819; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
14820; GFX1032-NEXT:    s_mov_b32 s2, -1
14821; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
14822; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
14823; GFX1032-NEXT:    s_endpgm
14824;
14825; GFX1164-LABEL: umax_i64_constant:
14826; GFX1164:       ; %bb.0: ; %entry
14827; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
14828; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
14829; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
14830; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
14831; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
14832; GFX1164-NEXT:    s_and_saveexec_b64 s[0:1], vcc
14833; GFX1164-NEXT:    s_cbranch_execz .LBB28_2
14834; GFX1164-NEXT:  ; %bb.1:
14835; GFX1164-NEXT:    v_mov_b32_e32 v0, 5
14836; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
14837; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
14838; GFX1164-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
14839; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
14840; GFX1164-NEXT:    buffer_gl0_inv
14841; GFX1164-NEXT:  .LBB28_2:
14842; GFX1164-NEXT:    s_or_b64 exec, exec, s[0:1]
14843; GFX1164-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
14844; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
14845; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
14846; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
14847; GFX1164-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
14848; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
14849; GFX1164-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
14850; GFX1164-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
14851; GFX1164-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc
14852; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
14853; GFX1164-NEXT:    s_mov_b32 s2, -1
14854; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
14855; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
14856; GFX1164-NEXT:    s_endpgm
14857;
14858; GFX1132-LABEL: umax_i64_constant:
14859; GFX1132:       ; %bb.0: ; %entry
14860; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
14861; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
14862; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
14863; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
14864; GFX1132-NEXT:    s_and_saveexec_b32 s0, vcc_lo
14865; GFX1132-NEXT:    s_cbranch_execz .LBB28_2
14866; GFX1132-NEXT:  ; %bb.1:
14867; GFX1132-NEXT:    v_mov_b32_e32 v0, 5
14868; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
14869; GFX1132-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
14870; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
14871; GFX1132-NEXT:    buffer_gl0_inv
14872; GFX1132-NEXT:  .LBB28_2:
14873; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s0
14874; GFX1132-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
14875; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
14876; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
14877; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
14878; GFX1132-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
14879; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
14880; GFX1132-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
14881; GFX1132-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
14882; GFX1132-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc_lo
14883; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
14884; GFX1132-NEXT:    s_mov_b32 s2, -1
14885; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
14886; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
14887; GFX1132-NEXT:    s_endpgm
; IR under test starts here.
14888entry:
; acq_rel unsigned-max of the constant 5 into the addrspace(3) global
; @local_var64; %old receives the value the atomic returns.
14889  %old = atomicrmw umax ptr addrspace(3) @local_var64, i64 5 acq_rel
; Store the atomic's result through the kernel's addrspace(1) output pointer.
14890  store i64 %old, ptr addrspace(1) %out
14891  ret void
14892}
14893
14894define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
14895; GFX7LESS_ITERATIVE-LABEL: umax_i64_varying:
14896; GFX7LESS_ITERATIVE:       ; %bb.0: ; %entry
14897; GFX7LESS_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
14898; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
14899; GFX7LESS_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
14900; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
14901; GFX7LESS_ITERATIVE-NEXT:  .LBB29_1: ; %ComputeLoop
14902; GFX7LESS_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
14903; GFX7LESS_ITERATIVE-NEXT:    s_ff1_i32_b64 s8, s[2:3]
14904; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, s8
14905; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s9, v3, s8
14906; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s10, v0, s8
14907; GFX7LESS_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
14908; GFX7LESS_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
14909; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s10
14910; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v5, s9
14911; GFX7LESS_ITERATIVE-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5]
14912; GFX7LESS_ITERATIVE-NEXT:    s_and_b64 s[6:7], vcc, exec
14913; GFX7LESS_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s9
14914; GFX7LESS_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s10
14915; GFX7LESS_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s8
14916; GFX7LESS_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
14917; GFX7LESS_ITERATIVE-NEXT:    v_cmp_ne_u64_e64 s[6:7], s[2:3], 0
14918; GFX7LESS_ITERATIVE-NEXT:    s_and_b64 vcc, exec, s[6:7]
14919; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_vccnz .LBB29_1
14920; GFX7LESS_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
14921; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
14922; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
14923; GFX7LESS_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
14924; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
14925; GFX7LESS_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
14926; GFX7LESS_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
14927; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_execz .LBB29_4
14928; GFX7LESS_ITERATIVE-NEXT:  ; %bb.3:
14929; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
14930; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
14931; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
14932; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, -1
14933; GFX7LESS_ITERATIVE-NEXT:    ds_max_rtn_u64 v[3:4], v0, v[3:4]
14934; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
14935; GFX7LESS_ITERATIVE-NEXT:  .LBB29_4:
14936; GFX7LESS_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
14937; GFX7LESS_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
14938; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
14939; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s2, -1
14940; GFX7LESS_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v4
14941; GFX7LESS_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v3
14942; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s5
14943; GFX7LESS_ITERATIVE-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[1:2]
14944; GFX7LESS_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
14945; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s4
14946; GFX7LESS_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
14947; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
14948; GFX7LESS_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
14949; GFX7LESS_ITERATIVE-NEXT:    s_endpgm
14950;
14951; GFX8_ITERATIVE-LABEL: umax_i64_varying:
14952; GFX8_ITERATIVE:       ; %bb.0: ; %entry
14953; GFX8_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
14954; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
14955; GFX8_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
14956; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
14957; GFX8_ITERATIVE-NEXT:  .LBB29_1: ; %ComputeLoop
14958; GFX8_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
14959; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s8, s[2:3]
14960; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s9, v3, s8
14961; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s10, v0, s8
14962; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s10
14963; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v5, s9
14964; GFX8_ITERATIVE-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5]
14965; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s8
14966; GFX8_ITERATIVE-NEXT:    s_and_b64 s[6:7], vcc, exec
14967; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
14968; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
14969; GFX8_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s9
14970; GFX8_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s10
14971; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s8
14972; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
14973; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
14974; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB29_1
14975; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
14976; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
14977; GFX8_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
14978; GFX8_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
14979; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
14980; GFX8_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
14981; GFX8_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
14982; GFX8_ITERATIVE-NEXT:    s_cbranch_execz .LBB29_4
14983; GFX8_ITERATIVE-NEXT:  ; %bb.3:
14984; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
14985; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
14986; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
14987; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, -1
14988; GFX8_ITERATIVE-NEXT:    ds_max_rtn_u64 v[3:4], v0, v[3:4]
14989; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
14990; GFX8_ITERATIVE-NEXT:  .LBB29_4:
14991; GFX8_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
14992; GFX8_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
14993; GFX8_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v4
14994; GFX8_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v3
14995; GFX8_ITERATIVE-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[1:2]
14996; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s5
14997; GFX8_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
14998; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s4
14999; GFX8_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
15000; GFX8_ITERATIVE-NEXT:    s_mov_b32 s2, -1
15001; GFX8_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
15002; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
15003; GFX8_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
15004; GFX8_ITERATIVE-NEXT:    s_endpgm
15005;
15006; GFX9_ITERATIVE-LABEL: umax_i64_varying:
15007; GFX9_ITERATIVE:       ; %bb.0: ; %entry
15008; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
15009; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
15010; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
15011; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
15012; GFX9_ITERATIVE-NEXT:  .LBB29_1: ; %ComputeLoop
15013; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
15014; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s8, s[2:3]
15015; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s9, v3, s8
15016; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s10, v0, s8
15017; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s10
15018; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v5, s9
15019; GFX9_ITERATIVE-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5]
15020; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s8
15021; GFX9_ITERATIVE-NEXT:    s_and_b64 s[6:7], vcc, exec
15022; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
15023; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
15024; GFX9_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s9
15025; GFX9_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s10
15026; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s8
15027; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
15028; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
15029; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB29_1
15030; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
15031; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
15032; GFX9_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
15033; GFX9_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
15034; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
15035; GFX9_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
15036; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
15037; GFX9_ITERATIVE-NEXT:    s_cbranch_execz .LBB29_4
15038; GFX9_ITERATIVE-NEXT:  ; %bb.3:
15039; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
15040; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
15041; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
15042; GFX9_ITERATIVE-NEXT:    ds_max_rtn_u64 v[3:4], v0, v[3:4]
15043; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
15044; GFX9_ITERATIVE-NEXT:  .LBB29_4:
15045; GFX9_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
15046; GFX9_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
15047; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v4
15048; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v3
15049; GFX9_ITERATIVE-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[1:2]
15050; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s5
15051; GFX9_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
15052; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s4
15053; GFX9_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
15054; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, -1
15055; GFX9_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
15056; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
15057; GFX9_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
15058; GFX9_ITERATIVE-NEXT:    s_endpgm
15059;
15060; GFX1064_ITERATIVE-LABEL: umax_i64_varying:
15061; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
15062; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
15063; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
15064; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
15065; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
15066; GFX1064_ITERATIVE-NEXT:  .LBB29_1: ; %ComputeLoop
15067; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
15068; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s10, s[2:3]
15069; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s10
15070; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s10
15071; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s10
15072; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s10
15073; GFX1064_ITERATIVE-NEXT:    v_cmp_gt_u64_e64 s[8:9], s[0:1], s[6:7]
15074; GFX1064_ITERATIVE-NEXT:    s_and_b64 s[8:9], s[8:9], exec
15075; GFX1064_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s7
15076; GFX1064_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
15077; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s10
15078; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
15079; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
15080; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB29_1
15081; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
15082; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
15083; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
15084; GFX1064_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
15085; GFX1064_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
15086; GFX1064_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
15087; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
15088; GFX1064_ITERATIVE-NEXT:    s_cbranch_execz .LBB29_4
15089; GFX1064_ITERATIVE-NEXT:  ; %bb.3:
15090; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
15091; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
15092; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
15093; GFX1064_ITERATIVE-NEXT:    ds_max_rtn_u64 v[3:4], v0, v[3:4]
15094; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
15095; GFX1064_ITERATIVE-NEXT:    buffer_gl0_inv
15096; GFX1064_ITERATIVE-NEXT:  .LBB29_4:
15097; GFX1064_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
15098; GFX1064_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
15099; GFX1064_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
15100; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v4
15101; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
15102; GFX1064_ITERATIVE-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[1:2]
15103; GFX1064_ITERATIVE-NEXT:    v_cndmask_b32_e64 v2, v2, s3, vcc
15104; GFX1064_ITERATIVE-NEXT:    v_cndmask_b32_e64 v1, v1, s2, vcc
15105; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
15106; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, -1
15107; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
15108; GFX1064_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
15109; GFX1064_ITERATIVE-NEXT:    s_endpgm
15110;
15111; GFX1032_ITERATIVE-LABEL: umax_i64_varying:
15112; GFX1032_ITERATIVE:       ; %bb.0: ; %entry
15113; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
15114; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, exec_lo
15115; GFX1032_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
15116; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
15117; GFX1032_ITERATIVE-NEXT:  .LBB29_1: ; %ComputeLoop
15118; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
15119; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s3, s2
15120; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
15121; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
15122; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s3
15123; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s3
15124; GFX1032_ITERATIVE-NEXT:    v_cmp_gt_u64_e64 s8, s[0:1], s[6:7]
15125; GFX1032_ITERATIVE-NEXT:    s_and_b32 s8, s8, exec_lo
15126; GFX1032_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s7
15127; GFX1032_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
15128; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
15129; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s2, s2, s3
15130; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
15131; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB29_1
15132; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
15133; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
15134; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
15135; GFX1032_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
15136; GFX1032_ITERATIVE-NEXT:    s_and_saveexec_b32 s2, vcc_lo
15137; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s2, exec_lo, s2
15138; GFX1032_ITERATIVE-NEXT:    s_cbranch_execz .LBB29_4
15139; GFX1032_ITERATIVE-NEXT:  ; %bb.3:
15140; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
15141; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
15142; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
15143; GFX1032_ITERATIVE-NEXT:    ds_max_rtn_u64 v[3:4], v0, v[3:4]
15144; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
15145; GFX1032_ITERATIVE-NEXT:    buffer_gl0_inv
15146; GFX1032_ITERATIVE-NEXT:  .LBB29_4:
15147; GFX1032_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
15148; GFX1032_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s2
15149; GFX1032_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
15150; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v4
15151; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
15152; GFX1032_ITERATIVE-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[1:2]
15153; GFX1032_ITERATIVE-NEXT:    v_cndmask_b32_e64 v2, v2, s3, vcc_lo
15154; GFX1032_ITERATIVE-NEXT:    v_cndmask_b32_e64 v1, v1, s2, vcc_lo
15155; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
15156; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, -1
15157; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
15158; GFX1032_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
15159; GFX1032_ITERATIVE-NEXT:    s_endpgm
15160;
15161; GFX1164_ITERATIVE-LABEL: umax_i64_varying:
15162; GFX1164_ITERATIVE:       ; %bb.0: ; %entry
15163; GFX1164_ITERATIVE-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
15164; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
15165; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
15166; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
15167; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
15168; GFX1164_ITERATIVE-NEXT:    .p2align 6
15169; GFX1164_ITERATIVE-NEXT:  .LBB29_1: ; %ComputeLoop
15170; GFX1164_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
15171; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s10, s[2:3]
15172; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
15173; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s10
15174; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s10
15175; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s10
15176; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s10
15177; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
15178; GFX1164_ITERATIVE-NEXT:    v_cmp_gt_u64_e64 s[8:9], s[0:1], s[6:7]
15179; GFX1164_ITERATIVE-NEXT:    s_and_b64 s[8:9], s[8:9], exec
15180; GFX1164_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s7
15181; GFX1164_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
15182; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s10
15183; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
15184; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[6:7]
15185; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
15186; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB29_1
15187; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
15188; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
15189; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
15190; GFX1164_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
15191; GFX1164_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
15192; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr2_vgpr3
15193; GFX1164_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
15194; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
15195; GFX1164_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
15196; GFX1164_ITERATIVE-NEXT:    s_cbranch_execz .LBB29_4
15197; GFX1164_ITERATIVE-NEXT:  ; %bb.3:
15198; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s1
15199; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v4, 0
15200; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s0
15201; GFX1164_ITERATIVE-NEXT:    ds_max_rtn_u64 v[2:3], v4, v[2:3]
15202; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
15203; GFX1164_ITERATIVE-NEXT:    buffer_gl0_inv
15204; GFX1164_ITERATIVE-NEXT:  .LBB29_4:
15205; GFX1164_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
15206; GFX1164_ITERATIVE-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
15207; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v3
15208; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v2
15209; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
15210; GFX1164_ITERATIVE-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
15211; GFX1164_ITERATIVE-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
15212; GFX1164_ITERATIVE-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
15213; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
15214; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
15215; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
15216; GFX1164_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
15217; GFX1164_ITERATIVE-NEXT:    s_endpgm
15218;
15219; GFX1132_ITERATIVE-LABEL: umax_i64_varying:
15220; GFX1132_ITERATIVE:       ; %bb.0: ; %entry
15221; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0
15222; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, exec_lo
15223; GFX1132_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
15224; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
15225; GFX1132_ITERATIVE-NEXT:    .p2align 6
15226; GFX1132_ITERATIVE-NEXT:  .LBB29_1: ; %ComputeLoop
15227; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
15228; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s3, s2
15229; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
15230; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
15231; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
15232; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s3
15233; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
15234; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
15235; GFX1132_ITERATIVE-NEXT:    v_cmp_gt_u64_e64 s8, s[0:1], s[6:7]
15236; GFX1132_ITERATIVE-NEXT:    s_and_b32 s8, s8, exec_lo
15237; GFX1132_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s7
15238; GFX1132_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
15239; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
15240; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
15241; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s2, s2, s3
15242; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
15243; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB29_1
15244; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
15245; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
15246; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
15247; GFX1132_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
15248; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr2_vgpr3
15249; GFX1132_ITERATIVE-NEXT:    s_and_saveexec_b32 s2, vcc_lo
15250; GFX1132_ITERATIVE-NEXT:    s_xor_b32 s2, exec_lo, s2
15251; GFX1132_ITERATIVE-NEXT:    s_cbranch_execz .LBB29_4
15252; GFX1132_ITERATIVE-NEXT:  ; %bb.3:
15253; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1
15254; GFX1132_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s0
15255; GFX1132_ITERATIVE-NEXT:    ds_max_rtn_u64 v[2:3], v4, v[2:3]
15256; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
15257; GFX1132_ITERATIVE-NEXT:    buffer_gl0_inv
15258; GFX1132_ITERATIVE-NEXT:  .LBB29_4:
15259; GFX1132_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s2
15260; GFX1132_ITERATIVE-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
15261; GFX1132_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v3
15262; GFX1132_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v2
15263; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
15264; GFX1132_ITERATIVE-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
15265; GFX1132_ITERATIVE-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
15266; GFX1132_ITERATIVE-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
15267; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
15268; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
15269; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
15270; GFX1132_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
15271; GFX1132_ITERATIVE-NEXT:    s_endpgm
15272;
15273; GFX7LESS_DPP-LABEL: umax_i64_varying:
15274; GFX7LESS_DPP:       ; %bb.0: ; %entry
15275; GFX7LESS_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
15276; GFX7LESS_DPP-NEXT:    v_mov_b32_e32 v1, 0
15277; GFX7LESS_DPP-NEXT:    s_mov_b32 m0, -1
15278; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
15279; GFX7LESS_DPP-NEXT:    ds_max_rtn_u64 v[0:1], v1, v[0:1]
15280; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
15281; GFX7LESS_DPP-NEXT:    s_mov_b32 s3, 0xf000
15282; GFX7LESS_DPP-NEXT:    s_mov_b32 s2, -1
15283; GFX7LESS_DPP-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
15284; GFX7LESS_DPP-NEXT:    s_endpgm
15285;
15286; GFX8_DPP-LABEL: umax_i64_varying:
15287; GFX8_DPP:       ; %bb.0: ; %entry
15288; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v5, exec_lo, 0
15289; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, 0
15290; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v5, exec_hi, v5
15291; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
15292; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, 0, s[0:1]
15293; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
15294; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
15295; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, 0
15296; GFX8_DPP-NEXT:    s_nop 0
15297; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf
15298; GFX8_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
15299; GFX8_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4]
15300; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
15301; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
15302; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
15303; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, 0
15304; GFX8_DPP-NEXT:    s_nop 0
15305; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf
15306; GFX8_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:2 row_mask:0xf bank_mask:0xf
15307; GFX8_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4]
15308; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
15309; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
15310; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
15311; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, 0
15312; GFX8_DPP-NEXT:    s_nop 0
15313; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
15314; GFX8_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf
15315; GFX8_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4]
15316; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
15317; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
15318; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
15319; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, 0
15320; GFX8_DPP-NEXT:    s_nop 0
15321; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf
15322; GFX8_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:8 row_mask:0xf bank_mask:0xf
15323; GFX8_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4]
15324; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
15325; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
15326; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
15327; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, 0
15328; GFX8_DPP-NEXT:    s_nop 0
15329; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
15330; GFX8_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
15331; GFX8_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4]
15332; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
15333; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
15334; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
15335; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, 0
15336; GFX8_DPP-NEXT:    s_nop 0
15337; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
15338; GFX8_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
15339; GFX8_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4]
15340; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
15341; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
15342; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
15343; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, 0
15344; GFX8_DPP-NEXT:    v_readlane_b32 s3, v2, 63
15345; GFX8_DPP-NEXT:    v_readlane_b32 s2, v1, 63
15346; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
15347; GFX8_DPP-NEXT:    v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
15348; GFX8_DPP-NEXT:    s_mov_b64 exec, s[0:1]
15349; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
15350; GFX8_DPP-NEXT:    ; implicit-def: $vgpr5_vgpr6
15351; GFX8_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
15352; GFX8_DPP-NEXT:    s_cbranch_execz .LBB29_2
15353; GFX8_DPP-NEXT:  ; %bb.1:
15354; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, s3
15355; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, s2
15356; GFX8_DPP-NEXT:    s_mov_b32 m0, -1
15357; GFX8_DPP-NEXT:    ds_max_rtn_u64 v[5:6], v7, v[5:6]
15358; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
15359; GFX8_DPP-NEXT:  .LBB29_2:
15360; GFX8_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
15361; GFX8_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
15362; GFX8_DPP-NEXT:    v_readfirstlane_b32 s5, v6
15363; GFX8_DPP-NEXT:    v_readfirstlane_b32 s4, v5
15364; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v3
15365; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v4
15366; GFX8_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[5:6]
15367; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, s5
15368; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v6, v6, v0, vcc
15369; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, s4
15370; GFX8_DPP-NEXT:    s_mov_b32 s3, 0xf000
15371; GFX8_DPP-NEXT:    s_mov_b32 s2, -1
15372; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v5, v5, v0, vcc
15373; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
15374; GFX8_DPP-NEXT:    buffer_store_dwordx2 v[5:6], off, s[0:3], 0
15375; GFX8_DPP-NEXT:    s_endpgm
15376;
15377; GFX9_DPP-LABEL: umax_i64_varying:
15378; GFX9_DPP:       ; %bb.0: ; %entry
15379; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v5, exec_lo, 0
15380; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, 0
15381; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v5, exec_hi, v5
15382; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
15383; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, 0, s[0:1]
15384; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
15385; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
15386; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
15387; GFX9_DPP-NEXT:    s_nop 0
15388; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf
15389; GFX9_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
15390; GFX9_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4]
15391; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
15392; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
15393; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
15394; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
15395; GFX9_DPP-NEXT:    s_nop 0
15396; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf
15397; GFX9_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:2 row_mask:0xf bank_mask:0xf
15398; GFX9_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4]
15399; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
15400; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
15401; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
15402; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
15403; GFX9_DPP-NEXT:    s_nop 0
15404; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
15405; GFX9_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf
15406; GFX9_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4]
15407; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
15408; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
15409; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
15410; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
15411; GFX9_DPP-NEXT:    s_nop 0
15412; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf
15413; GFX9_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:8 row_mask:0xf bank_mask:0xf
15414; GFX9_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4]
15415; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
15416; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
15417; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
15418; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
15419; GFX9_DPP-NEXT:    s_nop 0
15420; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
15421; GFX9_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
15422; GFX9_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4]
15423; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
15424; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
15425; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
15426; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
15427; GFX9_DPP-NEXT:    s_nop 0
15428; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
15429; GFX9_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
15430; GFX9_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4]
15431; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
15432; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
15433; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
15434; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
15435; GFX9_DPP-NEXT:    v_readlane_b32 s3, v2, 63
15436; GFX9_DPP-NEXT:    v_readlane_b32 s2, v1, 63
15437; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
15438; GFX9_DPP-NEXT:    v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
15439; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
15440; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
15441; GFX9_DPP-NEXT:    ; implicit-def: $vgpr5_vgpr6
15442; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
15443; GFX9_DPP-NEXT:    s_cbranch_execz .LBB29_2
15444; GFX9_DPP-NEXT:  ; %bb.1:
15445; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, s3
15446; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, s2
15447; GFX9_DPP-NEXT:    ds_max_rtn_u64 v[5:6], v7, v[5:6]
15448; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
15449; GFX9_DPP-NEXT:  .LBB29_2:
15450; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
15451; GFX9_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
15452; GFX9_DPP-NEXT:    v_readfirstlane_b32 s5, v6
15453; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v5
15454; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v3
15455; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v4
15456; GFX9_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[5:6]
15457; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s5
15458; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v6, v6, v0, vcc
15459; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s4
15460; GFX9_DPP-NEXT:    s_mov_b32 s3, 0xf000
15461; GFX9_DPP-NEXT:    s_mov_b32 s2, -1
15462; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v5, v5, v0, vcc
15463; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
15464; GFX9_DPP-NEXT:    buffer_store_dwordx2 v[5:6], off, s[0:3], 0
15465; GFX9_DPP-NEXT:    s_endpgm
15466;
15467; GFX1064_DPP-LABEL: umax_i64_varying:
15468; GFX1064_DPP:       ; %bb.0: ; %entry
15469; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
15470; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, 0, s[0:1]
15471; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, 0
15472; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, 0
15473; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
15474; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, 0
15475; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, 0
15476; GFX1064_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf
15477; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
15478; GFX1064_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4]
15479; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
15480; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
15481; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, 0
15482; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, 0
15483; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf
15484; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf
15485; GFX1064_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
15486; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
15487; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
15488; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, 0
15489; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, 0
15490; GFX1064_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
15491; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf
15492; GFX1064_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4]
15493; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
15494; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
15495; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, 0
15496; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, 0
15497; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf
15498; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf
15499; GFX1064_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
15500; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
15501; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
15502; GFX1064_DPP-NEXT:    v_permlanex16_b32 v5, v2, -1, -1
15503; GFX1064_DPP-NEXT:    v_permlanex16_b32 v6, v1, -1, -1
15504; GFX1064_DPP-NEXT:    v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
15505; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
15506; GFX1064_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4]
15507; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
15508; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
15509; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, 0
15510; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, 0
15511; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v2, 31
15512; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v1, 31
15513; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, s2
15514; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, s3
15515; GFX1064_DPP-NEXT:    v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
15516; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
15517; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, 0
15518; GFX1064_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4]
15519; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
15520; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, 0
15521; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
15522; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
15523; GFX1064_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
15524; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
15525; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
15526; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v2, 15
15527; GFX1064_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf
15528; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v1, 15
15529; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v2, 31
15530; GFX1064_DPP-NEXT:    v_readlane_b32 s7, v1, 31
15531; GFX1064_DPP-NEXT:    v_writelane_b32 v5, s2, 16
15532; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 63
15533; GFX1064_DPP-NEXT:    v_writelane_b32 v4, s3, 16
15534; GFX1064_DPP-NEXT:    v_readlane_b32 s8, v2, 47
15535; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v2, 63
15536; GFX1064_DPP-NEXT:    v_readlane_b32 s9, v1, 47
15537; GFX1064_DPP-NEXT:    v_writelane_b32 v5, s6, 32
15538; GFX1064_DPP-NEXT:    v_writelane_b32 v4, s7, 32
15539; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
15540; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v0
15541; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, 0
15542; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
15543; GFX1064_DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
15544; GFX1064_DPP-NEXT:    v_writelane_b32 v5, s8, 48
15545; GFX1064_DPP-NEXT:    v_writelane_b32 v4, s9, 48
15546; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[6:7]
15547; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
15548; GFX1064_DPP-NEXT:    s_mov_b32 s2, -1
15549; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
15550; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[6:7], vcc
15551; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB29_2
15552; GFX1064_DPP-NEXT:  ; %bb.1:
15553; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, s1
15554; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, s0
15555; GFX1064_DPP-NEXT:    ds_max_rtn_u64 v[7:8], v0, v[7:8]
15556; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
15557; GFX1064_DPP-NEXT:    buffer_gl0_inv
15558; GFX1064_DPP-NEXT:  .LBB29_2:
15559; GFX1064_DPP-NEXT:    s_waitcnt_depctr 0xffe3
15560; GFX1064_DPP-NEXT:    s_or_b64 exec, exec, s[6:7]
15561; GFX1064_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
15562; GFX1064_DPP-NEXT:    s_mov_b32 null, 0
15563; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s5, v8
15564; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s4, v7
15565; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, v4
15566; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, v5
15567; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
15568; GFX1064_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[7:8]
15569; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v8, v8, s5, vcc
15570; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v7, v7, s4, vcc
15571; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
15572; GFX1064_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[0:3], 0
15573; GFX1064_DPP-NEXT:    s_endpgm
15574;
15575; GFX1032_DPP-LABEL: umax_i64_varying:
15576; GFX1032_DPP:       ; %bb.0: ; %entry
15577; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s2, -1
15578; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, 0, s2
15579; GFX1032_DPP-NEXT:    v_mov_b32_e32 v4, 0
15580; GFX1032_DPP-NEXT:    v_mov_b32_e32 v3, 0
15581; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s2
15582; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, 0
15583; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, 0
15584; GFX1032_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf
15585; GFX1032_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
15586; GFX1032_DPP-NEXT:    v_cmp_gt_u64_e32 vcc_lo, v[1:2], v[3:4]
15587; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc_lo
15588; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
15589; GFX1032_DPP-NEXT:    v_mov_b32_e32 v4, 0
15590; GFX1032_DPP-NEXT:    v_mov_b32_e32 v3, 0
15591; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf
15592; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf
15593; GFX1032_DPP-NEXT:    v_cmp_gt_u64_e32 vcc_lo, v[1:2], v[5:6]
15594; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc_lo
15595; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
15596; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, 0
15597; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, 0
15598; GFX1032_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
15599; GFX1032_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf
15600; GFX1032_DPP-NEXT:    v_cmp_gt_u64_e32 vcc_lo, v[1:2], v[3:4]
15601; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc_lo
15602; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
15603; GFX1032_DPP-NEXT:    v_mov_b32_e32 v4, 0
15604; GFX1032_DPP-NEXT:    v_mov_b32_e32 v3, 0
15605; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf
15606; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf
15607; GFX1032_DPP-NEXT:    v_cmp_gt_u64_e32 vcc_lo, v[1:2], v[5:6]
15608; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc_lo
15609; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
15610; GFX1032_DPP-NEXT:    v_permlanex16_b32 v5, v2, -1, -1
15611; GFX1032_DPP-NEXT:    v_permlanex16_b32 v6, v1, -1, -1
15612; GFX1032_DPP-NEXT:    v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
15613; GFX1032_DPP-NEXT:    v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
15614; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, 0
15615; GFX1032_DPP-NEXT:    v_cmp_gt_u64_e32 vcc_lo, v[1:2], v[3:4]
15616; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc_lo
15617; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
15618; GFX1032_DPP-NEXT:    v_mov_b32_e32 v4, 0
15619; GFX1032_DPP-NEXT:    v_readlane_b32 s3, v2, 15
15620; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v2, 31
15621; GFX1032_DPP-NEXT:    v_readlane_b32 s0, v1, 31
15622; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
15623; GFX1032_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf
15624; GFX1032_DPP-NEXT:    v_readlane_b32 s6, v1, 15
15625; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s2
15626; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
15627; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, 0
15628; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s2, -1
15629; GFX1032_DPP-NEXT:    v_writelane_b32 v5, s3, 16
15630; GFX1032_DPP-NEXT:    v_writelane_b32 v4, s6, 16
15631; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s2
15632; GFX1032_DPP-NEXT:    s_mov_b32 s2, -1
15633; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
15634; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
15635; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s3, vcc_lo
15636; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB29_2
15637; GFX1032_DPP-NEXT:  ; %bb.1:
15638; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, s1
15639; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, s0
15640; GFX1032_DPP-NEXT:    ds_max_rtn_u64 v[7:8], v0, v[7:8]
15641; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
15642; GFX1032_DPP-NEXT:    buffer_gl0_inv
15643; GFX1032_DPP-NEXT:  .LBB29_2:
15644; GFX1032_DPP-NEXT:    s_waitcnt_depctr 0xffe3
15645; GFX1032_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s3
15646; GFX1032_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
15647; GFX1032_DPP-NEXT:    s_mov_b32 null, 0
15648; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s5, v8
15649; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s4, v7
15650; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, v4
15651; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, v5
15652; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
15653; GFX1032_DPP-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[7:8]
15654; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v8, v8, s5, vcc_lo
15655; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v7, v7, s4, vcc_lo
15656; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
15657; GFX1032_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[0:3], 0
15658; GFX1032_DPP-NEXT:    s_endpgm
15659;
15660; GFX1164_DPP-LABEL: umax_i64_varying:
15661; GFX1164_DPP:       ; %bb.0: ; %entry
15662; GFX1164_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
15663; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
15664; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
15665; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, 0, s[0:1]
15666; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, 0
15667; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, 0
15668; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
15669; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, 0
15670; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, 0
15671; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf
15672; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
15673; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
15674; GFX1164_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4]
15675; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
15676; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
15677; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, 0
15678; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, 0
15679; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
15680; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf
15681; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf
15682; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
15683; GFX1164_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
15684; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
15685; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
15686; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, 0
15687; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, 0
15688; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
15689; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
15690; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf
15691; GFX1164_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4]
15692; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
15693; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
15694; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, 0
15695; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, 0
15696; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
15697; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf
15698; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf
15699; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
15700; GFX1164_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
15701; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
15702; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
15703; GFX1164_DPP-NEXT:    v_permlanex16_b32 v5, v2, -1, -1
15704; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
15705; GFX1164_DPP-NEXT:    v_permlanex16_b32 v6, v1, -1, -1
15706; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
15707; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
15708; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
15709; GFX1164_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4]
15710; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
15711; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
15712; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, 0
15713; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, 0
15714; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
15715; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v2, 31
15716; GFX1164_DPP-NEXT:    v_readlane_b32 s3, v1, 31
15717; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
15718; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, s2
15719; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, s3
15720; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
15721; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
15722; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
15723; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, 0
15724; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
15725; GFX1164_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4]
15726; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
15727; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, 0
15728; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
15729; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
15730; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
15731; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
15732; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
15733; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v2, 15
15734; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf
15735; GFX1164_DPP-NEXT:    v_readlane_b32 s3, v1, 15
15736; GFX1164_DPP-NEXT:    v_readlane_b32 s6, v2, 31
15737; GFX1164_DPP-NEXT:    v_readlane_b32 s7, v1, 31
15738; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s2, 16
15739; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v1, 63
15740; GFX1164_DPP-NEXT:    v_writelane_b32 v4, s3, 16
15741; GFX1164_DPP-NEXT:    v_readlane_b32 s8, v2, 47
15742; GFX1164_DPP-NEXT:    v_readlane_b32 s3, v2, 63
15743; GFX1164_DPP-NEXT:    v_readlane_b32 s9, v1, 47
15744; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s6, 32
15745; GFX1164_DPP-NEXT:    v_writelane_b32 v4, s7, 32
15746; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
15747; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
15748; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v0
15749; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, 0
15750; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
15751; GFX1164_DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
15752; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s8, 48
15753; GFX1164_DPP-NEXT:    v_writelane_b32 v4, s9, 48
15754; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[6:7]
15755; GFX1164_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
15756; GFX1164_DPP-NEXT:    s_mov_b32 s2, -1
15757; GFX1164_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
15758; GFX1164_DPP-NEXT:    s_and_saveexec_b64 s[6:7], vcc
15759; GFX1164_DPP-NEXT:    s_cbranch_execz .LBB29_2
15760; GFX1164_DPP-NEXT:  ; %bb.1:
15761; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, s1
15762; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, s0
15763; GFX1164_DPP-NEXT:    ds_max_rtn_u64 v[7:8], v0, v[7:8]
15764; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
15765; GFX1164_DPP-NEXT:    buffer_gl0_inv
15766; GFX1164_DPP-NEXT:  .LBB29_2:
15767; GFX1164_DPP-NEXT:    s_or_b64 exec, exec, s[6:7]
15768; GFX1164_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
15769; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s5, v8
15770; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s4, v7
15771; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, v4
15772; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, v5
15773; GFX1164_DPP-NEXT:    s_mov_b32 s3, 0x31016000
15774; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
15775; GFX1164_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[7:8]
15776; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v8, v8, s5, vcc
15777; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v7, v7, s4, vcc
15778; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
15779; GFX1164_DPP-NEXT:    buffer_store_b64 v[7:8], off, s[0:3], 0
15780; GFX1164_DPP-NEXT:    s_endpgm
15781;
15782; GFX1132_DPP-LABEL: umax_i64_varying:
15783; GFX1132_DPP:       ; %bb.0: ; %entry
15784; GFX1132_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
15785; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s2, -1
15786; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
15787; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, 0, s2
15788; GFX1132_DPP-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, 0
15789; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s2
15790; GFX1132_DPP-NEXT:    v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v5, 0
15791; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
15792; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf
15793; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
15794; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
15795; GFX1132_DPP-NEXT:    v_cmp_gt_u64_e32 vcc_lo, v[1:2], v[3:4]
15796; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v1, v3, v1
15797; GFX1132_DPP-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, 0
15798; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf
15799; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
15800; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf
15801; GFX1132_DPP-NEXT:    v_cmp_gt_u64_e32 vcc_lo, v[1:2], v[5:6]
15802; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v1, v5, v1
15803; GFX1132_DPP-NEXT:    v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v5, 0
15804; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
15805; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
15806; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf
15807; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
15808; GFX1132_DPP-NEXT:    v_cmp_gt_u64_e32 vcc_lo, v[1:2], v[3:4]
15809; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v1, v3, v1
15810; GFX1132_DPP-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, 0
15811; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf
15812; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
15813; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf
15814; GFX1132_DPP-NEXT:    v_cmp_gt_u64_e32 vcc_lo, v[1:2], v[5:6]
15815; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v1, v5, v1
15816; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
15817; GFX1132_DPP-NEXT:    v_permlanex16_b32 v5, v2, -1, -1
15818; GFX1132_DPP-NEXT:    v_permlanex16_b32 v6, v1, -1, -1
15819; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
15820; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
15821; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
15822; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
15823; GFX1132_DPP-NEXT:    v_cmp_gt_u64_e32 vcc_lo, v[1:2], v[3:4]
15824; GFX1132_DPP-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_cndmask_b32 v2, v4, v2
15825; GFX1132_DPP-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_cndmask_b32 v1, v3, v1
15826; GFX1132_DPP-NEXT:    v_readlane_b32 s3, v2, 15
15827; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v2, 31
15828; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
15829; GFX1132_DPP-NEXT:    v_readlane_b32 s0, v1, 31
15830; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
15831; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf
15832; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v1, 15
15833; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
15834; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
15835; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
15836; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s2, -1
15837; GFX1132_DPP-NEXT:    v_writelane_b32 v5, s3, 16
15838; GFX1132_DPP-NEXT:    v_writelane_b32 v4, s6, 16
15839; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
15840; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
15841; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
15842; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
15843; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s3, vcc_lo
15844; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB29_2
15845; GFX1132_DPP-NEXT:  ; %bb.1:
15846; GFX1132_DPP-NEXT:    v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0
15847; GFX1132_DPP-NEXT:    ds_max_rtn_u64 v[7:8], v0, v[7:8]
15848; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
15849; GFX1132_DPP-NEXT:    buffer_gl0_inv
15850; GFX1132_DPP-NEXT:  .LBB29_2:
15851; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s3
15852; GFX1132_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
15853; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s5, v8
15854; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s4, v7
15855; GFX1132_DPP-NEXT:    v_mov_b32_e32 v7, v4
15856; GFX1132_DPP-NEXT:    v_mov_b32_e32 v8, v5
15857; GFX1132_DPP-NEXT:    s_mov_b32 s3, 0x31016000
15858; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
15859; GFX1132_DPP-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[7:8]
15860; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v8, v8, s5, vcc_lo
15861; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v7, v7, s4, vcc_lo
15862; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
15863; GFX1132_DPP-NEXT:    buffer_store_b64 v[7:8], off, s[0:3], 0
15864; GFX1132_DPP-NEXT:    s_endpgm
15865entry:
15866  %lane = call i32 @llvm.amdgcn.workitem.id.x()
15867  %lane_ext = zext i32 %lane to i64
15868  %old = atomicrmw umax ptr addrspace(3) @local_var32, i64 %lane_ext acq_rel
15869  store i64 %old, ptr addrspace(1) %out
15870  ret void
15871}
15872
15873define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
15874; GFX7LESS_ITERATIVE-LABEL: umin_i32_varying:
15875; GFX7LESS_ITERATIVE:       ; %bb.0: ; %entry
15876; GFX7LESS_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
15877; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s2, -1
15878; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
15879; GFX7LESS_ITERATIVE-NEXT:  .LBB30_1: ; %ComputeLoop
15880; GFX7LESS_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
15881; GFX7LESS_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
15882; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, s3
15883; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
15884; GFX7LESS_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
15885; GFX7LESS_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
15886; GFX7LESS_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
15887; GFX7LESS_ITERATIVE-NEXT:    v_cmp_ne_u64_e64 s[6:7], s[0:1], 0
15888; GFX7LESS_ITERATIVE-NEXT:    s_and_b64 vcc, exec, s[6:7]
15889; GFX7LESS_ITERATIVE-NEXT:    s_min_u32 s2, s2, s8
15890; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_vccnz .LBB30_1
15891; GFX7LESS_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
15892; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
15893; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
15894; GFX7LESS_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
15895; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
15896; GFX7LESS_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
15897; GFX7LESS_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
15898; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_execz .LBB30_4
15899; GFX7LESS_ITERATIVE-NEXT:  ; %bb.3:
15900; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
15901; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
15902; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, -1
15903; GFX7LESS_ITERATIVE-NEXT:    ds_min_rtn_u32 v0, v0, v2
15904; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
15905; GFX7LESS_ITERATIVE-NEXT:  .LBB30_4:
15906; GFX7LESS_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
15907; GFX7LESS_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
15908; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
15909; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s2, -1
15910; GFX7LESS_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v0
15911; GFX7LESS_ITERATIVE-NEXT:    v_min_u32_e32 v0, s4, v1
15912; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
15913; GFX7LESS_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
15914; GFX7LESS_ITERATIVE-NEXT:    s_endpgm
15915;
15916; GFX8_ITERATIVE-LABEL: umin_i32_varying:
15917; GFX8_ITERATIVE:       ; %bb.0: ; %entry
15918; GFX8_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
15919; GFX8_ITERATIVE-NEXT:    s_mov_b32 s2, -1
15920; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
15921; GFX8_ITERATIVE-NEXT:  .LBB30_1: ; %ComputeLoop
15922; GFX8_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
15923; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
15924; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s3
15925; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
15926; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
15927; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
15928; GFX8_ITERATIVE-NEXT:    s_min_u32 s2, s2, s8
15929; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
15930; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
15931; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB30_1
15932; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
15933; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
15934; GFX8_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
15935; GFX8_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
15936; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
15937; GFX8_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
15938; GFX8_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
15939; GFX8_ITERATIVE-NEXT:    s_cbranch_execz .LBB30_4
15940; GFX8_ITERATIVE-NEXT:  ; %bb.3:
15941; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
15942; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
15943; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, -1
15944; GFX8_ITERATIVE-NEXT:    ds_min_rtn_u32 v0, v0, v2
15945; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
15946; GFX8_ITERATIVE-NEXT:  .LBB30_4:
15947; GFX8_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
15948; GFX8_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
15949; GFX8_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v0
15950; GFX8_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
15951; GFX8_ITERATIVE-NEXT:    s_mov_b32 s2, -1
15952; GFX8_ITERATIVE-NEXT:    v_min_u32_e32 v0, s4, v1
15953; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
15954; GFX8_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
15955; GFX8_ITERATIVE-NEXT:    s_endpgm
15956;
15957; GFX9_ITERATIVE-LABEL: umin_i32_varying:
15958; GFX9_ITERATIVE:       ; %bb.0: ; %entry
15959; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
15960; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, -1
15961; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
15962; GFX9_ITERATIVE-NEXT:  .LBB30_1: ; %ComputeLoop
15963; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
15964; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
15965; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s3
15966; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
15967; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
15968; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
15969; GFX9_ITERATIVE-NEXT:    s_min_u32 s2, s2, s8
15970; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
15971; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
15972; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB30_1
15973; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
15974; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
15975; GFX9_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
15976; GFX9_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
15977; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
15978; GFX9_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
15979; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
15980; GFX9_ITERATIVE-NEXT:    s_cbranch_execz .LBB30_4
15981; GFX9_ITERATIVE-NEXT:  ; %bb.3:
15982; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
15983; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
15984; GFX9_ITERATIVE-NEXT:    ds_min_rtn_u32 v0, v0, v2
15985; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
15986; GFX9_ITERATIVE-NEXT:  .LBB30_4:
15987; GFX9_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
15988; GFX9_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
15989; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v0
15990; GFX9_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
15991; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, -1
15992; GFX9_ITERATIVE-NEXT:    v_min_u32_e32 v0, s4, v1
15993; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
15994; GFX9_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
15995; GFX9_ITERATIVE-NEXT:    s_endpgm
15996;
15997; GFX1064_ITERATIVE-LABEL: umin_i32_varying:
15998; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
15999; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
16000; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, -1
16001; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
16002; GFX1064_ITERATIVE-NEXT:  .LBB30_1: ; %ComputeLoop
16003; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
16004; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
16005; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
16006; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
16007; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, s3
16008; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
16009; GFX1064_ITERATIVE-NEXT:    s_min_u32 s2, s2, s8
16010; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
16011; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB30_1
16012; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
16013; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
16014; GFX1064_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
16015; GFX1064_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
16016; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
16017; GFX1064_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
16018; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
16019; GFX1064_ITERATIVE-NEXT:    s_cbranch_execz .LBB30_4
16020; GFX1064_ITERATIVE-NEXT:  ; %bb.3:
16021; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
16022; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
16023; GFX1064_ITERATIVE-NEXT:    ds_min_rtn_u32 v0, v0, v2
16024; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
16025; GFX1064_ITERATIVE-NEXT:    buffer_gl0_inv
16026; GFX1064_ITERATIVE-NEXT:  .LBB30_4:
16027; GFX1064_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
16028; GFX1064_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
16029; GFX1064_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
16030; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v0
16031; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
16032; GFX1064_ITERATIVE-NEXT:    v_min_u32_e32 v0, s2, v1
16033; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, -1
16034; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
16035; GFX1064_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
16036; GFX1064_ITERATIVE-NEXT:    s_endpgm
16037;
16038; GFX1032_ITERATIVE-LABEL: umin_i32_varying:
16039; GFX1032_ITERATIVE:       ; %bb.0: ; %entry
16040; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s1, exec_lo
16041; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s0, -1
16042; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
16043; GFX1032_ITERATIVE-NEXT:  .LBB30_1: ; %ComputeLoop
16044; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
16045; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s2, s1
16046; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
16047; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
16048; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s2
16049; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s6
16050; GFX1032_ITERATIVE-NEXT:    s_min_u32 s0, s0, s3
16051; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
16052; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB30_1
16053; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
16054; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
16055; GFX1032_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
16056; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
16057; GFX1032_ITERATIVE-NEXT:    s_and_saveexec_b32 s1, vcc_lo
16058; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s1, exec_lo, s1
16059; GFX1032_ITERATIVE-NEXT:    s_cbranch_execz .LBB30_4
16060; GFX1032_ITERATIVE-NEXT:  ; %bb.3:
16061; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
16062; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s0
16063; GFX1032_ITERATIVE-NEXT:    ds_min_rtn_u32 v0, v0, v2
16064; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
16065; GFX1032_ITERATIVE-NEXT:    buffer_gl0_inv
16066; GFX1032_ITERATIVE-NEXT:  .LBB30_4:
16067; GFX1032_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
16068; GFX1032_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s1
16069; GFX1032_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
16070; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v0
16071; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
16072; GFX1032_ITERATIVE-NEXT:    v_min_u32_e32 v0, s2, v1
16073; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, -1
16074; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
16075; GFX1032_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
16076; GFX1032_ITERATIVE-NEXT:    s_endpgm
16077;
16078; GFX1164_ITERATIVE-LABEL: umin_i32_varying:
16079; GFX1164_ITERATIVE:       ; %bb.0: ; %entry
16080; GFX1164_ITERATIVE-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
16081; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
16082; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
16083; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
16084; GFX1164_ITERATIVE-NEXT:  .LBB30_1: ; %ComputeLoop
16085; GFX1164_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
16086; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s3, s[0:1]
16087; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
16088; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s8, v1, s3
16089; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
16090; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s2, s3
16091; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
16092; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2)
16093; GFX1164_ITERATIVE-NEXT:    s_min_u32 s2, s2, s8
16094; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
16095; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB30_1
16096; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
16097; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
16098; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
16099; GFX1164_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
16100; GFX1164_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
16101; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
16102; GFX1164_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
16103; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
16104; GFX1164_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
16105; GFX1164_ITERATIVE-NEXT:    s_cbranch_execz .LBB30_4
16106; GFX1164_ITERATIVE-NEXT:  ; %bb.3:
16107; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v1, 0
16108; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
16109; GFX1164_ITERATIVE-NEXT:    ds_min_rtn_u32 v1, v1, v2
16110; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
16111; GFX1164_ITERATIVE-NEXT:    buffer_gl0_inv
16112; GFX1164_ITERATIVE-NEXT:  .LBB30_4:
16113; GFX1164_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
16114; GFX1164_ITERATIVE-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
16115; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v1
16116; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
16117; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
16118; GFX1164_ITERATIVE-NEXT:    v_min_u32_e32 v0, s2, v0
16119; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
16120; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
16121; GFX1164_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
16122; GFX1164_ITERATIVE-NEXT:    s_endpgm
16123;
16124; GFX1132_ITERATIVE-LABEL: umin_i32_varying:
16125; GFX1132_ITERATIVE:       ; %bb.0: ; %entry
16126; GFX1132_ITERATIVE-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
16127; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s1, exec_lo
16128; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s0, -1
16129; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
16130; GFX1132_ITERATIVE-NEXT:  .LBB30_1: ; %ComputeLoop
16131; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
16132; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s2, s1
16133; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
16134; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s3, v1, s2
16135; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
16136; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s2
16137; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s6
16138; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2)
16139; GFX1132_ITERATIVE-NEXT:    s_min_u32 s0, s0, s3
16140; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
16141; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB30_1
16142; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
16143; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
16144; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
16145; GFX1132_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
16146; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
16147; GFX1132_ITERATIVE-NEXT:    s_and_saveexec_b32 s1, vcc_lo
16148; GFX1132_ITERATIVE-NEXT:    s_xor_b32 s1, exec_lo, s1
16149; GFX1132_ITERATIVE-NEXT:    s_cbranch_execz .LBB30_4
16150; GFX1132_ITERATIVE-NEXT:  ; %bb.3:
16151; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
16152; GFX1132_ITERATIVE-NEXT:    ds_min_rtn_u32 v1, v1, v2
16153; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
16154; GFX1132_ITERATIVE-NEXT:    buffer_gl0_inv
16155; GFX1132_ITERATIVE-NEXT:  .LBB30_4:
16156; GFX1132_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s1
16157; GFX1132_ITERATIVE-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
16158; GFX1132_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v1
16159; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
16160; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
16161; GFX1132_ITERATIVE-NEXT:    v_min_u32_e32 v0, s2, v0
16162; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
16163; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
16164; GFX1132_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
16165; GFX1132_ITERATIVE-NEXT:    s_endpgm
16166;
16167; GFX7LESS_DPP-LABEL: umin_i32_varying:
16168; GFX7LESS_DPP:       ; %bb.0: ; %entry
16169; GFX7LESS_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
16170; GFX7LESS_DPP-NEXT:    v_mov_b32_e32 v1, 0
16171; GFX7LESS_DPP-NEXT:    s_mov_b32 m0, -1
16172; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
16173; GFX7LESS_DPP-NEXT:    ds_min_rtn_u32 v0, v1, v0
16174; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
16175; GFX7LESS_DPP-NEXT:    s_mov_b32 s3, 0xf000
16176; GFX7LESS_DPP-NEXT:    s_mov_b32 s2, -1
16177; GFX7LESS_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
16178; GFX7LESS_DPP-NEXT:    s_endpgm
16179;
16180; GFX8_DPP-LABEL: umin_i32_varying:
16181; GFX8_DPP:       ; %bb.0: ; %entry
16182; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
16183; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
16184; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
16185; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v2, -1, v0, s[0:1]
16186; GFX8_DPP-NEXT:    v_mov_b32_e32 v1, -1
16187; GFX8_DPP-NEXT:    s_nop 0
16188; GFX8_DPP-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
16189; GFX8_DPP-NEXT:    s_nop 1
16190; GFX8_DPP-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
16191; GFX8_DPP-NEXT:    s_nop 1
16192; GFX8_DPP-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
16193; GFX8_DPP-NEXT:    s_nop 1
16194; GFX8_DPP-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
16195; GFX8_DPP-NEXT:    s_nop 1
16196; GFX8_DPP-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
16197; GFX8_DPP-NEXT:    s_nop 1
16198; GFX8_DPP-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
16199; GFX8_DPP-NEXT:    v_readlane_b32 s2, v2, 63
16200; GFX8_DPP-NEXT:    s_nop 0
16201; GFX8_DPP-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
16202; GFX8_DPP-NEXT:    s_mov_b64 exec, s[0:1]
16203; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
16204; GFX8_DPP-NEXT:    ; implicit-def: $vgpr0
16205; GFX8_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
16206; GFX8_DPP-NEXT:    s_cbranch_execz .LBB30_2
16207; GFX8_DPP-NEXT:  ; %bb.1:
16208; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, 0
16209; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, s2
16210; GFX8_DPP-NEXT:    s_mov_b32 m0, -1
16211; GFX8_DPP-NEXT:    ds_min_rtn_u32 v0, v0, v3
16212; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
16213; GFX8_DPP-NEXT:  .LBB30_2:
16214; GFX8_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
16215; GFX8_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
16216; GFX8_DPP-NEXT:    v_readfirstlane_b32 s4, v0
16217; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, v1
16218; GFX8_DPP-NEXT:    s_mov_b32 s3, 0xf000
16219; GFX8_DPP-NEXT:    s_mov_b32 s2, -1
16220; GFX8_DPP-NEXT:    v_min_u32_e32 v0, s4, v0
16221; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
16222; GFX8_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
16223; GFX8_DPP-NEXT:    s_endpgm
16224;
16225; GFX9_DPP-LABEL: umin_i32_varying:
16226; GFX9_DPP:       ; %bb.0: ; %entry
16227; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
16228; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
16229; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
16230; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v2, -1, v0, s[0:1]
16231; GFX9_DPP-NEXT:    v_mov_b32_e32 v1, -1
16232; GFX9_DPP-NEXT:    s_nop 0
16233; GFX9_DPP-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
16234; GFX9_DPP-NEXT:    s_nop 1
16235; GFX9_DPP-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
16236; GFX9_DPP-NEXT:    s_nop 1
16237; GFX9_DPP-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
16238; GFX9_DPP-NEXT:    s_nop 1
16239; GFX9_DPP-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
16240; GFX9_DPP-NEXT:    s_nop 1
16241; GFX9_DPP-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
16242; GFX9_DPP-NEXT:    s_nop 1
16243; GFX9_DPP-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
16244; GFX9_DPP-NEXT:    v_readlane_b32 s2, v2, 63
16245; GFX9_DPP-NEXT:    s_nop 0
16246; GFX9_DPP-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
16247; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
16248; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
16249; GFX9_DPP-NEXT:    ; implicit-def: $vgpr0
16250; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
16251; GFX9_DPP-NEXT:    s_cbranch_execz .LBB30_2
16252; GFX9_DPP-NEXT:  ; %bb.1:
16253; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, 0
16254; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, s2
16255; GFX9_DPP-NEXT:    ds_min_rtn_u32 v0, v0, v3
16256; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
16257; GFX9_DPP-NEXT:  .LBB30_2:
16258; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
16259; GFX9_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
16260; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v0
16261; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, v1
16262; GFX9_DPP-NEXT:    s_mov_b32 s3, 0xf000
16263; GFX9_DPP-NEXT:    s_mov_b32 s2, -1
16264; GFX9_DPP-NEXT:    v_min_u32_e32 v0, s4, v0
16265; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
16266; GFX9_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
16267; GFX9_DPP-NEXT:    s_endpgm
16268;
16269; GFX1064_DPP-LABEL: umin_i32_varying:
16270; GFX1064_DPP:       ; %bb.0: ; %entry
16271; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
16272; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[0:1]
16273; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, -1
16274; GFX1064_DPP-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
16275; GFX1064_DPP-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
16276; GFX1064_DPP-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
16277; GFX1064_DPP-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
16278; GFX1064_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
16279; GFX1064_DPP-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
16280; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 31
16281; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, s2
16282; GFX1064_DPP-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
16283; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
16284; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 15
16285; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v1, 31
16286; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s2, 16
16287; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
16288; GFX1064_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
16289; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
16290; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 47
16291; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v1, 63
16292; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s3, 32
16293; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
16294; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
16295; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
16296; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s2, 48
16297; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
16298; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
16299; GFX1064_DPP-NEXT:    s_mov_b32 s2, -1
16300; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr0
16301; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
16302; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB30_2
16303; GFX1064_DPP-NEXT:  ; %bb.1:
16304; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, 0
16305; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, s6
16306; GFX1064_DPP-NEXT:    s_mov_b32 s3, s6
16307; GFX1064_DPP-NEXT:    ds_min_rtn_u32 v0, v0, v4
16308; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
16309; GFX1064_DPP-NEXT:    buffer_gl0_inv
16310; GFX1064_DPP-NEXT:  .LBB30_2:
16311; GFX1064_DPP-NEXT:    s_waitcnt_depctr 0xffe3
16312; GFX1064_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
16313; GFX1064_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
16314; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s3, v0
16315; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, v3
16316; GFX1064_DPP-NEXT:    v_min_u32_e32 v0, s3, v0
16317; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
16318; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
16319; GFX1064_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
16320; GFX1064_DPP-NEXT:    s_endpgm
16321;
16322; GFX1032_DPP-LABEL: umin_i32_varying:
16323; GFX1032_DPP:       ; %bb.0: ; %entry
16324; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
16325; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s0
16326; GFX1032_DPP-NEXT:    v_mov_b32_e32 v3, -1
16327; GFX1032_DPP-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
16328; GFX1032_DPP-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
16329; GFX1032_DPP-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
16330; GFX1032_DPP-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
16331; GFX1032_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
16332; GFX1032_DPP-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
16333; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v1, 15
16334; GFX1032_DPP-NEXT:    v_readlane_b32 s2, v1, 31
16335; GFX1032_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
16336; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
16337; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
16338; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
16339; GFX1032_DPP-NEXT:    v_writelane_b32 v3, s1, 16
16340; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
16341; GFX1032_DPP-NEXT:    s_mov_b32 s0, s2
16342; GFX1032_DPP-NEXT:    s_mov_b32 s2, -1
16343; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
16344; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr0
16345; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
16346; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB30_2
16347; GFX1032_DPP-NEXT:  ; %bb.1:
16348; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, 0
16349; GFX1032_DPP-NEXT:    v_mov_b32_e32 v4, s0
16350; GFX1032_DPP-NEXT:    ds_min_rtn_u32 v0, v0, v4
16351; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
16352; GFX1032_DPP-NEXT:    buffer_gl0_inv
16353; GFX1032_DPP-NEXT:  .LBB30_2:
16354; GFX1032_DPP-NEXT:    s_waitcnt_depctr 0xffe3
16355; GFX1032_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s1
16356; GFX1032_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
16357; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s3, v0
16358; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, v3
16359; GFX1032_DPP-NEXT:    v_min_u32_e32 v0, s3, v0
16360; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
16361; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
16362; GFX1032_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
16363; GFX1032_DPP-NEXT:    s_endpgm
16364;
16365; GFX1164_DPP-LABEL: umin_i32_varying:
16366; GFX1164_DPP:       ; %bb.0: ; %entry
16367; GFX1164_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
16368; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
16369; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
16370; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[0:1]
16371; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, -1
16372; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
16373; GFX1164_DPP-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
16374; GFX1164_DPP-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
16375; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
16376; GFX1164_DPP-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
16377; GFX1164_DPP-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
16378; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
16379; GFX1164_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
16380; GFX1164_DPP-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
16381; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
16382; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v1, 31
16383; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, s2
16384; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
16385; GFX1164_DPP-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
16386; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
16387; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v1, 15
16388; GFX1164_DPP-NEXT:    v_readlane_b32 s3, v1, 31
16389; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
16390; GFX1164_DPP-NEXT:    v_writelane_b32 v3, s2, 16
16391; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
16392; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
16393; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
16394; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v1, 47
16395; GFX1164_DPP-NEXT:    v_readlane_b32 s6, v1, 63
16396; GFX1164_DPP-NEXT:    v_writelane_b32 v3, s3, 32
16397; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
16398; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
16399; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
16400; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
16401; GFX1164_DPP-NEXT:    v_writelane_b32 v3, s2, 48
16402; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
16403; GFX1164_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
16404; GFX1164_DPP-NEXT:    s_mov_b32 s2, -1
16405; GFX1164_DPP-NEXT:    ; implicit-def: $vgpr0
16406; GFX1164_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
16407; GFX1164_DPP-NEXT:    s_cbranch_execz .LBB30_2
16408; GFX1164_DPP-NEXT:  ; %bb.1:
16409; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, 0
16410; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, s6
16411; GFX1164_DPP-NEXT:    s_mov_b32 s3, s6
16412; GFX1164_DPP-NEXT:    ds_min_rtn_u32 v0, v0, v4
16413; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
16414; GFX1164_DPP-NEXT:    buffer_gl0_inv
16415; GFX1164_DPP-NEXT:  .LBB30_2:
16416; GFX1164_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
16417; GFX1164_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
16418; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s3, v0
16419; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, v3
16420; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
16421; GFX1164_DPP-NEXT:    v_min_u32_e32 v0, s3, v0
16422; GFX1164_DPP-NEXT:    s_mov_b32 s3, 0x31016000
16423; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
16424; GFX1164_DPP-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
16425; GFX1164_DPP-NEXT:    s_endpgm
16426;
16427; GFX1132_DPP-LABEL: umin_i32_varying:
16428; GFX1132_DPP:       ; %bb.0: ; %entry
16429; GFX1132_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
16430; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
16431; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
16432; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s0
16433; GFX1132_DPP-NEXT:    v_mov_b32_e32 v3, -1
16434; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
16435; GFX1132_DPP-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
16436; GFX1132_DPP-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
16437; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
16438; GFX1132_DPP-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
16439; GFX1132_DPP-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
16440; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
16441; GFX1132_DPP-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
16442; GFX1132_DPP-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
16443; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
16444; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v1, 15
16445; GFX1132_DPP-NEXT:    v_readlane_b32 s2, v1, 31
16446; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
16447; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
16448; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
16449; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
16450; GFX1132_DPP-NEXT:    v_writelane_b32 v3, s1, 16
16451; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
16452; GFX1132_DPP-NEXT:    s_mov_b32 s0, s2
16453; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
16454; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
16455; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr0
16456; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
16457; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB30_2
16458; GFX1132_DPP-NEXT:  ; %bb.1:
16459; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
16460; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, s0
16461; GFX1132_DPP-NEXT:    ds_min_rtn_u32 v0, v0, v4
16462; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
16463; GFX1132_DPP-NEXT:    buffer_gl0_inv
16464; GFX1132_DPP-NEXT:  .LBB30_2:
16465; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s1
16466; GFX1132_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
16467; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s3, v0
16468; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, v3
16469; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
16470; GFX1132_DPP-NEXT:    v_min_u32_e32 v0, s3, v0
16471; GFX1132_DPP-NEXT:    s_mov_b32 s3, 0x31016000
16472; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
16473; GFX1132_DPP-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
16474; GFX1132_DPP-NEXT:    s_endpgm
16475entry:
16476  %lane = call i32 @llvm.amdgcn.workitem.id.x()
16477  %old = atomicrmw umin ptr addrspace(3) @local_var32, i32 %lane acq_rel
16478  store i32 %old, ptr addrspace(1) %out
16479  ret void
16480}
16481
; umin_i64_constant: every work-item issues the same atomic unsigned-min of the
; constant i64 5 into @local_var64 (addrspace(3)) with acq_rel ordering, and the
; value returned by the atomicrmw is stored to %out (addrspace(1)).
;
; The assertions below use only the per-target prefixes shared by the Iterative
; and DPP RUN lines; this function has no strategy-specific check blocks
; (presumably because both strategies emit identical code for a constant
; operand -- confirm by regenerating with update_llc_test_checks.py).
;
; Shape of the expected code on every target:
;   * v_mbcnt (lo, plus hi on the 64-bit-exec variants) and v_cmp_eq select the
;     lane whose count is 0; only that lane executes one ds_min_rtn_u64 with
;     the operand 5.
;   * v_readfirstlane copies the returned 64-bit value into a pair of SGPRs.
;   * v_cndmask builds a per-lane 64-bit value: all ones (-1) where vcc is set
;     by the compare above, 5 elsewhere.
;   * v_cmp_lt_u64 followed by v_cndmask keeps the smaller of the SGPR value
;     and that per-lane value, which is then written by the 64-bit buffer store.
16482define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
16483; GFX7LESS-LABEL: umin_i64_constant:
16484; GFX7LESS:       ; %bb.0: ; %entry
16485; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
16486; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
16487; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
16488; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
16489; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
16490; GFX7LESS-NEXT:    s_cbranch_execz .LBB31_2
16491; GFX7LESS-NEXT:  ; %bb.1:
16492; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
16493; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
16494; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
16495; GFX7LESS-NEXT:    s_mov_b32 m0, -1
16496; GFX7LESS-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
16497; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
16498; GFX7LESS-NEXT:  .LBB31_2:
16499; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
16500; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
16501; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
16502; GFX7LESS-NEXT:    s_mov_b32 s2, -1
16503; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
16504; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
16505; GFX7LESS-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
16506; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
16507; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
16508; GFX7LESS-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
16509; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
16510; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
16511; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
16512; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
16513; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
16514; GFX7LESS-NEXT:    s_endpgm
16515;
16516; GFX8-LABEL: umin_i64_constant:
16517; GFX8:       ; %bb.0: ; %entry
16518; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
16519; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
16520; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
16521; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
16522; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
16523; GFX8-NEXT:    s_cbranch_execz .LBB31_2
16524; GFX8-NEXT:  ; %bb.1:
16525; GFX8-NEXT:    v_mov_b32_e32 v0, 5
16526; GFX8-NEXT:    v_mov_b32_e32 v1, 0
16527; GFX8-NEXT:    v_mov_b32_e32 v2, 0
16528; GFX8-NEXT:    s_mov_b32 m0, -1
16529; GFX8-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
16530; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
16531; GFX8-NEXT:  .LBB31_2:
16532; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
16533; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
16534; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
16535; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
16536; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
16537; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
16538; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
16539; GFX8-NEXT:    v_mov_b32_e32 v2, s5
16540; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
16541; GFX8-NEXT:    v_mov_b32_e32 v2, s4
16542; GFX8-NEXT:    s_mov_b32 s3, 0xf000
16543; GFX8-NEXT:    s_mov_b32 s2, -1
16544; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
16545; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
16546; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
16547; GFX8-NEXT:    s_endpgm
16548;
16549; GFX9-LABEL: umin_i64_constant:
16550; GFX9:       ; %bb.0: ; %entry
16551; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
16552; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
16553; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
16554; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
16555; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
16556; GFX9-NEXT:    s_cbranch_execz .LBB31_2
16557; GFX9-NEXT:  ; %bb.1:
16558; GFX9-NEXT:    v_mov_b32_e32 v0, 5
16559; GFX9-NEXT:    v_mov_b32_e32 v1, 0
16560; GFX9-NEXT:    v_mov_b32_e32 v2, 0
16561; GFX9-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
16562; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
16563; GFX9-NEXT:  .LBB31_2:
16564; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
16565; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
16566; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
16567; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
16568; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
16569; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
16570; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
16571; GFX9-NEXT:    v_mov_b32_e32 v2, s5
16572; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
16573; GFX9-NEXT:    v_mov_b32_e32 v2, s4
16574; GFX9-NEXT:    s_mov_b32 s3, 0xf000
16575; GFX9-NEXT:    s_mov_b32 s2, -1
16576; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
16577; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
16578; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
16579; GFX9-NEXT:    s_endpgm
16580;
16581; GFX1064-LABEL: umin_i64_constant:
16582; GFX1064:       ; %bb.0: ; %entry
16583; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
16584; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
16585; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
16586; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
16587; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
16588; GFX1064-NEXT:    s_cbranch_execz .LBB31_2
16589; GFX1064-NEXT:  ; %bb.1:
16590; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
16591; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
16592; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
16593; GFX1064-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
16594; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
16595; GFX1064-NEXT:    buffer_gl0_inv
16596; GFX1064-NEXT:  .LBB31_2:
16597; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
16598; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
16599; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
16600; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
16601; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
16602; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
16603; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
16604; GFX1064-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
16605; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
16606; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
16607; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
16608; GFX1064-NEXT:    s_mov_b32 s2, -1
16609; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
16610; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
16611; GFX1064-NEXT:    s_endpgm
16612;
16613; GFX1032-LABEL: umin_i64_constant:
16614; GFX1032:       ; %bb.0: ; %entry
16615; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
16616; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
16617; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
16618; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
16619; GFX1032-NEXT:    s_cbranch_execz .LBB31_2
16620; GFX1032-NEXT:  ; %bb.1:
16621; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
16622; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
16623; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
16624; GFX1032-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
16625; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
16626; GFX1032-NEXT:    buffer_gl0_inv
16627; GFX1032-NEXT:  .LBB31_2:
16628; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
16629; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
16630; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
16631; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
16632; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
16633; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
16634; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
16635; GFX1032-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
16636; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
16637; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
16638; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
16639; GFX1032-NEXT:    s_mov_b32 s2, -1
16640; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
16641; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
16642; GFX1032-NEXT:    s_endpgm
16643;
16644; GFX1164-LABEL: umin_i64_constant:
16645; GFX1164:       ; %bb.0: ; %entry
16646; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
16647; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
16648; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
16649; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
16650; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
16651; GFX1164-NEXT:    s_and_saveexec_b64 s[0:1], vcc
16652; GFX1164-NEXT:    s_cbranch_execz .LBB31_2
16653; GFX1164-NEXT:  ; %bb.1:
16654; GFX1164-NEXT:    v_mov_b32_e32 v0, 5
16655; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
16656; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
16657; GFX1164-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
16658; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
16659; GFX1164-NEXT:    buffer_gl0_inv
16660; GFX1164-NEXT:  .LBB31_2:
16661; GFX1164-NEXT:    s_or_b64 exec, exec, s[0:1]
16662; GFX1164-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
16663; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
16664; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
16665; GFX1164-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
16666; GFX1164-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
16667; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
16668; GFX1164-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
16669; GFX1164-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
16670; GFX1164-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
16671; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
16672; GFX1164-NEXT:    s_mov_b32 s2, -1
16673; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
16674; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
16675; GFX1164-NEXT:    s_endpgm
16676;
16677; GFX1132-LABEL: umin_i64_constant:
16678; GFX1132:       ; %bb.0: ; %entry
16679; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
16680; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
16681; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
16682; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
16683; GFX1132-NEXT:    s_and_saveexec_b32 s0, vcc_lo
16684; GFX1132-NEXT:    s_cbranch_execz .LBB31_2
16685; GFX1132-NEXT:  ; %bb.1:
16686; GFX1132-NEXT:    v_mov_b32_e32 v0, 5
16687; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
16688; GFX1132-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
16689; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
16690; GFX1132-NEXT:    buffer_gl0_inv
16691; GFX1132-NEXT:  .LBB31_2:
16692; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s0
16693; GFX1132-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
16694; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
16695; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
16696; GFX1132-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
16697; GFX1132-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
16698; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
16699; GFX1132-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
16700; GFX1132-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
16701; GFX1132-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
16702; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
16703; GFX1132-NEXT:    s_mov_b32 s2, -1
16704; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
16705; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
16706; GFX1132-NEXT:    s_endpgm
; IR under test: a single atomicrmw umin with a constant operand; %old is the
; value the atomic returns, and it is the only thing written to %out.
16707entry:
16708  %old = atomicrmw umin ptr addrspace(3) @local_var64, i64 5 acq_rel
16709  store i64 %old, ptr addrspace(1) %out
16710  ret void
16711}
16712
16713define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
16714; GFX7LESS_ITERATIVE-LABEL: umin_i64_varying:
16715; GFX7LESS_ITERATIVE:       ; %bb.0: ; %entry
16716; GFX7LESS_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
16717; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
16718; GFX7LESS_ITERATIVE-NEXT:    s_mov_b64 s[0:1], -1
16719; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
16720; GFX7LESS_ITERATIVE-NEXT:  .LBB32_1: ; %ComputeLoop
16721; GFX7LESS_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
16722; GFX7LESS_ITERATIVE-NEXT:    s_ff1_i32_b64 s8, s[2:3]
16723; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, s8
16724; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s9, v3, s8
16725; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s10, v0, s8
16726; GFX7LESS_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
16727; GFX7LESS_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
16728; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s10
16729; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v5, s9
16730; GFX7LESS_ITERATIVE-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[4:5]
16731; GFX7LESS_ITERATIVE-NEXT:    s_and_b64 s[6:7], vcc, exec
16732; GFX7LESS_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s9
16733; GFX7LESS_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s10
16734; GFX7LESS_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s8
16735; GFX7LESS_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
16736; GFX7LESS_ITERATIVE-NEXT:    v_cmp_ne_u64_e64 s[6:7], s[2:3], 0
16737; GFX7LESS_ITERATIVE-NEXT:    s_and_b64 vcc, exec, s[6:7]
16738; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_vccnz .LBB32_1
16739; GFX7LESS_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
16740; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
16741; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
16742; GFX7LESS_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
16743; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
16744; GFX7LESS_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
16745; GFX7LESS_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
16746; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_execz .LBB32_4
16747; GFX7LESS_ITERATIVE-NEXT:  ; %bb.3:
16748; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
16749; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
16750; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
16751; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, -1
16752; GFX7LESS_ITERATIVE-NEXT:    ds_min_rtn_u64 v[3:4], v0, v[3:4]
16753; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
16754; GFX7LESS_ITERATIVE-NEXT:  .LBB32_4:
16755; GFX7LESS_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
16756; GFX7LESS_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
16757; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
16758; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s2, -1
16759; GFX7LESS_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v4
16760; GFX7LESS_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v3
16761; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s5
16762; GFX7LESS_ITERATIVE-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[1:2]
16763; GFX7LESS_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
16764; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s4
16765; GFX7LESS_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
16766; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
16767; GFX7LESS_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
16768; GFX7LESS_ITERATIVE-NEXT:    s_endpgm
16769;
16770; GFX8_ITERATIVE-LABEL: umin_i64_varying:
16771; GFX8_ITERATIVE:       ; %bb.0: ; %entry
16772; GFX8_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
16773; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
16774; GFX8_ITERATIVE-NEXT:    s_mov_b64 s[0:1], -1
16775; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
16776; GFX8_ITERATIVE-NEXT:  .LBB32_1: ; %ComputeLoop
16777; GFX8_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
16778; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s8, s[2:3]
16779; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s9, v3, s8
16780; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s10, v0, s8
16781; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s10
16782; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v5, s9
16783; GFX8_ITERATIVE-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[4:5]
16784; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s8
16785; GFX8_ITERATIVE-NEXT:    s_and_b64 s[6:7], vcc, exec
16786; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
16787; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
16788; GFX8_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s9
16789; GFX8_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s10
16790; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s8
16791; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
16792; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
16793; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB32_1
16794; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
16795; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
16796; GFX8_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
16797; GFX8_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
16798; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
16799; GFX8_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
16800; GFX8_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
16801; GFX8_ITERATIVE-NEXT:    s_cbranch_execz .LBB32_4
16802; GFX8_ITERATIVE-NEXT:  ; %bb.3:
16803; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
16804; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
16805; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
16806; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, -1
16807; GFX8_ITERATIVE-NEXT:    ds_min_rtn_u64 v[3:4], v0, v[3:4]
16808; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
16809; GFX8_ITERATIVE-NEXT:  .LBB32_4:
16810; GFX8_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
16811; GFX8_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
16812; GFX8_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v4
16813; GFX8_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v3
16814; GFX8_ITERATIVE-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[1:2]
16815; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s5
16816; GFX8_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
16817; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s4
16818; GFX8_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
16819; GFX8_ITERATIVE-NEXT:    s_mov_b32 s2, -1
16820; GFX8_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
16821; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
16822; GFX8_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
16823; GFX8_ITERATIVE-NEXT:    s_endpgm
16824;
16825; GFX9_ITERATIVE-LABEL: umin_i64_varying:
16826; GFX9_ITERATIVE:       ; %bb.0: ; %entry
16827; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
16828; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
16829; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[0:1], -1
16830; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
16831; GFX9_ITERATIVE-NEXT:  .LBB32_1: ; %ComputeLoop
16832; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
16833; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s8, s[2:3]
16834; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s9, v3, s8
16835; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s10, v0, s8
16836; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s10
16837; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v5, s9
16838; GFX9_ITERATIVE-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[4:5]
16839; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s8
16840; GFX9_ITERATIVE-NEXT:    s_and_b64 s[6:7], vcc, exec
16841; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
16842; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
16843; GFX9_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s9
16844; GFX9_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s10
16845; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s8
16846; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
16847; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
16848; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB32_1
16849; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
16850; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
16851; GFX9_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
16852; GFX9_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
16853; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
16854; GFX9_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
16855; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
16856; GFX9_ITERATIVE-NEXT:    s_cbranch_execz .LBB32_4
16857; GFX9_ITERATIVE-NEXT:  ; %bb.3:
16858; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
16859; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
16860; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
16861; GFX9_ITERATIVE-NEXT:    ds_min_rtn_u64 v[3:4], v0, v[3:4]
16862; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
16863; GFX9_ITERATIVE-NEXT:  .LBB32_4:
16864; GFX9_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
16865; GFX9_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
16866; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v4
16867; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v3
16868; GFX9_ITERATIVE-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[1:2]
16869; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s5
16870; GFX9_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
16871; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s4
16872; GFX9_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
16873; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, -1
16874; GFX9_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
16875; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
16876; GFX9_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
16877; GFX9_ITERATIVE-NEXT:    s_endpgm
16878;
16879; GFX1064_ITERATIVE-LABEL: umin_i64_varying:
16880; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
16881; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
16882; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
16883; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], -1
16884; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
16885; GFX1064_ITERATIVE-NEXT:  .LBB32_1: ; %ComputeLoop
16886; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
16887; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s10, s[2:3]
16888; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s10
16889; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s10
16890; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s10
16891; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s10
16892; GFX1064_ITERATIVE-NEXT:    v_cmp_lt_u64_e64 s[8:9], s[0:1], s[6:7]
16893; GFX1064_ITERATIVE-NEXT:    s_and_b64 s[8:9], s[8:9], exec
16894; GFX1064_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s7
16895; GFX1064_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
16896; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s10
16897; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
16898; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
16899; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB32_1
16900; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
16901; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
16902; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
16903; GFX1064_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
16904; GFX1064_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
16905; GFX1064_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
16906; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
16907; GFX1064_ITERATIVE-NEXT:    s_cbranch_execz .LBB32_4
16908; GFX1064_ITERATIVE-NEXT:  ; %bb.3:
16909; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
16910; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
16911; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
16912; GFX1064_ITERATIVE-NEXT:    ds_min_rtn_u64 v[3:4], v0, v[3:4]
16913; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
16914; GFX1064_ITERATIVE-NEXT:    buffer_gl0_inv
16915; GFX1064_ITERATIVE-NEXT:  .LBB32_4:
16916; GFX1064_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
16917; GFX1064_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
16918; GFX1064_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
16919; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v4
16920; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
16921; GFX1064_ITERATIVE-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[1:2]
16922; GFX1064_ITERATIVE-NEXT:    v_cndmask_b32_e64 v2, v2, s3, vcc
16923; GFX1064_ITERATIVE-NEXT:    v_cndmask_b32_e64 v1, v1, s2, vcc
16924; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
16925; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, -1
16926; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
16927; GFX1064_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
16928; GFX1064_ITERATIVE-NEXT:    s_endpgm
16929;
16930; GFX1032_ITERATIVE-LABEL: umin_i64_varying:
16931; GFX1032_ITERATIVE:       ; %bb.0: ; %entry
16932; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
16933; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, exec_lo
16934; GFX1032_ITERATIVE-NEXT:    s_mov_b64 s[0:1], -1
16935; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
16936; GFX1032_ITERATIVE-NEXT:  .LBB32_1: ; %ComputeLoop
16937; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
16938; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s3, s2
16939; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
16940; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
16941; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s3
16942; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s3
16943; GFX1032_ITERATIVE-NEXT:    v_cmp_lt_u64_e64 s8, s[0:1], s[6:7]
16944; GFX1032_ITERATIVE-NEXT:    s_and_b32 s8, s8, exec_lo
16945; GFX1032_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s7
16946; GFX1032_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
16947; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
16948; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s2, s2, s3
16949; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
16950; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB32_1
16951; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
16952; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
16953; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
16954; GFX1032_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
16955; GFX1032_ITERATIVE-NEXT:    s_and_saveexec_b32 s2, vcc_lo
16956; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s2, exec_lo, s2
16957; GFX1032_ITERATIVE-NEXT:    s_cbranch_execz .LBB32_4
16958; GFX1032_ITERATIVE-NEXT:  ; %bb.3:
16959; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
16960; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
16961; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
16962; GFX1032_ITERATIVE-NEXT:    ds_min_rtn_u64 v[3:4], v0, v[3:4]
16963; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
16964; GFX1032_ITERATIVE-NEXT:    buffer_gl0_inv
16965; GFX1032_ITERATIVE-NEXT:  .LBB32_4:
16966; GFX1032_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
16967; GFX1032_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s2
16968; GFX1032_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
16969; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v4
16970; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
16971; GFX1032_ITERATIVE-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[1:2]
16972; GFX1032_ITERATIVE-NEXT:    v_cndmask_b32_e64 v2, v2, s3, vcc_lo
16973; GFX1032_ITERATIVE-NEXT:    v_cndmask_b32_e64 v1, v1, s2, vcc_lo
16974; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
16975; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, -1
16976; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
16977; GFX1032_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
16978; GFX1032_ITERATIVE-NEXT:    s_endpgm
16979;
16980; GFX1164_ITERATIVE-LABEL: umin_i64_varying:
16981; GFX1164_ITERATIVE:       ; %bb.0: ; %entry
16982; GFX1164_ITERATIVE-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
16983; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
16984; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
16985; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[0:1], -1
16986; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
16987; GFX1164_ITERATIVE-NEXT:    .p2align 6
16988; GFX1164_ITERATIVE-NEXT:  .LBB32_1: ; %ComputeLoop
16989; GFX1164_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
16990; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s10, s[2:3]
16991; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
16992; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s10
16993; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s10
16994; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s10
16995; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s10
16996; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
16997; GFX1164_ITERATIVE-NEXT:    v_cmp_lt_u64_e64 s[8:9], s[0:1], s[6:7]
16998; GFX1164_ITERATIVE-NEXT:    s_and_b64 s[8:9], s[8:9], exec
16999; GFX1164_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s7
17000; GFX1164_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
17001; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s10
17002; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
17003; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[6:7]
17004; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
17005; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB32_1
17006; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
17007; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
17008; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
17009; GFX1164_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
17010; GFX1164_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
17011; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr2_vgpr3
17012; GFX1164_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
17013; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
17014; GFX1164_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
17015; GFX1164_ITERATIVE-NEXT:    s_cbranch_execz .LBB32_4
17016; GFX1164_ITERATIVE-NEXT:  ; %bb.3:
17017; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s1
17018; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v4, 0
17019; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s0
17020; GFX1164_ITERATIVE-NEXT:    ds_min_rtn_u64 v[2:3], v4, v[2:3]
17021; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
17022; GFX1164_ITERATIVE-NEXT:    buffer_gl0_inv
17023; GFX1164_ITERATIVE-NEXT:  .LBB32_4:
17024; GFX1164_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
17025; GFX1164_ITERATIVE-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
17026; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v3
17027; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v2
17028; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
17029; GFX1164_ITERATIVE-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
17030; GFX1164_ITERATIVE-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
17031; GFX1164_ITERATIVE-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
17032; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
17033; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
17034; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
17035; GFX1164_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
17036; GFX1164_ITERATIVE-NEXT:    s_endpgm
17037;
17038; GFX1132_ITERATIVE-LABEL: umin_i64_varying:
17039; GFX1132_ITERATIVE:       ; %bb.0: ; %entry
17040; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0
17041; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, exec_lo
17042; GFX1132_ITERATIVE-NEXT:    s_mov_b64 s[0:1], -1
17043; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
17044; GFX1132_ITERATIVE-NEXT:    .p2align 6
17045; GFX1132_ITERATIVE-NEXT:  .LBB32_1: ; %ComputeLoop
17046; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
17047; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s3, s2
17048; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
17049; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
17050; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
17051; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s3
17052; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
17053; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
17054; GFX1132_ITERATIVE-NEXT:    v_cmp_lt_u64_e64 s8, s[0:1], s[6:7]
17055; GFX1132_ITERATIVE-NEXT:    s_and_b32 s8, s8, exec_lo
17056; GFX1132_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s7
17057; GFX1132_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
17058; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
17059; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
17060; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s2, s2, s3
17061; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
17062; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB32_1
17063; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
17064; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
17065; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
17066; GFX1132_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
17067; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr2_vgpr3
17068; GFX1132_ITERATIVE-NEXT:    s_and_saveexec_b32 s2, vcc_lo
17069; GFX1132_ITERATIVE-NEXT:    s_xor_b32 s2, exec_lo, s2
17070; GFX1132_ITERATIVE-NEXT:    s_cbranch_execz .LBB32_4
17071; GFX1132_ITERATIVE-NEXT:  ; %bb.3:
17072; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1
17073; GFX1132_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s0
17074; GFX1132_ITERATIVE-NEXT:    ds_min_rtn_u64 v[2:3], v4, v[2:3]
17075; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
17076; GFX1132_ITERATIVE-NEXT:    buffer_gl0_inv
17077; GFX1132_ITERATIVE-NEXT:  .LBB32_4:
17078; GFX1132_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s2
17079; GFX1132_ITERATIVE-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
17080; GFX1132_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v3
17081; GFX1132_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v2
17082; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
17083; GFX1132_ITERATIVE-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
17084; GFX1132_ITERATIVE-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
17085; GFX1132_ITERATIVE-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
17086; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
17087; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
17088; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
17089; GFX1132_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
17090; GFX1132_ITERATIVE-NEXT:    s_endpgm
17091;
17092; GFX7LESS_DPP-LABEL: umin_i64_varying:
17093; GFX7LESS_DPP:       ; %bb.0: ; %entry
17094; GFX7LESS_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
17095; GFX7LESS_DPP-NEXT:    v_mov_b32_e32 v1, 0
17096; GFX7LESS_DPP-NEXT:    s_mov_b32 m0, -1
17097; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
17098; GFX7LESS_DPP-NEXT:    ds_min_rtn_u64 v[0:1], v1, v[0:1]
17099; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
17100; GFX7LESS_DPP-NEXT:    s_mov_b32 s3, 0xf000
17101; GFX7LESS_DPP-NEXT:    s_mov_b32 s2, -1
17102; GFX7LESS_DPP-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
17103; GFX7LESS_DPP-NEXT:    s_endpgm
17104;
17105; GFX8_DPP-LABEL: umin_i64_varying:
17106; GFX8_DPP:       ; %bb.0: ; %entry
17107; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v6, exec_lo, 0
17108; GFX8_DPP-NEXT:    v_mov_b32_e32 v8, 0
17109; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v6, exec_hi, v6
17110; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
17111; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v3, -1, 0, s[0:1]
17112; GFX8_DPP-NEXT:    v_cndmask_b32_e64 v2, -1, v0, s[0:1]
17113; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, -1
17114; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, -1
17115; GFX8_DPP-NEXT:    v_mov_b32_e32 v1, -1
17116; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
17117; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf
17118; GFX8_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5]
17119; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
17120; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
17121; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, -1
17122; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, -1
17123; GFX8_DPP-NEXT:    s_nop 0
17124; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
17125; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf
17126; GFX8_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5]
17127; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
17128; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
17129; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, -1
17130; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, -1
17131; GFX8_DPP-NEXT:    s_nop 0
17132; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
17133; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
17134; GFX8_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5]
17135; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
17136; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
17137; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, -1
17138; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, -1
17139; GFX8_DPP-NEXT:    s_nop 0
17140; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
17141; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf
17142; GFX8_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5]
17143; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
17144; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
17145; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, -1
17146; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, -1
17147; GFX8_DPP-NEXT:    s_nop 0
17148; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
17149; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
17150; GFX8_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5]
17151; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
17152; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
17153; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, -1
17154; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, -1
17155; GFX8_DPP-NEXT:    s_nop 0
17156; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
17157; GFX8_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
17158; GFX8_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5]
17159; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc
17160; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
17161; GFX8_DPP-NEXT:    v_mov_b32_e32 v2, -1
17162; GFX8_DPP-NEXT:    v_readlane_b32 s3, v3, 63
17163; GFX8_DPP-NEXT:    v_readlane_b32 s2, v4, 63
17164; GFX8_DPP-NEXT:    v_mov_b32_dpp v2, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
17165; GFX8_DPP-NEXT:    v_mov_b32_dpp v1, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
17166; GFX8_DPP-NEXT:    s_mov_b64 exec, s[0:1]
17167; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
17168; GFX8_DPP-NEXT:    ; implicit-def: $vgpr6_vgpr7
17169; GFX8_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
17170; GFX8_DPP-NEXT:    s_cbranch_execz .LBB32_2
17171; GFX8_DPP-NEXT:  ; %bb.1:
17172; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, s3
17173; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, s2
17174; GFX8_DPP-NEXT:    s_mov_b32 m0, -1
17175; GFX8_DPP-NEXT:    ds_min_rtn_u64 v[6:7], v8, v[6:7]
17176; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
17177; GFX8_DPP-NEXT:  .LBB32_2:
17178; GFX8_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
17179; GFX8_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
17180; GFX8_DPP-NEXT:    v_readfirstlane_b32 s5, v7
17181; GFX8_DPP-NEXT:    v_readfirstlane_b32 s4, v6
17182; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v1
17183; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, v2
17184; GFX8_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[6:7]
17185; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, s5
17186; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v7, v7, v0, vcc
17187; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, s4
17188; GFX8_DPP-NEXT:    s_mov_b32 s3, 0xf000
17189; GFX8_DPP-NEXT:    s_mov_b32 s2, -1
17190; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v6, v6, v0, vcc
17191; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
17192; GFX8_DPP-NEXT:    buffer_store_dwordx2 v[6:7], off, s[0:3], 0
17193; GFX8_DPP-NEXT:    s_endpgm
17194;
17195; GFX9_DPP-LABEL: umin_i64_varying:
17196; GFX9_DPP:       ; %bb.0: ; %entry
17197; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v6, exec_lo, 0
17198; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, 0
17199; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v6, exec_hi, v6
17200; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
17201; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v3, -1, 0, s[0:1]
17202; GFX9_DPP-NEXT:    v_cndmask_b32_e64 v2, -1, v0, s[0:1]
17203; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, -1
17204; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, -1
17205; GFX9_DPP-NEXT:    v_mov_b32_e32 v1, -1
17206; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
17207; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf
17208; GFX9_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5]
17209; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
17210; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
17211; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, -1
17212; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, -1
17213; GFX9_DPP-NEXT:    s_nop 0
17214; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
17215; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf
17216; GFX9_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5]
17217; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
17218; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
17219; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, -1
17220; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, -1
17221; GFX9_DPP-NEXT:    s_nop 0
17222; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
17223; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
17224; GFX9_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5]
17225; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
17226; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
17227; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, -1
17228; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, -1
17229; GFX9_DPP-NEXT:    s_nop 0
17230; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
17231; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf
17232; GFX9_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5]
17233; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
17234; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
17235; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, -1
17236; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, -1
17237; GFX9_DPP-NEXT:    s_nop 0
17238; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
17239; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
17240; GFX9_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5]
17241; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
17242; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
17243; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, -1
17244; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, -1
17245; GFX9_DPP-NEXT:    s_nop 0
17246; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
17247; GFX9_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
17248; GFX9_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5]
17249; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc
17250; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
17251; GFX9_DPP-NEXT:    v_mov_b32_e32 v2, -1
17252; GFX9_DPP-NEXT:    v_readlane_b32 s3, v3, 63
17253; GFX9_DPP-NEXT:    v_readlane_b32 s2, v4, 63
17254; GFX9_DPP-NEXT:    v_mov_b32_dpp v2, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
17255; GFX9_DPP-NEXT:    v_mov_b32_dpp v1, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
17256; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
17257; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
17258; GFX9_DPP-NEXT:    ; implicit-def: $vgpr6_vgpr7
17259; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
17260; GFX9_DPP-NEXT:    s_cbranch_execz .LBB32_2
17261; GFX9_DPP-NEXT:  ; %bb.1:
17262; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, s3
17263; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, s2
17264; GFX9_DPP-NEXT:    ds_min_rtn_u64 v[6:7], v8, v[6:7]
17265; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
17266; GFX9_DPP-NEXT:  .LBB32_2:
17267; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
17268; GFX9_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
17269; GFX9_DPP-NEXT:    v_readfirstlane_b32 s5, v7
17270; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v6
17271; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v1
17272; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, v2
17273; GFX9_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[6:7]
17274; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s5
17275; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v7, v7, v0, vcc
17276; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s4
17277; GFX9_DPP-NEXT:    s_mov_b32 s3, 0xf000
17278; GFX9_DPP-NEXT:    s_mov_b32 s2, -1
17279; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v6, v6, v0, vcc
17280; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
17281; GFX9_DPP-NEXT:    buffer_store_dwordx2 v[6:7], off, s[0:3], 0
17282; GFX9_DPP-NEXT:    s_endpgm
17283;
17284; GFX1064_DPP-LABEL: umin_i64_varying:
17285; GFX1064_DPP:       ; %bb.0: ; %entry
17286; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
17287; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v2, -1, 0, s[0:1]
17288; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, -1
17289; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, -1
17290; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[0:1]
17291; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, -1
17292; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, -1
17293; GFX1064_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf
17294; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
17295; GFX1064_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[1:2], v[3:4]
17296; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
17297; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
17298; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, -1
17299; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, -1
17300; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf
17301; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf
17302; GFX1064_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
17303; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
17304; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
17305; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, -1
17306; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, -1
17307; GFX1064_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
17308; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf
17309; GFX1064_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[1:2], v[3:4]
17310; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
17311; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
17312; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, -1
17313; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, -1
17314; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf
17315; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf
17316; GFX1064_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
17317; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
17318; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
17319; GFX1064_DPP-NEXT:    v_permlanex16_b32 v5, v2, -1, -1
17320; GFX1064_DPP-NEXT:    v_permlanex16_b32 v6, v1, -1, -1
17321; GFX1064_DPP-NEXT:    v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
17322; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
17323; GFX1064_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[1:2], v[3:4]
17324; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
17325; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
17326; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, -1
17327; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, -1
17328; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v2, 31
17329; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v1, 31
17330; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, s2
17331; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, s3
17332; GFX1064_DPP-NEXT:    v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
17333; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
17334; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, -1
17335; GFX1064_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[1:2], v[3:4]
17336; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
17337; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, -1
17338; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
17339; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
17340; GFX1064_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
17341; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
17342; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
17343; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v2, 15
17344; GFX1064_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf
17345; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v1, 15
17346; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v2, 31
17347; GFX1064_DPP-NEXT:    v_readlane_b32 s7, v1, 31
17348; GFX1064_DPP-NEXT:    v_writelane_b32 v5, s2, 16
17349; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 63
17350; GFX1064_DPP-NEXT:    v_writelane_b32 v4, s3, 16
17351; GFX1064_DPP-NEXT:    v_readlane_b32 s8, v2, 47
17352; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v2, 63
17353; GFX1064_DPP-NEXT:    v_readlane_b32 s9, v1, 47
17354; GFX1064_DPP-NEXT:    v_writelane_b32 v5, s6, 32
17355; GFX1064_DPP-NEXT:    v_writelane_b32 v4, s7, 32
17356; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
17357; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v0
17358; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, 0
17359; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
17360; GFX1064_DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
17361; GFX1064_DPP-NEXT:    v_writelane_b32 v5, s8, 48
17362; GFX1064_DPP-NEXT:    v_writelane_b32 v4, s9, 48
17363; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[6:7]
17364; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
17365; GFX1064_DPP-NEXT:    s_mov_b32 s2, -1
17366; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
17367; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[6:7], vcc
17368; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB32_2
17369; GFX1064_DPP-NEXT:  ; %bb.1:
17370; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, s1
17371; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, s0
17372; GFX1064_DPP-NEXT:    ds_min_rtn_u64 v[7:8], v0, v[7:8]
17373; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
17374; GFX1064_DPP-NEXT:    buffer_gl0_inv
17375; GFX1064_DPP-NEXT:  .LBB32_2:
17376; GFX1064_DPP-NEXT:    s_waitcnt_depctr 0xffe3
17377; GFX1064_DPP-NEXT:    s_or_b64 exec, exec, s[6:7]
17378; GFX1064_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
17379; GFX1064_DPP-NEXT:    s_mov_b32 null, 0
17380; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s5, v8
17381; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s4, v7
17382; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, v4
17383; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, v5
17384; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
17385; GFX1064_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[7:8]
17386; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v8, v8, s5, vcc
17387; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v7, v7, s4, vcc
17388; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
17389; GFX1064_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[0:3], 0
17390; GFX1064_DPP-NEXT:    s_endpgm
17391;
17392; GFX1032_DPP-LABEL: umin_i64_varying:
17393; GFX1032_DPP:       ; %bb.0: ; %entry
17394; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s2, -1
17395; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v2, -1, 0, s2
17396; GFX1032_DPP-NEXT:    v_mov_b32_e32 v4, -1
17397; GFX1032_DPP-NEXT:    v_mov_b32_e32 v3, -1
17398; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s2
17399; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, -1
17400; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, -1
17401; GFX1032_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf
17402; GFX1032_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
17403; GFX1032_DPP-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[1:2], v[3:4]
17404; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc_lo
17405; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
17406; GFX1032_DPP-NEXT:    v_mov_b32_e32 v4, -1
17407; GFX1032_DPP-NEXT:    v_mov_b32_e32 v3, -1
17408; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf
17409; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf
17410; GFX1032_DPP-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[1:2], v[5:6]
17411; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc_lo
17412; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
17413; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, -1
17414; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, -1
17415; GFX1032_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
17416; GFX1032_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf
17417; GFX1032_DPP-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[1:2], v[3:4]
17418; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc_lo
17419; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
17420; GFX1032_DPP-NEXT:    v_mov_b32_e32 v4, -1
17421; GFX1032_DPP-NEXT:    v_mov_b32_e32 v3, -1
17422; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf
17423; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf
17424; GFX1032_DPP-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[1:2], v[5:6]
17425; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc_lo
17426; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
17427; GFX1032_DPP-NEXT:    v_permlanex16_b32 v5, v2, -1, -1
17428; GFX1032_DPP-NEXT:    v_permlanex16_b32 v6, v1, -1, -1
17429; GFX1032_DPP-NEXT:    v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
17430; GFX1032_DPP-NEXT:    v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
17431; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, -1
17432; GFX1032_DPP-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[1:2], v[3:4]
17433; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc_lo
17434; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
17435; GFX1032_DPP-NEXT:    v_mov_b32_e32 v4, -1
17436; GFX1032_DPP-NEXT:    v_readlane_b32 s3, v2, 15
17437; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v2, 31
17438; GFX1032_DPP-NEXT:    v_readlane_b32 s0, v1, 31
17439; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
17440; GFX1032_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf
17441; GFX1032_DPP-NEXT:    v_readlane_b32 s6, v1, 15
17442; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s2
17443; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
17444; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, 0
17445; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s2, -1
17446; GFX1032_DPP-NEXT:    v_writelane_b32 v5, s3, 16
17447; GFX1032_DPP-NEXT:    v_writelane_b32 v4, s6, 16
17448; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s2
17449; GFX1032_DPP-NEXT:    s_mov_b32 s2, -1
17450; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
17451; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
17452; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s3, vcc_lo
17453; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB32_2
17454; GFX1032_DPP-NEXT:  ; %bb.1:
17455; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, s1
17456; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, s0
17457; GFX1032_DPP-NEXT:    ds_min_rtn_u64 v[7:8], v0, v[7:8]
17458; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
17459; GFX1032_DPP-NEXT:    buffer_gl0_inv
17460; GFX1032_DPP-NEXT:  .LBB32_2:
17461; GFX1032_DPP-NEXT:    s_waitcnt_depctr 0xffe3
17462; GFX1032_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s3
17463; GFX1032_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
17464; GFX1032_DPP-NEXT:    s_mov_b32 null, 0
17465; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s5, v8
17466; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s4, v7
17467; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, v4
17468; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, v5
17469; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
17470; GFX1032_DPP-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[7:8]
17471; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v8, v8, s5, vcc_lo
17472; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v7, v7, s4, vcc_lo
17473; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
17474; GFX1032_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[0:3], 0
17475; GFX1032_DPP-NEXT:    s_endpgm
17476;
17477; GFX1164_DPP-LABEL: umin_i64_varying:
17478; GFX1164_DPP:       ; %bb.0: ; %entry
17479; GFX1164_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
17480; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
17481; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
17482; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v2, -1, 0, s[0:1]
17483; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, -1
17484; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, -1
17485; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s[0:1]
17486; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, -1
17487; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, -1
17488; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf
17489; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
17490; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
17491; GFX1164_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[1:2], v[3:4]
17492; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
17493; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
17494; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, -1
17495; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, -1
17496; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
17497; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf
17498; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf
17499; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
17500; GFX1164_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
17501; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
17502; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
17503; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, -1
17504; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, -1
17505; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
17506; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
17507; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf
17508; GFX1164_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[1:2], v[3:4]
17509; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
17510; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
17511; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, -1
17512; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, -1
17513; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
17514; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf
17515; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf
17516; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
17517; GFX1164_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
17518; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
17519; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
17520; GFX1164_DPP-NEXT:    v_permlanex16_b32 v5, v2, -1, -1
17521; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
17522; GFX1164_DPP-NEXT:    v_permlanex16_b32 v6, v1, -1, -1
17523; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
17524; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
17525; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
17526; GFX1164_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[1:2], v[3:4]
17527; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
17528; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
17529; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, -1
17530; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, -1
17531; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
17532; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v2, 31
17533; GFX1164_DPP-NEXT:    v_readlane_b32 s3, v1, 31
17534; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
17535; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, s2
17536; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, s3
17537; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
17538; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
17539; GFX1164_DPP-NEXT:    v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
17540; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, -1
17541; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
17542; GFX1164_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[1:2], v[3:4]
17543; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
17544; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, -1
17545; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
17546; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
17547; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
17548; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
17549; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
17550; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v2, 15
17551; GFX1164_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf
17552; GFX1164_DPP-NEXT:    v_readlane_b32 s3, v1, 15
17553; GFX1164_DPP-NEXT:    v_readlane_b32 s6, v2, 31
17554; GFX1164_DPP-NEXT:    v_readlane_b32 s7, v1, 31
17555; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s2, 16
17556; GFX1164_DPP-NEXT:    v_readlane_b32 s2, v1, 63
17557; GFX1164_DPP-NEXT:    v_writelane_b32 v4, s3, 16
17558; GFX1164_DPP-NEXT:    v_readlane_b32 s8, v2, 47
17559; GFX1164_DPP-NEXT:    v_readlane_b32 s3, v2, 63
17560; GFX1164_DPP-NEXT:    v_readlane_b32 s9, v1, 47
17561; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s6, 32
17562; GFX1164_DPP-NEXT:    v_writelane_b32 v4, s7, 32
17563; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
17564; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
17565; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v0
17566; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, 0
17567; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
17568; GFX1164_DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
17569; GFX1164_DPP-NEXT:    v_writelane_b32 v5, s8, 48
17570; GFX1164_DPP-NEXT:    v_writelane_b32 v4, s9, 48
17571; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[6:7]
17572; GFX1164_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
17573; GFX1164_DPP-NEXT:    s_mov_b32 s2, -1
17574; GFX1164_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
17575; GFX1164_DPP-NEXT:    s_and_saveexec_b64 s[6:7], vcc
17576; GFX1164_DPP-NEXT:    s_cbranch_execz .LBB32_2
17577; GFX1164_DPP-NEXT:  ; %bb.1:
17578; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, s1
17579; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, s0
17580; GFX1164_DPP-NEXT:    ds_min_rtn_u64 v[7:8], v0, v[7:8]
17581; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
17582; GFX1164_DPP-NEXT:    buffer_gl0_inv
17583; GFX1164_DPP-NEXT:  .LBB32_2:
17584; GFX1164_DPP-NEXT:    s_or_b64 exec, exec, s[6:7]
17585; GFX1164_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
17586; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s5, v8
17587; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s4, v7
17588; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, v4
17589; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, v5
17590; GFX1164_DPP-NEXT:    s_mov_b32 s3, 0x31016000
17591; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
17592; GFX1164_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[7:8]
17593; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v8, v8, s5, vcc
17594; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v7, v7, s4, vcc
17595; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
17596; GFX1164_DPP-NEXT:    buffer_store_b64 v[7:8], off, s[0:3], 0
17597; GFX1164_DPP-NEXT:    s_endpgm
17598;
17599; GFX1132_DPP-LABEL: umin_i64_varying:
17600; GFX1132_DPP:       ; %bb.0: ; %entry
17601; GFX1132_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
17602; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s2, -1
17603; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
17604; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v2, -1, 0, s2
17605; GFX1132_DPP-NEXT:    v_dual_mov_b32 v4, -1 :: v_dual_mov_b32 v3, -1
17606; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v1, -1, v0, s2
17607; GFX1132_DPP-NEXT:    v_dual_mov_b32 v6, -1 :: v_dual_mov_b32 v5, -1
17608; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
17609; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf
17610; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
17611; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
17612; GFX1132_DPP-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[1:2], v[3:4]
17613; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v1, v3, v1
17614; GFX1132_DPP-NEXT:    v_dual_mov_b32 v4, -1 :: v_dual_mov_b32 v3, -1
17615; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf
17616; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
17617; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf
17618; GFX1132_DPP-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[1:2], v[5:6]
17619; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v1, v5, v1
17620; GFX1132_DPP-NEXT:    v_dual_mov_b32 v6, -1 :: v_dual_mov_b32 v5, -1
17621; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
17622; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
17623; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf
17624; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
17625; GFX1132_DPP-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[1:2], v[3:4]
17626; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v1, v3, v1
17627; GFX1132_DPP-NEXT:    v_dual_mov_b32 v4, -1 :: v_dual_mov_b32 v3, -1
17628; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf
17629; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
17630; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf
17631; GFX1132_DPP-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[1:2], v[5:6]
17632; GFX1132_DPP-NEXT:    v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v1, v5, v1
17633; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
17634; GFX1132_DPP-NEXT:    v_permlanex16_b32 v5, v2, -1, -1
17635; GFX1132_DPP-NEXT:    v_permlanex16_b32 v6, v1, -1, -1
17636; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
17637; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
17638; GFX1132_DPP-NEXT:    v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
17639; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
17640; GFX1132_DPP-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[1:2], v[3:4]
17641; GFX1132_DPP-NEXT:    v_dual_mov_b32 v5, -1 :: v_dual_cndmask_b32 v2, v4, v2
17642; GFX1132_DPP-NEXT:    v_dual_mov_b32 v4, -1 :: v_dual_cndmask_b32 v1, v3, v1
17643; GFX1132_DPP-NEXT:    v_readlane_b32 s3, v2, 15
17644; GFX1132_DPP-NEXT:    v_readlane_b32 s1, v2, 31
17645; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
17646; GFX1132_DPP-NEXT:    v_readlane_b32 s0, v1, 31
17647; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
17648; GFX1132_DPP-NEXT:    v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf
17649; GFX1132_DPP-NEXT:    v_readlane_b32 s6, v1, 15
17650; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
17651; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
17652; GFX1132_DPP-NEXT:    v_mov_b32_e32 v0, 0
17653; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s2, -1
17654; GFX1132_DPP-NEXT:    v_writelane_b32 v5, s3, 16
17655; GFX1132_DPP-NEXT:    v_writelane_b32 v4, s6, 16
17656; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
17657; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
17658; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
17659; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
17660; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s3, vcc_lo
17661; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB32_2
17662; GFX1132_DPP-NEXT:  ; %bb.1:
17663; GFX1132_DPP-NEXT:    v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0
17664; GFX1132_DPP-NEXT:    ds_min_rtn_u64 v[7:8], v0, v[7:8]
17665; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
17666; GFX1132_DPP-NEXT:    buffer_gl0_inv
17667; GFX1132_DPP-NEXT:  .LBB32_2:
17668; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s3
17669; GFX1132_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
17670; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s5, v8
17671; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s4, v7
17672; GFX1132_DPP-NEXT:    v_mov_b32_e32 v7, v4
17673; GFX1132_DPP-NEXT:    v_mov_b32_e32 v8, v5
17674; GFX1132_DPP-NEXT:    s_mov_b32 s3, 0x31016000
17675; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
17676; GFX1132_DPP-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[7:8]
17677; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v8, v8, s5, vcc_lo
17678; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v7, v7, s4, vcc_lo
17679; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
17680; GFX1132_DPP-NEXT:    buffer_store_b64 v[7:8], off, s[0:3], 0
17681; GFX1132_DPP-NEXT:    s_endpgm
17682entry:
17683  %lane = call i32 @llvm.amdgcn.workitem.id.x()
17684  %lane_ext = zext i32 %lane to i64
17685  %old = atomicrmw umin ptr addrspace(3) @local_var32, i64 %lane_ext acq_rel
17686  store i64 %old, ptr addrspace(1) %out
17687  ret void
17688}
17689