; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-- -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX8 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s

declare i1 @llvm.amdgcn.wqm.vote(i1)
declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32 immarg)
declare void @llvm.amdgcn.raw.ptr.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32 immarg)

; Show what the atomic optimization pass will do for raw buffers.

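; add_i32_constant: the addend is a uniform constant, so the checks below show
; the pass counting the active lanes (s_bcnt1 of the saved exec mask), issuing a
; single buffer_atomic_add of count*5 from one lane, and then reconstructing each
; lane's own result as readfirstlane(old) + mbcnt*5 (the v_mad_u32_u24).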
define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspace(8) inreg %inout) {
; GFX7-LABEL: add_i32_constant:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_mov_b64 s[10:11], exec
; GFX7-NEXT:    ; implicit-def: $vgpr0
; GFX7-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
; GFX7-NEXT:    s_cbranch_execz .LBB0_4
; GFX7-NEXT:  ; %bb.1:
; GFX7-NEXT:    s_mov_b64 s[12:13], exec
; GFX7-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s12, 0
; GFX7-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s13, v0
; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7-NEXT:    ; implicit-def: $vgpr1
; GFX7-NEXT:    s_and_saveexec_b64 s[10:11], vcc
; GFX7-NEXT:    s_cbranch_execz .LBB0_3
; GFX7-NEXT:  ; %bb.2:
; GFX7-NEXT:    s_bcnt1_i32_b64 s12, s[12:13]
; GFX7-NEXT:    s_mul_i32 s12, s12, 5
; GFX7-NEXT:    v_mov_b32_e32 v1, s12
; GFX7-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
; GFX7-NEXT:  .LBB0_3:
; GFX7-NEXT:    s_or_b64 exec, exec, s[10:11]
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_readfirstlane_b32 s4, v1
; GFX7-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
; GFX7-NEXT:  .LBB0_4: ; %Flow
; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT:    s_wqm_b64 s[4:5], -1
; GFX7-NEXT:    s_and_b64 s[4:5], s[4:5], s[4:5]
; GFX7-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
; GFX7-NEXT:    s_cbranch_vccnz .LBB0_6
; GFX7-NEXT:  ; %bb.5: ; %if
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:  .LBB0_6: ; %UnifiedReturnBlock
; GFX7-NEXT:    s_endpgm
;
; GFX89-LABEL: add_i32_constant:
; GFX89:       ; %bb.0: ; %entry
; GFX89-NEXT:    s_mov_b64 s[10:11], exec
; GFX89-NEXT:    ; implicit-def: $vgpr0
; GFX89-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
; GFX89-NEXT:    s_cbranch_execz .LBB0_4
; GFX89-NEXT:  ; %bb.1:
; GFX89-NEXT:    s_mov_b64 s[12:13], exec
; GFX89-NEXT:    v_mbcnt_lo_u32_b32 v0, s12, 0
; GFX89-NEXT:    v_mbcnt_hi_u32_b32 v0, s13, v0
; GFX89-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX89-NEXT:    ; implicit-def: $vgpr1
; GFX89-NEXT:    s_and_saveexec_b64 s[10:11], vcc
; GFX89-NEXT:    s_cbranch_execz .LBB0_3
; GFX89-NEXT:  ; %bb.2:
; GFX89-NEXT:    s_bcnt1_i32_b64 s12, s[12:13]
; GFX89-NEXT:    s_mul_i32 s12, s12, 5
; GFX89-NEXT:    v_mov_b32_e32 v1, s12
; GFX89-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
; GFX89-NEXT:  .LBB0_3:
; GFX89-NEXT:    s_or_b64 exec, exec, s[10:11]
; GFX89-NEXT:    s_waitcnt vmcnt(0)
; GFX89-NEXT:    v_readfirstlane_b32 s4, v1
; GFX89-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
; GFX89-NEXT:  .LBB0_4: ; %Flow
; GFX89-NEXT:    s_or_b64 exec, exec, s[8:9]
; GFX89-NEXT:    s_wqm_b64 s[4:5], -1
; GFX89-NEXT:    s_and_b64 s[4:5], s[4:5], s[4:5]
; GFX89-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
; GFX89-NEXT:    s_cbranch_vccnz .LBB0_6
; GFX89-NEXT:  ; %bb.5: ; %if
; GFX89-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX89-NEXT:  .LBB0_6: ; %UnifiedReturnBlock
; GFX89-NEXT:    s_endpgm
;
; GFX1064-LABEL: add_i32_constant:
; GFX1064:       ; %bb.0: ; %entry
; GFX1064-NEXT:    s_mov_b64 s[10:11], exec
; GFX1064-NEXT:    ; implicit-def: $vgpr0
; GFX1064-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
; GFX1064-NEXT:    s_cbranch_execz .LBB0_4
; GFX1064-NEXT:  ; %bb.1:
; GFX1064-NEXT:    s_mov_b64 s[12:13], exec
; GFX1064-NEXT:    ; implicit-def: $vgpr1
; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s12, 0
; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s13, v0
; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT:    s_and_saveexec_b64 s[10:11], vcc
; GFX1064-NEXT:    s_cbranch_execz .LBB0_3
; GFX1064-NEXT:  ; %bb.2:
; GFX1064-NEXT:    s_bcnt1_i32_b64 s12, s[12:13]
; GFX1064-NEXT:    s_mul_i32 s12, s12, 5
; GFX1064-NEXT:    v_mov_b32_e32 v1, s12
; GFX1064-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
; GFX1064-NEXT:  .LBB0_3:
; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1064-NEXT:    s_or_b64 exec, exec, s[10:11]
; GFX1064-NEXT:    s_waitcnt vmcnt(0)
; GFX1064-NEXT:    v_readfirstlane_b32 s4, v1
; GFX1064-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
; GFX1064-NEXT:  .LBB0_4: ; %Flow
; GFX1064-NEXT:    s_or_b64 exec, exec, s[8:9]
; GFX1064-NEXT:    s_wqm_b64 s[4:5], -1
; GFX1064-NEXT:    s_and_b64 s[4:5], s[4:5], s[4:5]
; GFX1064-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
; GFX1064-NEXT:    s_cbranch_vccnz .LBB0_6
; GFX1064-NEXT:  ; %bb.5: ; %if
; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT:  .LBB0_6: ; %UnifiedReturnBlock
; GFX1064-NEXT:    s_endpgm
;
; GFX1032-LABEL: add_i32_constant:
; GFX1032:       ; %bb.0: ; %entry
; GFX1032-NEXT:    s_mov_b32 s9, exec_lo
; GFX1032-NEXT:    ; implicit-def: $vgpr0
; GFX1032-NEXT:    s_and_saveexec_b32 s8, s9
; GFX1032-NEXT:    s_cbranch_execz .LBB0_4
; GFX1032-NEXT:  ; %bb.1:
; GFX1032-NEXT:    s_mov_b32 s10, exec_lo
; GFX1032-NEXT:    ; implicit-def: $vgpr1
; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s10, 0
; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT:    s_and_saveexec_b32 s9, vcc_lo
; GFX1032-NEXT:    s_cbranch_execz .LBB0_3
; GFX1032-NEXT:  ; %bb.2:
; GFX1032-NEXT:    s_bcnt1_i32_b32 s10, s10
; GFX1032-NEXT:    s_mul_i32 s10, s10, 5
; GFX1032-NEXT:    v_mov_b32_e32 v1, s10
; GFX1032-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
; GFX1032-NEXT:  .LBB0_3:
; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s9
; GFX1032-NEXT:    s_waitcnt vmcnt(0)
; GFX1032-NEXT:    v_readfirstlane_b32 s4, v1
; GFX1032-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
; GFX1032-NEXT:  .LBB0_4: ; %Flow
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s8
; GFX1032-NEXT:    s_wqm_b32 s4, -1
; GFX1032-NEXT:    s_and_b32 s4, s4, s4
; GFX1032-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s4
; GFX1032-NEXT:    s_cbranch_vccnz .LBB0_6
; GFX1032-NEXT:  ; %bb.5: ; %if
; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT:  .LBB0_6: ; %UnifiedReturnBlock
; GFX1032-NEXT:    s_endpgm
;
; GFX1164-LABEL: add_i32_constant:
; GFX1164:       ; %bb.0: ; %entry
; GFX1164-NEXT:    s_mov_b64 s[10:11], exec
; GFX1164-NEXT:    ; implicit-def: $vgpr0
; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
; GFX1164-NEXT:    s_cbranch_execz .LBB0_4
; GFX1164-NEXT:  ; %bb.1:
; GFX1164-NEXT:    s_mov_b64 s[12:13], exec
; GFX1164-NEXT:    s_mov_b64 s[10:11], exec
; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s12, 0
; GFX1164-NEXT:    ; implicit-def: $vgpr1
; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s13, v0
; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT:    s_cbranch_execz .LBB0_3
; GFX1164-NEXT:  ; %bb.2:
; GFX1164-NEXT:    s_bcnt1_i32_b64 s12, s[12:13]
; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT:    s_mul_i32 s12, s12, 5
; GFX1164-NEXT:    v_mov_b32_e32 v1, s12
; GFX1164-NEXT:    buffer_atomic_add_u32 v1, off, s[4:7], 0 glc
; GFX1164-NEXT:  .LBB0_3:
; GFX1164-NEXT:    s_or_b64 exec, exec, s[10:11]
; GFX1164-NEXT:    s_waitcnt vmcnt(0)
; GFX1164-NEXT:    v_readfirstlane_b32 s4, v1
; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
; GFX1164-NEXT:  .LBB0_4: ; %Flow
; GFX1164-NEXT:    s_or_b64 exec, exec, s[8:9]
; GFX1164-NEXT:    s_wqm_b64 s[4:5], -1
; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT:    s_and_b64 s[4:5], s[4:5], s[4:5]
; GFX1164-NEXT:    s_and_not1_b64 vcc, exec, s[4:5]
; GFX1164-NEXT:    s_cbranch_vccnz .LBB0_6
; GFX1164-NEXT:  ; %bb.5: ; %if
; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX1164-NEXT:  .LBB0_6: ; %UnifiedReturnBlock
; GFX1164-NEXT:    s_endpgm
;
; GFX1132-LABEL: add_i32_constant:
; GFX1132:       ; %bb.0: ; %entry
; GFX1132-NEXT:    s_mov_b32 s9, exec_lo
; GFX1132-NEXT:    ; implicit-def: $vgpr0
; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT:    s_and_saveexec_b32 s8, s9
; GFX1132-NEXT:    s_cbranch_execz .LBB0_4
; GFX1132-NEXT:  ; %bb.1:
; GFX1132-NEXT:    s_mov_b32 s10, exec_lo
; GFX1132-NEXT:    s_mov_b32 s9, exec_lo
; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s10, 0
; GFX1132-NEXT:    ; implicit-def: $vgpr1
; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT:    s_cbranch_execz .LBB0_3
; GFX1132-NEXT:  ; %bb.2:
; GFX1132-NEXT:    s_bcnt1_i32_b32 s10, s10
; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT:    s_mul_i32 s10, s10, 5
; GFX1132-NEXT:    v_mov_b32_e32 v1, s10
; GFX1132-NEXT:    buffer_atomic_add_u32 v1, off, s[4:7], 0 glc
; GFX1132-NEXT:  .LBB0_3:
; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s9
; GFX1132-NEXT:    s_waitcnt vmcnt(0)
; GFX1132-NEXT:    v_readfirstlane_b32 s4, v1
; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
; GFX1132-NEXT:  .LBB0_4: ; %Flow
; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s8
; GFX1132-NEXT:    s_wqm_b32 s4, -1
; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT:    s_and_b32 s4, s4, s4
; GFX1132-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX1132-NEXT:    s_cbranch_vccnz .LBB0_6
; GFX1132-NEXT:  ; %bb.5: ; %if
; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX1132-NEXT:  .LBB0_6: ; %UnifiedReturnBlock
; GFX1132-NEXT:    s_endpgm
entry:
  %cond1 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
  %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32 5, ptr addrspace(8) %inout, i32 0, i32 0, i32 0)
  %cond2 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
  %cond = and i1 %cond1, %cond2
  br i1 %cond, label %if, label %else
if:
  %bitcast = bitcast i32 %old to float
  call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %bitcast, ptr addrspace(8) %out, i32 0, i32 0, i32 0)
  ret void
else:
  ret void
}
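; add_i32_varying: the addend is divergent, so the GFX8+ checks below show the
; pass building a wavefront scan with DPP adds (row_shr/row_bcast, or
; permlanex16 plus readlane/writelane on GFX10+), reading the whole-wave total
; from the last lane, issuing a single buffer_atomic_add of that total, and then
; each lane adding its exclusive prefix to readfirstlane of the returned old
; value. GFX7 has no DPP, so the atomic is left per-lane there.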
define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspace(8) inreg %inout, i32 %val) {
; GFX7-LABEL: add_i32_varying:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_wqm_b64 s[8:9], -1
; GFX7-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
; GFX7-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
; GFX7-NEXT:    s_cbranch_vccnz .LBB1_2
; GFX7-NEXT:  ; %bb.1: ; %if
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:  .LBB1_2: ; %else
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: add_i32_varying:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_mov_b64 s[8:9], exec
; GFX8-NEXT:    s_mov_b64 s[10:11], s[8:9]
; GFX8-NEXT:    ; implicit-def: $vgpr3
; GFX8-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
; GFX8-NEXT:    s_cbranch_execz .LBB1_4
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    s_or_saveexec_b64 s[10:11], -1
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    s_mov_b64 exec, s[10:11]
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8-NEXT:    s_or_saveexec_b64 s[10:11], -1
; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[10:11]
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX8-NEXT:    v_readlane_b32 s12, v2, 63
; GFX8-NEXT:    s_nop 0
; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_mov_b64 exec, s[10:11]
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; GFX8-NEXT:    ; implicit-def: $vgpr0
; GFX8-NEXT:    s_and_saveexec_b64 s[10:11], vcc
; GFX8-NEXT:    s_cbranch_execz .LBB1_3
; GFX8-NEXT:  ; %bb.2:
; GFX8-NEXT:    v_mov_b32_e32 v0, s12
; GFX8-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
; GFX8-NEXT:  .LBB1_3:
; GFX8-NEXT:    s_or_b64 exec, exec, s[10:11]
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, v1
; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s4, v0
; GFX8-NEXT:  .LBB1_4: ; %Flow
; GFX8-NEXT:    s_or_b64 exec, exec, s[8:9]
; GFX8-NEXT:    s_wqm_b64 s[4:5], -1
; GFX8-NEXT:    s_and_b64 s[4:5], s[4:5], s[4:5]
; GFX8-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
; GFX8-NEXT:    s_cbranch_vccnz .LBB1_6
; GFX8-NEXT:  ; %bb.5: ; %if
; GFX8-NEXT:    buffer_store_dword v3, off, s[0:3], 0
; GFX8-NEXT:  .LBB1_6: ; %UnifiedReturnBlock
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: add_i32_varying:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_mov_b64 s[8:9], exec
; GFX9-NEXT:    s_mov_b64 s[10:11], s[8:9]
; GFX9-NEXT:    ; implicit-def: $vgpr3
; GFX9-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
; GFX9-NEXT:    s_cbranch_execz .LBB1_4
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    s_or_saveexec_b64 s[10:11], -1
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_mov_b64 exec, s[10:11]
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX9-NEXT:    s_or_saveexec_b64 s[10:11], -1
; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[10:11]
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-NEXT:    v_readlane_b32 s12, v2, 63
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT:    s_mov_b64 exec, s[10:11]
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-NEXT:    ; implicit-def: $vgpr0
; GFX9-NEXT:    s_and_saveexec_b64 s[10:11], vcc
; GFX9-NEXT:    s_cbranch_execz .LBB1_3
; GFX9-NEXT:  ; %bb.2:
; GFX9-NEXT:    v_mov_b32_e32 v0, s12
; GFX9-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
; GFX9-NEXT:  .LBB1_3:
; GFX9-NEXT:    s_or_b64 exec, exec, s[10:11]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
; GFX9-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-NEXT:    v_add_u32_e32 v3, s4, v0
; GFX9-NEXT:  .LBB1_4: ; %Flow
; GFX9-NEXT:    s_or_b64 exec, exec, s[8:9]
; GFX9-NEXT:    s_wqm_b64 s[4:5], -1
; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], s[4:5]
; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
; GFX9-NEXT:    s_cbranch_vccnz .LBB1_6
; GFX9-NEXT:  ; %bb.5: ; %if
; GFX9-NEXT:    buffer_store_dword v3, off, s[0:3], 0
; GFX9-NEXT:  .LBB1_6: ; %UnifiedReturnBlock
; GFX9-NEXT:    s_endpgm
;
; GFX1064-LABEL: add_i32_varying:
; GFX1064:       ; %bb.0: ; %entry
; GFX1064-NEXT:    s_mov_b64 s[8:9], exec
; GFX1064-NEXT:    ; implicit-def: $vgpr4
; GFX1064-NEXT:    s_mov_b64 s[10:11], s[8:9]
; GFX1064-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
; GFX1064-NEXT:    s_cbranch_execz .LBB1_4
; GFX1064-NEXT:  ; %bb.1:
; GFX1064-NEXT:    s_or_saveexec_b64 s[10:11], -1
; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[10:11]
; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064-NEXT:    v_readlane_b32 s12, v1, 31
; GFX1064-NEXT:    v_mov_b32_e32 v2, s12
; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT:    v_readlane_b32 s12, v1, 15
; GFX1064-NEXT:    v_readlane_b32 s13, v1, 31
; GFX1064-NEXT:    v_writelane_b32 v3, s12, 16
; GFX1064-NEXT:    s_mov_b64 exec, s[10:11]
; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT:    s_or_saveexec_b64 s[10:11], -1
; GFX1064-NEXT:    v_readlane_b32 s12, v1, 63
; GFX1064-NEXT:    v_readlane_b32 s14, v1, 47
; GFX1064-NEXT:    v_writelane_b32 v3, s13, 32
; GFX1064-NEXT:    s_mov_b64 exec, s[10:11]
; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT:    s_or_saveexec_b64 s[10:11], -1
; GFX1064-NEXT:    v_writelane_b32 v3, s14, 48
; GFX1064-NEXT:    s_mov_b64 exec, s[10:11]
; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT:    ; implicit-def: $vgpr0
; GFX1064-NEXT:    s_and_saveexec_b64 s[10:11], vcc
; GFX1064-NEXT:    s_cbranch_execz .LBB1_3
; GFX1064-NEXT:  ; %bb.2:
; GFX1064-NEXT:    v_mov_b32_e32 v0, s12
; GFX1064-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
; GFX1064-NEXT:  .LBB1_3:
; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1064-NEXT:    s_or_b64 exec, exec, s[10:11]
; GFX1064-NEXT:    s_waitcnt vmcnt(0)
; GFX1064-NEXT:    v_readfirstlane_b32 s4, v0
; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
; GFX1064-NEXT:    v_add_nc_u32_e32 v4, s4, v0
; GFX1064-NEXT:  .LBB1_4: ; %Flow
; GFX1064-NEXT:    s_or_b64 exec, exec, s[8:9]
; GFX1064-NEXT:    s_wqm_b64 s[4:5], -1
; GFX1064-NEXT:    s_and_b64 s[4:5], s[4:5], s[4:5]
; GFX1064-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
; GFX1064-NEXT:    s_cbranch_vccnz .LBB1_6
; GFX1064-NEXT:  ; %bb.5: ; %if
; GFX1064-NEXT:    buffer_store_dword v4, off, s[0:3], 0
; GFX1064-NEXT:  .LBB1_6: ; %UnifiedReturnBlock
; GFX1064-NEXT:    s_endpgm
;
; GFX1032-LABEL: add_i32_varying:
; GFX1032:       ; %bb.0: ; %entry
; GFX1032-NEXT:    s_mov_b32 s8, exec_lo
; GFX1032-NEXT:    ; implicit-def: $vgpr4
; GFX1032-NEXT:    s_mov_b32 s9, s8
; GFX1032-NEXT:    s_and_saveexec_b32 s8, s9
; GFX1032-NEXT:    s_cbranch_execz .LBB1_4
; GFX1032-NEXT:  ; %bb.1:
; GFX1032-NEXT:    s_or_saveexec_b32 s9, -1
; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s9
; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT:    v_readlane_b32 s11, v1, 31
; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT:    v_readlane_b32 s10, v1, 15
; GFX1032-NEXT:    s_mov_b32 exec_lo, s9
; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT:    s_or_saveexec_b32 s9, -1
; GFX1032-NEXT:    v_writelane_b32 v3, s10, 16
; GFX1032-NEXT:    s_mov_b32 exec_lo, s9
; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT:    ; implicit-def: $vgpr0
; GFX1032-NEXT:    s_and_saveexec_b32 s9, vcc_lo
; GFX1032-NEXT:    s_cbranch_execz .LBB1_3
; GFX1032-NEXT:  ; %bb.2:
; GFX1032-NEXT:    v_mov_b32_e32 v0, s11
; GFX1032-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
; GFX1032-NEXT:  .LBB1_3:
; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s9
; GFX1032-NEXT:    s_waitcnt vmcnt(0)
; GFX1032-NEXT:    v_readfirstlane_b32 s4, v0
; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
; GFX1032-NEXT:    v_add_nc_u32_e32 v4, s4, v0
; GFX1032-NEXT:  .LBB1_4: ; %Flow
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s8
; GFX1032-NEXT:    s_wqm_b32 s4, -1
; GFX1032-NEXT:    s_and_b32 s4, s4, s4
; GFX1032-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s4
; GFX1032-NEXT:    s_cbranch_vccnz .LBB1_6
; GFX1032-NEXT:  ; %bb.5: ; %if
; GFX1032-NEXT:    buffer_store_dword v4, off, s[0:3], 0
; GFX1032-NEXT:  .LBB1_6: ; %UnifiedReturnBlock
; GFX1032-NEXT:    s_endpgm
;
; GFX1164-LABEL: add_i32_varying:
; GFX1164:       ; %bb.0: ; %entry
; GFX1164-NEXT:    s_mov_b64 s[8:9], exec
; GFX1164-NEXT:    ; implicit-def: $vgpr4
; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT:    s_mov_b64 s[10:11], s[8:9]
; GFX1164-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
; GFX1164-NEXT:    s_cbranch_execz .LBB1_4
; GFX1164-NEXT:  ; %bb.1:
; GFX1164-NEXT:    s_or_saveexec_b64 s[10:11], -1
; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[10:11]
; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164-NEXT:    v_readlane_b32 s12, v1, 31
; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT:    v_mov_b32_e32 v2, s12
; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT:    v_readlane_b32 s12, v1, 15
; GFX1164-NEXT:    v_readlane_b32 s13, v1, 31
; GFX1164-NEXT:    v_writelane_b32 v3, s12, 16
; GFX1164-NEXT:    s_mov_b64 exec, s[10:11]
; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT:    s_or_saveexec_b64 s[10:11], -1
; GFX1164-NEXT:    v_readlane_b32 s12, v1, 63
; GFX1164-NEXT:    v_readlane_b32 s14, v1, 47
; GFX1164-NEXT:    v_writelane_b32 v3, s13, 32
; GFX1164-NEXT:    s_mov_b64 exec, s[10:11]
; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT:    s_or_saveexec_b64 s[10:11], -1
; GFX1164-NEXT:    v_writelane_b32 v3, s14, 48
; GFX1164-NEXT:    s_mov_b64 exec, s[10:11]
; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1164-NEXT:    ; implicit-def: $vgpr0
; GFX1164-NEXT:    s_and_saveexec_b64 s[10:11], vcc
; GFX1164-NEXT:    s_cbranch_execz .LBB1_3
; GFX1164-NEXT:  ; %bb.2:
; GFX1164-NEXT:    v_mov_b32_e32 v0, s12
; GFX1164-NEXT:    buffer_atomic_add_u32 v0, off, s[4:7], 0 glc
; GFX1164-NEXT:  .LBB1_3:
; GFX1164-NEXT:    s_or_b64 exec, exec, s[10:11]
; GFX1164-NEXT:    s_waitcnt vmcnt(0)
; GFX1164-NEXT:    v_readfirstlane_b32 s4, v0
; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT:    v_add_nc_u32_e32 v4, s4, v0
; GFX1164-NEXT:  .LBB1_4: ; %Flow
; GFX1164-NEXT:    s_or_b64 exec, exec, s[8:9]
; GFX1164-NEXT:    s_wqm_b64 s[4:5], -1
; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT:    s_and_b64 s[4:5], s[4:5], s[4:5]
; GFX1164-NEXT:    s_and_not1_b64 vcc, exec, s[4:5]
; GFX1164-NEXT:    s_cbranch_vccnz .LBB1_6
; GFX1164-NEXT:  ; %bb.5: ; %if
; GFX1164-NEXT:    buffer_store_b32 v4, off, s[0:3], 0
; GFX1164-NEXT:  .LBB1_6: ; %UnifiedReturnBlock
; GFX1164-NEXT:    s_endpgm
;
; GFX1132-LABEL: add_i32_varying:
; GFX1132:       ; %bb.0: ; %entry
; GFX1132-NEXT:    s_mov_b32 s8, exec_lo
; GFX1132-NEXT:    ; implicit-def: $vgpr4
; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT:    s_mov_b32 s9, s8
; GFX1132-NEXT:    s_and_saveexec_b32 s8, s9
; GFX1132-NEXT:    s_cbranch_execz .LBB1_4
; GFX1132-NEXT:  ; %bb.1:
; GFX1132-NEXT:    s_or_saveexec_b32 s9, -1
; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s9
; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132-NEXT:    v_permlanex16_b32 v2, v1, -1, -1
; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132-NEXT:    v_readlane_b32 s11, v1, 31
; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT:    v_readlane_b32 s10, v1, 15
; GFX1132-NEXT:    s_mov_b32 exec_lo, s9
; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT:    s_or_saveexec_b32 s9, -1
; GFX1132-NEXT:    v_writelane_b32 v3, s10, 16
; GFX1132-NEXT:    s_mov_b32 exec_lo, s9
; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT:    ; implicit-def: $vgpr0
; GFX1132-NEXT:    s_and_saveexec_b32 s9, vcc_lo
; GFX1132-NEXT:    s_cbranch_execz .LBB1_3
; GFX1132-NEXT:  ; %bb.2:
; GFX1132-NEXT:    v_mov_b32_e32 v0, s11
; GFX1132-NEXT:    buffer_atomic_add_u32 v0, off, s[4:7], 0 glc
; GFX1132-NEXT:  .LBB1_3:
; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s9
; GFX1132-NEXT:    s_waitcnt vmcnt(0)
; GFX1132-NEXT:    v_readfirstlane_b32 s4, v0
; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT:    v_add_nc_u32_e32 v4, s4, v0
; GFX1132-NEXT:  .LBB1_4: ; %Flow
; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s8
; GFX1132-NEXT:    s_wqm_b32 s4, -1
; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT:    s_and_b32 s4, s4, s4
; GFX1132-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX1132-NEXT:    s_cbranch_vccnz .LBB1_6
; GFX1132-NEXT:  ; %bb.5: ; %if
; GFX1132-NEXT:    buffer_store_b32 v4, off, s[0:3], 0
; GFX1132-NEXT:  .LBB1_6: ; %UnifiedReturnBlock
; GFX1132-NEXT:    s_endpgm
entry:
  %cond1 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
  %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32 %val, ptr addrspace(8) %inout, i32 0, i32 0, i32 0)
  %cond2 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
  %cond = and i1 %cond1, %cond2
  br i1 %cond, label %if, label %else
if:
  %bitcast = bitcast i32 %old to float
  call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %bitcast, ptr addrspace(8) %out, i32 0, i32 0, i32 0)
  ret void
else:
  ret void
}
