xref: /llvm-project/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll (revision bfd9bc274586b0261e16e22ac50d50586a0152e2)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
3; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx940 < %s | FileCheck -check-prefix=GFX940 %s
4; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
5; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
6; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
7; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
8; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s
9; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s
10
11; --------------------------------------------------------------------
12; float
13; --------------------------------------------------------------------
14
; Returning `atomicrmw fsub` of a float through a flat pointer with
; syncscope("agent") and seq_cst ordering, no address offset; the value produced
; by the atomicrmw is the function result.
; On every checked target the expectations record a compare-and-swap loop
; (%atomicrmw.start / %atomicrmw.end): an initial flat load, then v_sub_f32 and
; flat_atomic_cmpswap repeated until the value read back equals the expected one.
15define float @flat_agent_atomic_fsub_ret_f32(ptr %ptr, float %val) #0 {
16; GFX12-LABEL: flat_agent_atomic_fsub_ret_f32:
17; GFX12:       ; %bb.0:
18; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
19; GFX12-NEXT:    s_wait_expcnt 0x0
20; GFX12-NEXT:    s_wait_samplecnt 0x0
21; GFX12-NEXT:    s_wait_bvhcnt 0x0
22; GFX12-NEXT:    s_wait_kmcnt 0x0
23; GFX12-NEXT:    flat_load_b32 v3, v[0:1]
24; GFX12-NEXT:    s_mov_b32 s0, 0
25; GFX12-NEXT:  .LBB0_1: ; %atomicrmw.start
26; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
27; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
28; GFX12-NEXT:    v_mov_b32_e32 v4, v3
29; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
30; GFX12-NEXT:    v_sub_f32_e32 v3, v4, v2
31; GFX12-NEXT:    s_wait_storecnt 0x0
32; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
33; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
34; GFX12-NEXT:    global_inv scope:SCOPE_DEV
35; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
36; GFX12-NEXT:    s_wait_alu 0xfffe
37; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
38; GFX12-NEXT:    s_wait_alu 0xfffe
39; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
40; GFX12-NEXT:    s_cbranch_execnz .LBB0_1
41; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
42; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
43; GFX12-NEXT:    v_mov_b32_e32 v0, v3
44; GFX12-NEXT:    s_wait_alu 0xfffe
45; GFX12-NEXT:    s_setpc_b64 s[30:31]
46;
47; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32:
48; GFX940:       ; %bb.0:
49; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
50; GFX940-NEXT:    flat_load_dword v3, v[0:1]
51; GFX940-NEXT:    s_mov_b64 s[0:1], 0
52; GFX940-NEXT:  .LBB0_1: ; %atomicrmw.start
53; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
54; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
55; GFX940-NEXT:    v_mov_b32_e32 v5, v3
56; GFX940-NEXT:    v_sub_f32_e32 v4, v5, v2
57; GFX940-NEXT:    buffer_wbl2 sc1
58; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
59; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
60; GFX940-NEXT:    buffer_inv sc1
61; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
62; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
63; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
64; GFX940-NEXT:    s_cbranch_execnz .LBB0_1
65; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
66; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
67; GFX940-NEXT:    v_mov_b32_e32 v0, v3
68; GFX940-NEXT:    s_setpc_b64 s[30:31]
69;
70; GFX11-LABEL: flat_agent_atomic_fsub_ret_f32:
71; GFX11:       ; %bb.0:
72; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
73; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
74; GFX11-NEXT:    s_mov_b32 s0, 0
75; GFX11-NEXT:  .LBB0_1: ; %atomicrmw.start
76; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
77; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
78; GFX11-NEXT:    v_mov_b32_e32 v4, v3
79; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
80; GFX11-NEXT:    v_sub_f32_e32 v3, v4, v2
81; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
82; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
83; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
84; GFX11-NEXT:    buffer_gl1_inv
85; GFX11-NEXT:    buffer_gl0_inv
86; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
87; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
88; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
89; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
90; GFX11-NEXT:    s_cbranch_execnz .LBB0_1
91; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
92; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
93; GFX11-NEXT:    v_mov_b32_e32 v0, v3
94; GFX11-NEXT:    s_setpc_b64 s[30:31]
95;
96; GFX10-LABEL: flat_agent_atomic_fsub_ret_f32:
97; GFX10:       ; %bb.0:
98; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
99; GFX10-NEXT:    flat_load_dword v3, v[0:1]
100; GFX10-NEXT:    s_mov_b32 s4, 0
101; GFX10-NEXT:  .LBB0_1: ; %atomicrmw.start
102; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
103; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
104; GFX10-NEXT:    v_mov_b32_e32 v4, v3
105; GFX10-NEXT:    v_sub_f32_e32 v3, v4, v2
106; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
107; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
108; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
109; GFX10-NEXT:    buffer_gl1_inv
110; GFX10-NEXT:    buffer_gl0_inv
111; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
112; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
113; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
114; GFX10-NEXT:    s_cbranch_execnz .LBB0_1
115; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
116; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
117; GFX10-NEXT:    v_mov_b32_e32 v0, v3
118; GFX10-NEXT:    s_setpc_b64 s[30:31]
119;
120; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f32:
121; GFX90A:       ; %bb.0:
122; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
123; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
124; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
125; GFX90A-NEXT:  .LBB0_1: ; %atomicrmw.start
126; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
127; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
128; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
129; GFX90A-NEXT:    v_sub_f32_e32 v4, v5, v2
130; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
131; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
132; GFX90A-NEXT:    buffer_wbinvl1
133; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
134; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
135; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
136; GFX90A-NEXT:    s_cbranch_execnz .LBB0_1
137; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
138; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
139; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
140; GFX90A-NEXT:    s_setpc_b64 s[30:31]
141;
142; GFX908-LABEL: flat_agent_atomic_fsub_ret_f32:
143; GFX908:       ; %bb.0:
144; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
145; GFX908-NEXT:    flat_load_dword v3, v[0:1]
146; GFX908-NEXT:    s_mov_b64 s[4:5], 0
147; GFX908-NEXT:  .LBB0_1: ; %atomicrmw.start
148; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
149; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
150; GFX908-NEXT:    v_mov_b32_e32 v4, v3
151; GFX908-NEXT:    v_sub_f32_e32 v3, v4, v2
152; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
153; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
154; GFX908-NEXT:    buffer_wbinvl1
155; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
156; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
157; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
158; GFX908-NEXT:    s_cbranch_execnz .LBB0_1
159; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
160; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
161; GFX908-NEXT:    v_mov_b32_e32 v0, v3
162; GFX908-NEXT:    s_setpc_b64 s[30:31]
163;
164; GFX8-LABEL: flat_agent_atomic_fsub_ret_f32:
165; GFX8:       ; %bb.0:
166; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
167; GFX8-NEXT:    flat_load_dword v3, v[0:1]
168; GFX8-NEXT:    s_mov_b64 s[4:5], 0
169; GFX8-NEXT:  .LBB0_1: ; %atomicrmw.start
170; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
171; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
172; GFX8-NEXT:    v_mov_b32_e32 v4, v3
173; GFX8-NEXT:    v_sub_f32_e32 v3, v4, v2
174; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
175; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
176; GFX8-NEXT:    buffer_wbinvl1
177; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
178; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
179; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
180; GFX8-NEXT:    s_cbranch_execnz .LBB0_1
181; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
182; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
183; GFX8-NEXT:    v_mov_b32_e32 v0, v3
184; GFX8-NEXT:    s_setpc_b64 s[30:31]
185;
186; GFX7-LABEL: flat_agent_atomic_fsub_ret_f32:
187; GFX7:       ; %bb.0:
188; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
189; GFX7-NEXT:    flat_load_dword v3, v[0:1]
190; GFX7-NEXT:    s_mov_b64 s[4:5], 0
191; GFX7-NEXT:  .LBB0_1: ; %atomicrmw.start
192; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
193; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
194; GFX7-NEXT:    v_mov_b32_e32 v4, v3
195; GFX7-NEXT:    v_sub_f32_e32 v3, v4, v2
196; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
197; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
198; GFX7-NEXT:    buffer_wbinvl1
199; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
200; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
201; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
202; GFX7-NEXT:    s_cbranch_execnz .LBB0_1
203; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
204; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
205; GFX7-NEXT:    v_mov_b32_e32 v0, v3
206; GFX7-NEXT:    s_setpc_b64 s[30:31]
207  %result = atomicrmw fsub ptr %ptr, float %val syncscope("agent") seq_cst
208  ret float %result
209}
210
; Returning flat `atomicrmw fsub` of a float (syncscope("agent"), seq_cst)
; through a GEP of 511 floats, which the checks show as a +2044 byte (0x7fc)
; offset.
; The expectations record GFX12, GFX940, GFX11, GFX90A and GFX908 folding the
; offset into the flat load and cmpswap (offset:2044), while GFX10, GFX8 and GFX7
; add 0x7fc to the pointer with an explicit add/add-with-carry pair before the
; compare-and-swap loop.
211define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val) #0 {
212; GFX12-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos:
213; GFX12:       ; %bb.0:
214; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
215; GFX12-NEXT:    s_wait_expcnt 0x0
216; GFX12-NEXT:    s_wait_samplecnt 0x0
217; GFX12-NEXT:    s_wait_bvhcnt 0x0
218; GFX12-NEXT:    s_wait_kmcnt 0x0
219; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
220; GFX12-NEXT:    s_mov_b32 s0, 0
221; GFX12-NEXT:  .LBB1_1: ; %atomicrmw.start
222; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
223; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
224; GFX12-NEXT:    v_mov_b32_e32 v4, v3
225; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
226; GFX12-NEXT:    v_sub_f32_e32 v3, v4, v2
227; GFX12-NEXT:    s_wait_storecnt 0x0
228; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
229; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
230; GFX12-NEXT:    global_inv scope:SCOPE_DEV
231; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
232; GFX12-NEXT:    s_wait_alu 0xfffe
233; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
234; GFX12-NEXT:    s_wait_alu 0xfffe
235; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
236; GFX12-NEXT:    s_cbranch_execnz .LBB1_1
237; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
238; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
239; GFX12-NEXT:    v_mov_b32_e32 v0, v3
240; GFX12-NEXT:    s_wait_alu 0xfffe
241; GFX12-NEXT:    s_setpc_b64 s[30:31]
242;
243; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos:
244; GFX940:       ; %bb.0:
245; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
246; GFX940-NEXT:    flat_load_dword v3, v[0:1] offset:2044
247; GFX940-NEXT:    s_mov_b64 s[0:1], 0
248; GFX940-NEXT:  .LBB1_1: ; %atomicrmw.start
249; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
250; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
251; GFX940-NEXT:    v_mov_b32_e32 v5, v3
252; GFX940-NEXT:    v_sub_f32_e32 v4, v5, v2
253; GFX940-NEXT:    buffer_wbl2 sc1
254; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0
255; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
256; GFX940-NEXT:    buffer_inv sc1
257; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
258; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
259; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
260; GFX940-NEXT:    s_cbranch_execnz .LBB1_1
261; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
262; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
263; GFX940-NEXT:    v_mov_b32_e32 v0, v3
264; GFX940-NEXT:    s_setpc_b64 s[30:31]
265;
266; GFX11-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos:
267; GFX11:       ; %bb.0:
268; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
269; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
270; GFX11-NEXT:    s_mov_b32 s0, 0
271; GFX11-NEXT:  .LBB1_1: ; %atomicrmw.start
272; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
273; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
274; GFX11-NEXT:    v_mov_b32_e32 v4, v3
275; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
276; GFX11-NEXT:    v_sub_f32_e32 v3, v4, v2
277; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
278; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
279; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
280; GFX11-NEXT:    buffer_gl1_inv
281; GFX11-NEXT:    buffer_gl0_inv
282; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
283; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
284; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
285; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
286; GFX11-NEXT:    s_cbranch_execnz .LBB1_1
287; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
288; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
289; GFX11-NEXT:    v_mov_b32_e32 v0, v3
290; GFX11-NEXT:    s_setpc_b64 s[30:31]
291;
292; GFX10-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos:
293; GFX10:       ; %bb.0:
294; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
295; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fc, v0
296; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
297; GFX10-NEXT:    s_mov_b32 s4, 0
298; GFX10-NEXT:    flat_load_dword v0, v[3:4]
299; GFX10-NEXT:  .LBB1_1: ; %atomicrmw.start
300; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
301; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
302; GFX10-NEXT:    v_mov_b32_e32 v1, v0
303; GFX10-NEXT:    v_sub_f32_e32 v0, v1, v2
304; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
305; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
306; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
307; GFX10-NEXT:    buffer_gl1_inv
308; GFX10-NEXT:    buffer_gl0_inv
309; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
310; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
311; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
312; GFX10-NEXT:    s_cbranch_execnz .LBB1_1
313; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
314; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
315; GFX10-NEXT:    s_setpc_b64 s[30:31]
316;
317; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos:
318; GFX90A:       ; %bb.0:
319; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
320; GFX90A-NEXT:    flat_load_dword v3, v[0:1] offset:2044
321; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
322; GFX90A-NEXT:  .LBB1_1: ; %atomicrmw.start
323; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
324; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
325; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
326; GFX90A-NEXT:    v_sub_f32_e32 v4, v5, v2
327; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
328; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
329; GFX90A-NEXT:    buffer_wbinvl1
330; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
331; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
332; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
333; GFX90A-NEXT:    s_cbranch_execnz .LBB1_1
334; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
335; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
336; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
337; GFX90A-NEXT:    s_setpc_b64 s[30:31]
338;
339; GFX908-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos:
340; GFX908:       ; %bb.0:
341; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
342; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2044
343; GFX908-NEXT:    s_mov_b64 s[4:5], 0
344; GFX908-NEXT:  .LBB1_1: ; %atomicrmw.start
345; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
346; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
347; GFX908-NEXT:    v_mov_b32_e32 v4, v3
348; GFX908-NEXT:    v_sub_f32_e32 v3, v4, v2
349; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
350; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
351; GFX908-NEXT:    buffer_wbinvl1
352; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
353; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
354; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
355; GFX908-NEXT:    s_cbranch_execnz .LBB1_1
356; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
357; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
358; GFX908-NEXT:    v_mov_b32_e32 v0, v3
359; GFX908-NEXT:    s_setpc_b64 s[30:31]
360;
361; GFX8-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos:
362; GFX8:       ; %bb.0:
363; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
364; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
365; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
366; GFX8-NEXT:    flat_load_dword v0, v[3:4]
367; GFX8-NEXT:    s_mov_b64 s[4:5], 0
368; GFX8-NEXT:  .LBB1_1: ; %atomicrmw.start
369; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
370; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
371; GFX8-NEXT:    v_mov_b32_e32 v1, v0
372; GFX8-NEXT:    v_sub_f32_e32 v0, v1, v2
373; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
374; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
375; GFX8-NEXT:    buffer_wbinvl1
376; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
377; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
378; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
379; GFX8-NEXT:    s_cbranch_execnz .LBB1_1
380; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
381; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
382; GFX8-NEXT:    s_setpc_b64 s[30:31]
383;
384; GFX7-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos:
385; GFX7:       ; %bb.0:
386; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
387; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 0x7fc, v0
388; GFX7-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
389; GFX7-NEXT:    flat_load_dword v0, v[3:4]
390; GFX7-NEXT:    s_mov_b64 s[4:5], 0
391; GFX7-NEXT:  .LBB1_1: ; %atomicrmw.start
392; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
393; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
394; GFX7-NEXT:    v_mov_b32_e32 v1, v0
395; GFX7-NEXT:    v_sub_f32_e32 v0, v1, v2
396; GFX7-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
397; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
398; GFX7-NEXT:    buffer_wbinvl1
399; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
400; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
401; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
402; GFX7-NEXT:    s_cbranch_execnz .LBB1_1
403; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
404; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
405; GFX7-NEXT:    s_setpc_b64 s[30:31]
406  %gep = getelementptr float, ptr %ptr, i64 511
407  %result = atomicrmw fsub ptr %gep, float %val syncscope("agent") seq_cst
408  ret float %result
409}
410
; Returning flat `atomicrmw fsub` of a float (syncscope("agent"), seq_cst)
; through a GEP of -512 floats, which the checks show as a -2048 byte offset.
; Only the GFX12 expectations fold the negative offset into the flat load and
; cmpswap (offset:-2048); every other checked target materializes the address
; with a 64-bit add of -2048 (0xfffff800 in the low half, -1 in the high half).
; NOTE(review): the GFX940, GFX11, GFX90A and GFX908 expectations record the
; address being computed twice, once for the initial load and once for the
; cmpswap. That is what the autogenerated llc output contained, not a
; hand-written requirement.
411define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val) #0 {
412; GFX12-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg:
413; GFX12:       ; %bb.0:
414; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
415; GFX12-NEXT:    s_wait_expcnt 0x0
416; GFX12-NEXT:    s_wait_samplecnt 0x0
417; GFX12-NEXT:    s_wait_bvhcnt 0x0
418; GFX12-NEXT:    s_wait_kmcnt 0x0
419; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:-2048
420; GFX12-NEXT:    s_mov_b32 s0, 0
421; GFX12-NEXT:  .LBB2_1: ; %atomicrmw.start
422; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
423; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
424; GFX12-NEXT:    v_mov_b32_e32 v4, v3
425; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
426; GFX12-NEXT:    v_sub_f32_e32 v3, v4, v2
427; GFX12-NEXT:    s_wait_storecnt 0x0
428; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
429; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
430; GFX12-NEXT:    global_inv scope:SCOPE_DEV
431; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
432; GFX12-NEXT:    s_wait_alu 0xfffe
433; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
434; GFX12-NEXT:    s_wait_alu 0xfffe
435; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
436; GFX12-NEXT:    s_cbranch_execnz .LBB2_1
437; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
438; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
439; GFX12-NEXT:    v_mov_b32_e32 v0, v3
440; GFX12-NEXT:    s_wait_alu 0xfffe
441; GFX12-NEXT:    s_setpc_b64 s[30:31]
442;
443; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg:
444; GFX940:       ; %bb.0:
445; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
446; GFX940-NEXT:    v_mov_b32_e32 v4, v0
447; GFX940-NEXT:    v_mov_b32_e32 v5, v1
448; GFX940-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v4
449; GFX940-NEXT:    s_movk_i32 s0, 0xf800
450; GFX940-NEXT:    s_nop 0
451; GFX940-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v5, vcc
452; GFX940-NEXT:    flat_load_dword v0, v[0:1]
453; GFX940-NEXT:    s_mov_b32 s1, -1
454; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1]
455; GFX940-NEXT:    s_mov_b64 s[0:1], 0
456; GFX940-NEXT:  .LBB2_1: ; %atomicrmw.start
457; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
458; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
459; GFX940-NEXT:    v_mov_b32_e32 v1, v0
460; GFX940-NEXT:    v_sub_f32_e32 v0, v1, v2
461; GFX940-NEXT:    buffer_wbl2 sc1
462; GFX940-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0
463; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
464; GFX940-NEXT:    buffer_inv sc1
465; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
466; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
467; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
468; GFX940-NEXT:    s_cbranch_execnz .LBB2_1
469; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
470; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
471; GFX940-NEXT:    s_setpc_b64 s[30:31]
472;
473; GFX11-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg:
474; GFX11:       ; %bb.0:
475; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
476; GFX11-NEXT:    v_mov_b32_e32 v3, v0
477; GFX11-NEXT:    s_mov_b32 s0, 0
478; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
479; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
480; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
481; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
482; GFX11-NEXT:    flat_load_b32 v0, v[4:5]
483; GFX11-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
484; GFX11-NEXT:  .LBB2_1: ; %atomicrmw.start
485; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
486; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
487; GFX11-NEXT:    v_mov_b32_e32 v1, v0
488; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
489; GFX11-NEXT:    v_sub_f32_e32 v0, v1, v2
490; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
491; GFX11-NEXT:    flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc
492; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
493; GFX11-NEXT:    buffer_gl1_inv
494; GFX11-NEXT:    buffer_gl0_inv
495; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
496; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
497; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
498; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
499; GFX11-NEXT:    s_cbranch_execnz .LBB2_1
500; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
501; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
502; GFX11-NEXT:    s_setpc_b64 s[30:31]
503;
504; GFX10-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg:
505; GFX10:       ; %bb.0:
506; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
507; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
508; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
509; GFX10-NEXT:    s_mov_b32 s4, 0
510; GFX10-NEXT:    flat_load_dword v0, v[3:4]
511; GFX10-NEXT:  .LBB2_1: ; %atomicrmw.start
512; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
513; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
514; GFX10-NEXT:    v_mov_b32_e32 v1, v0
515; GFX10-NEXT:    v_sub_f32_e32 v0, v1, v2
516; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
517; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
518; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
519; GFX10-NEXT:    buffer_gl1_inv
520; GFX10-NEXT:    buffer_gl0_inv
521; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
522; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
523; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
524; GFX10-NEXT:    s_cbranch_execnz .LBB2_1
525; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
526; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
527; GFX10-NEXT:    s_setpc_b64 s[30:31]
528;
529; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg:
530; GFX90A:       ; %bb.0:
531; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
532; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
533; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
534; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
535; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
536; GFX90A-NEXT:    flat_load_dword v0, v[0:1]
537; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
538; GFX90A-NEXT:  .LBB2_1: ; %atomicrmw.start
539; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
540; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
541; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
542; GFX90A-NEXT:    v_sub_f32_e32 v0, v1, v2
543; GFX90A-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
544; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
545; GFX90A-NEXT:    buffer_wbinvl1
546; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
547; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
548; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
549; GFX90A-NEXT:    s_cbranch_execnz .LBB2_1
550; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
551; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
552; GFX90A-NEXT:    s_setpc_b64 s[30:31]
553;
554; GFX908-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg:
555; GFX908:       ; %bb.0:
556; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
557; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
558; GFX908-NEXT:    v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
559; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
560; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
561; GFX908-NEXT:    flat_load_dword v0, v[0:1]
562; GFX908-NEXT:    s_mov_b64 s[4:5], 0
563; GFX908-NEXT:  .LBB2_1: ; %atomicrmw.start
564; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
565; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
566; GFX908-NEXT:    v_mov_b32_e32 v1, v0
567; GFX908-NEXT:    v_sub_f32_e32 v0, v1, v2
568; GFX908-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
569; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
570; GFX908-NEXT:    buffer_wbinvl1
571; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
572; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
573; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
574; GFX908-NEXT:    s_cbranch_execnz .LBB2_1
575; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
576; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
577; GFX908-NEXT:    s_setpc_b64 s[30:31]
578;
579; GFX8-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg:
580; GFX8:       ; %bb.0:
581; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
582; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
583; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, -1, v1, vcc
584; GFX8-NEXT:    flat_load_dword v0, v[3:4]
585; GFX8-NEXT:    s_mov_b64 s[4:5], 0
586; GFX8-NEXT:  .LBB2_1: ; %atomicrmw.start
587; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
588; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
589; GFX8-NEXT:    v_mov_b32_e32 v1, v0
590; GFX8-NEXT:    v_sub_f32_e32 v0, v1, v2
591; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
592; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
593; GFX8-NEXT:    buffer_wbinvl1
594; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
595; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
596; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
597; GFX8-NEXT:    s_cbranch_execnz .LBB2_1
598; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
599; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
600; GFX8-NEXT:    s_setpc_b64 s[30:31]
601;
602; GFX7-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg:
603; GFX7:       ; %bb.0:
604; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
605; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 0xfffff800, v0
606; GFX7-NEXT:    v_addc_u32_e32 v4, vcc, -1, v1, vcc
607; GFX7-NEXT:    flat_load_dword v0, v[3:4]
608; GFX7-NEXT:    s_mov_b64 s[4:5], 0
609; GFX7-NEXT:  .LBB2_1: ; %atomicrmw.start
610; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
611; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
612; GFX7-NEXT:    v_mov_b32_e32 v1, v0
613; GFX7-NEXT:    v_sub_f32_e32 v0, v1, v2
614; GFX7-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
615; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
616; GFX7-NEXT:    buffer_wbinvl1
617; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
618; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
619; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
620; GFX7-NEXT:    s_cbranch_execnz .LBB2_1
621; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
622; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
623; GFX7-NEXT:    s_setpc_b64 s[30:31]
624  %gep = getelementptr float, ptr %ptr, i64 -512
625  %result = atomicrmw fsub ptr %gep, float %val syncscope("agent") seq_cst
626  ret float %result
627}
628
629define void @flat_agent_atomic_fsub_noret_f32(ptr %ptr, float %val) #0 {
630; GFX12-LABEL: flat_agent_atomic_fsub_noret_f32:
631; GFX12:       ; %bb.0:
632; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
633; GFX12-NEXT:    s_wait_expcnt 0x0
634; GFX12-NEXT:    s_wait_samplecnt 0x0
635; GFX12-NEXT:    s_wait_bvhcnt 0x0
636; GFX12-NEXT:    s_wait_kmcnt 0x0
637; GFX12-NEXT:    flat_load_b32 v4, v[0:1]
638; GFX12-NEXT:    s_mov_b32 s0, 0
639; GFX12-NEXT:  .LBB3_1: ; %atomicrmw.start
640; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
641; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
642; GFX12-NEXT:    v_sub_f32_e32 v3, v4, v2
643; GFX12-NEXT:    s_wait_storecnt 0x0
644; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
645; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
646; GFX12-NEXT:    global_inv scope:SCOPE_DEV
647; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
648; GFX12-NEXT:    v_mov_b32_e32 v4, v3
649; GFX12-NEXT:    s_wait_alu 0xfffe
650; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
651; GFX12-NEXT:    s_wait_alu 0xfffe
652; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
653; GFX12-NEXT:    s_cbranch_execnz .LBB3_1
654; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
655; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
656; GFX12-NEXT:    s_wait_alu 0xfffe
657; GFX12-NEXT:    s_setpc_b64 s[30:31]
658;
659; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32:
660; GFX940:       ; %bb.0:
661; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
662; GFX940-NEXT:    flat_load_dword v5, v[0:1]
663; GFX940-NEXT:    s_mov_b64 s[0:1], 0
664; GFX940-NEXT:  .LBB3_1: ; %atomicrmw.start
665; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
666; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
667; GFX940-NEXT:    v_sub_f32_e32 v4, v5, v2
668; GFX940-NEXT:    buffer_wbl2 sc1
669; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
670; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
671; GFX940-NEXT:    buffer_inv sc1
672; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
673; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
674; GFX940-NEXT:    v_mov_b32_e32 v5, v3
675; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
676; GFX940-NEXT:    s_cbranch_execnz .LBB3_1
677; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
678; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
679; GFX940-NEXT:    s_setpc_b64 s[30:31]
680;
681; GFX11-LABEL: flat_agent_atomic_fsub_noret_f32:
682; GFX11:       ; %bb.0:
683; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
684; GFX11-NEXT:    flat_load_b32 v4, v[0:1]
685; GFX11-NEXT:    s_mov_b32 s0, 0
686; GFX11-NEXT:  .LBB3_1: ; %atomicrmw.start
687; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
688; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
689; GFX11-NEXT:    v_sub_f32_e32 v3, v4, v2
690; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
691; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
692; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
693; GFX11-NEXT:    buffer_gl1_inv
694; GFX11-NEXT:    buffer_gl0_inv
695; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
696; GFX11-NEXT:    v_mov_b32_e32 v4, v3
697; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
698; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
699; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
700; GFX11-NEXT:    s_cbranch_execnz .LBB3_1
701; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
702; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
703; GFX11-NEXT:    s_setpc_b64 s[30:31]
704;
705; GFX10-LABEL: flat_agent_atomic_fsub_noret_f32:
706; GFX10:       ; %bb.0:
707; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
708; GFX10-NEXT:    flat_load_dword v4, v[0:1]
709; GFX10-NEXT:    s_mov_b32 s4, 0
710; GFX10-NEXT:  .LBB3_1: ; %atomicrmw.start
711; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
712; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
713; GFX10-NEXT:    v_sub_f32_e32 v3, v4, v2
714; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
715; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
716; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
717; GFX10-NEXT:    buffer_gl1_inv
718; GFX10-NEXT:    buffer_gl0_inv
719; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
720; GFX10-NEXT:    v_mov_b32_e32 v4, v3
721; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
722; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
723; GFX10-NEXT:    s_cbranch_execnz .LBB3_1
724; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
725; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
726; GFX10-NEXT:    s_setpc_b64 s[30:31]
727;
728; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f32:
729; GFX90A:       ; %bb.0:
730; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
731; GFX90A-NEXT:    flat_load_dword v5, v[0:1]
732; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
733; GFX90A-NEXT:  .LBB3_1: ; %atomicrmw.start
734; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
735; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
736; GFX90A-NEXT:    v_sub_f32_e32 v4, v5, v2
737; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
738; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
739; GFX90A-NEXT:    buffer_wbinvl1
740; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
741; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
742; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
743; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
744; GFX90A-NEXT:    s_cbranch_execnz .LBB3_1
745; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
746; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
747; GFX90A-NEXT:    s_setpc_b64 s[30:31]
748;
749; GFX908-LABEL: flat_agent_atomic_fsub_noret_f32:
750; GFX908:       ; %bb.0:
751; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
752; GFX908-NEXT:    flat_load_dword v4, v[0:1]
753; GFX908-NEXT:    s_mov_b64 s[4:5], 0
754; GFX908-NEXT:  .LBB3_1: ; %atomicrmw.start
755; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
756; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
757; GFX908-NEXT:    v_sub_f32_e32 v3, v4, v2
758; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
759; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
760; GFX908-NEXT:    buffer_wbinvl1
761; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
762; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
763; GFX908-NEXT:    v_mov_b32_e32 v4, v3
764; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
765; GFX908-NEXT:    s_cbranch_execnz .LBB3_1
766; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
767; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
768; GFX908-NEXT:    s_setpc_b64 s[30:31]
769;
770; GFX8-LABEL: flat_agent_atomic_fsub_noret_f32:
771; GFX8:       ; %bb.0:
772; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
773; GFX8-NEXT:    flat_load_dword v4, v[0:1]
774; GFX8-NEXT:    s_mov_b64 s[4:5], 0
775; GFX8-NEXT:  .LBB3_1: ; %atomicrmw.start
776; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
777; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
778; GFX8-NEXT:    v_sub_f32_e32 v3, v4, v2
779; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
780; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
781; GFX8-NEXT:    buffer_wbinvl1
782; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
783; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
784; GFX8-NEXT:    v_mov_b32_e32 v4, v3
785; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
786; GFX8-NEXT:    s_cbranch_execnz .LBB3_1
787; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
788; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
789; GFX8-NEXT:    s_setpc_b64 s[30:31]
790;
791; GFX7-LABEL: flat_agent_atomic_fsub_noret_f32:
792; GFX7:       ; %bb.0:
793; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
794; GFX7-NEXT:    flat_load_dword v4, v[0:1]
795; GFX7-NEXT:    s_mov_b64 s[4:5], 0
796; GFX7-NEXT:  .LBB3_1: ; %atomicrmw.start
797; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
798; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
799; GFX7-NEXT:    v_sub_f32_e32 v3, v4, v2
800; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
801; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
802; GFX7-NEXT:    buffer_wbinvl1
803; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
804; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
805; GFX7-NEXT:    v_mov_b32_e32 v4, v3
806; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
807; GFX7-NEXT:    s_cbranch_execnz .LBB3_1
808; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
809; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
810; GFX7-NEXT:    s_setpc_b64 s[30:31]
811  %unused = atomicrmw fsub ptr %ptr, float %val syncscope("agent") seq_cst
812  ret void
813}
814
; No-return, agent-scope, seq_cst atomicrmw fsub through a flat pointer at a
; positive offset: 511 floats past %ptr, i.e. +2044 bytes (0x7fc).
; Every expected sequence below is a flat_atomic_cmpswap retry loop
; (%atomicrmw.start). The gfx1200, gfx940, gfx1100, gfx90a and gfx908
; expectations carry the offset in the flat instructions' offset field; the
; gfx1010, tonga and hawaii expectations add 0x7fc to the 64-bit address
; before the loop.
; The check lines are autogenerated (see the first line of the file);
; regenerate them with the update script rather than editing by hand.
815define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %val) #0 {
816; GFX12-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos:
817; GFX12:       ; %bb.0:
818; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
819; GFX12-NEXT:    s_wait_expcnt 0x0
820; GFX12-NEXT:    s_wait_samplecnt 0x0
821; GFX12-NEXT:    s_wait_bvhcnt 0x0
822; GFX12-NEXT:    s_wait_kmcnt 0x0
823; GFX12-NEXT:    flat_load_b32 v4, v[0:1] offset:2044
824; GFX12-NEXT:    s_mov_b32 s0, 0
825; GFX12-NEXT:  .LBB4_1: ; %atomicrmw.start
826; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
827; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
828; GFX12-NEXT:    v_sub_f32_e32 v3, v4, v2
829; GFX12-NEXT:    s_wait_storecnt 0x0
830; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
831; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
832; GFX12-NEXT:    global_inv scope:SCOPE_DEV
833; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
834; GFX12-NEXT:    v_mov_b32_e32 v4, v3
835; GFX12-NEXT:    s_wait_alu 0xfffe
836; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
837; GFX12-NEXT:    s_wait_alu 0xfffe
838; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
839; GFX12-NEXT:    s_cbranch_execnz .LBB4_1
840; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
841; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
842; GFX12-NEXT:    s_wait_alu 0xfffe
843; GFX12-NEXT:    s_setpc_b64 s[30:31]
844;
845; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos:
846; GFX940:       ; %bb.0:
847; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
848; GFX940-NEXT:    flat_load_dword v5, v[0:1] offset:2044
849; GFX940-NEXT:    s_mov_b64 s[0:1], 0
850; GFX940-NEXT:  .LBB4_1: ; %atomicrmw.start
851; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
852; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
853; GFX940-NEXT:    v_sub_f32_e32 v4, v5, v2
854; GFX940-NEXT:    buffer_wbl2 sc1
855; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0
856; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
857; GFX940-NEXT:    buffer_inv sc1
858; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
859; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
860; GFX940-NEXT:    v_mov_b32_e32 v5, v3
861; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
862; GFX940-NEXT:    s_cbranch_execnz .LBB4_1
863; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
864; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
865; GFX940-NEXT:    s_setpc_b64 s[30:31]
866;
867; GFX11-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos:
868; GFX11:       ; %bb.0:
869; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
870; GFX11-NEXT:    flat_load_b32 v4, v[0:1] offset:2044
871; GFX11-NEXT:    s_mov_b32 s0, 0
872; GFX11-NEXT:  .LBB4_1: ; %atomicrmw.start
873; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
874; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
875; GFX11-NEXT:    v_sub_f32_e32 v3, v4, v2
876; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
877; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
878; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
879; GFX11-NEXT:    buffer_gl1_inv
880; GFX11-NEXT:    buffer_gl0_inv
881; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
882; GFX11-NEXT:    v_mov_b32_e32 v4, v3
883; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
884; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
885; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
886; GFX11-NEXT:    s_cbranch_execnz .LBB4_1
887; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
888; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
889; GFX11-NEXT:    s_setpc_b64 s[30:31]
890;
891; GFX10-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos:
892; GFX10:       ; %bb.0:
893; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
894; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
895; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
896; GFX10-NEXT:    s_mov_b32 s4, 0
897; GFX10-NEXT:    flat_load_dword v4, v[0:1]
898; GFX10-NEXT:  .LBB4_1: ; %atomicrmw.start
899; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
900; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
901; GFX10-NEXT:    v_sub_f32_e32 v3, v4, v2
902; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
903; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
904; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
905; GFX10-NEXT:    buffer_gl1_inv
906; GFX10-NEXT:    buffer_gl0_inv
907; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
908; GFX10-NEXT:    v_mov_b32_e32 v4, v3
909; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
910; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
911; GFX10-NEXT:    s_cbranch_execnz .LBB4_1
912; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
913; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
914; GFX10-NEXT:    s_setpc_b64 s[30:31]
915;
916; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos:
917; GFX90A:       ; %bb.0:
918; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
919; GFX90A-NEXT:    flat_load_dword v5, v[0:1] offset:2044
920; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
921; GFX90A-NEXT:  .LBB4_1: ; %atomicrmw.start
922; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
923; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
924; GFX90A-NEXT:    v_sub_f32_e32 v4, v5, v2
925; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
926; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
927; GFX90A-NEXT:    buffer_wbinvl1
928; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
929; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
930; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
931; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
932; GFX90A-NEXT:    s_cbranch_execnz .LBB4_1
933; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
934; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
935; GFX90A-NEXT:    s_setpc_b64 s[30:31]
936;
937; GFX908-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos:
938; GFX908:       ; %bb.0:
939; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
940; GFX908-NEXT:    flat_load_dword v4, v[0:1] offset:2044
941; GFX908-NEXT:    s_mov_b64 s[4:5], 0
942; GFX908-NEXT:  .LBB4_1: ; %atomicrmw.start
943; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
944; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
945; GFX908-NEXT:    v_sub_f32_e32 v3, v4, v2
946; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
947; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
948; GFX908-NEXT:    buffer_wbinvl1
949; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
950; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
951; GFX908-NEXT:    v_mov_b32_e32 v4, v3
952; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
953; GFX908-NEXT:    s_cbranch_execnz .LBB4_1
954; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
955; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
956; GFX908-NEXT:    s_setpc_b64 s[30:31]
957;
958; GFX8-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos:
959; GFX8:       ; %bb.0:
960; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
961; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
962; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
963; GFX8-NEXT:    flat_load_dword v4, v[0:1]
964; GFX8-NEXT:    s_mov_b64 s[4:5], 0
965; GFX8-NEXT:  .LBB4_1: ; %atomicrmw.start
966; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
967; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
968; GFX8-NEXT:    v_sub_f32_e32 v3, v4, v2
969; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
970; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
971; GFX8-NEXT:    buffer_wbinvl1
972; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
973; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
974; GFX8-NEXT:    v_mov_b32_e32 v4, v3
975; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
976; GFX8-NEXT:    s_cbranch_execnz .LBB4_1
977; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
978; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
979; GFX8-NEXT:    s_setpc_b64 s[30:31]
980;
981; GFX7-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos:
982; GFX7:       ; %bb.0:
983; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
984; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
985; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
986; GFX7-NEXT:    flat_load_dword v4, v[0:1]
987; GFX7-NEXT:    s_mov_b64 s[4:5], 0
988; GFX7-NEXT:  .LBB4_1: ; %atomicrmw.start
989; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
990; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
991; GFX7-NEXT:    v_sub_f32_e32 v3, v4, v2
992; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
993; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
994; GFX7-NEXT:    buffer_wbinvl1
995; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
996; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
997; GFX7-NEXT:    v_mov_b32_e32 v4, v3
998; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
999; GFX7-NEXT:    s_cbranch_execnz .LBB4_1
1000; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
1001; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
1002; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; Index 511 of a float element type is a +2044 byte displacement.
1003  %gep = getelementptr float, ptr %ptr, i64 511
  ; The old value returned by the atomicrmw is never read (the noret case).
1004  %unused = atomicrmw fsub ptr %gep, float %val syncscope("agent") seq_cst
1005  ret void
1006}
1007
; No-return, agent-scope, seq_cst atomicrmw fsub through a flat pointer at a
; negative offset: 512 floats before %ptr, i.e. -2048 bytes (0xfffff800 in
; the low dword, -1 in the high dword).
; Every expected sequence below is a flat_atomic_cmpswap retry loop
; (%atomicrmw.start). Only the gfx1200 expectation carries the displacement
; in the instruction (offset:-2048); all other targets add the constant to
; the 64-bit address before the loop. The gfx940, gfx1100, gfx90a and gfx908
; expectations currently compute that address twice, into two register pairs.
; The check lines are autogenerated (see the first line of the file);
; regenerate them with the update script rather than editing by hand.
1008define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg(ptr %ptr, float %val) #0 {
1009; GFX12-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg:
1010; GFX12:       ; %bb.0:
1011; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1012; GFX12-NEXT:    s_wait_expcnt 0x0
1013; GFX12-NEXT:    s_wait_samplecnt 0x0
1014; GFX12-NEXT:    s_wait_bvhcnt 0x0
1015; GFX12-NEXT:    s_wait_kmcnt 0x0
1016; GFX12-NEXT:    flat_load_b32 v4, v[0:1] offset:-2048
1017; GFX12-NEXT:    s_mov_b32 s0, 0
1018; GFX12-NEXT:  .LBB5_1: ; %atomicrmw.start
1019; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
1020; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1021; GFX12-NEXT:    v_sub_f32_e32 v3, v4, v2
1022; GFX12-NEXT:    s_wait_storecnt 0x0
1023; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1024; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1025; GFX12-NEXT:    global_inv scope:SCOPE_DEV
1026; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
1027; GFX12-NEXT:    v_mov_b32_e32 v4, v3
1028; GFX12-NEXT:    s_wait_alu 0xfffe
1029; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
1030; GFX12-NEXT:    s_wait_alu 0xfffe
1031; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
1032; GFX12-NEXT:    s_cbranch_execnz .LBB5_1
1033; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
1034; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1035; GFX12-NEXT:    s_wait_alu 0xfffe
1036; GFX12-NEXT:    s_setpc_b64 s[30:31]
1037;
1038; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg:
1039; GFX940:       ; %bb.0:
1040; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1041; GFX940-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
1042; GFX940-NEXT:    s_movk_i32 s0, 0xf800
1043; GFX940-NEXT:    s_nop 0
1044; GFX940-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
1045; GFX940-NEXT:    flat_load_dword v5, v[4:5]
1046; GFX940-NEXT:    s_mov_b32 s1, -1
1047; GFX940-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
1048; GFX940-NEXT:    s_mov_b64 s[0:1], 0
1049; GFX940-NEXT:  .LBB5_1: ; %atomicrmw.start
1050; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
1051; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1052; GFX940-NEXT:    v_sub_f32_e32 v4, v5, v2
1053; GFX940-NEXT:    buffer_wbl2 sc1
1054; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
1055; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1056; GFX940-NEXT:    buffer_inv sc1
1057; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
1058; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1059; GFX940-NEXT:    v_mov_b32_e32 v5, v3
1060; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
1061; GFX940-NEXT:    s_cbranch_execnz .LBB5_1
1062; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
1063; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
1064; GFX940-NEXT:    s_setpc_b64 s[30:31]
1065;
1066; GFX11-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg:
1067; GFX11:       ; %bb.0:
1068; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1069; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
1070; GFX11-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
1071; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
1072; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
1073; GFX11-NEXT:    flat_load_b32 v4, v[3:4]
1074; GFX11-NEXT:    s_mov_b32 s0, 0
1075; GFX11-NEXT:  .LBB5_1: ; %atomicrmw.start
1076; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
1077; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1078; GFX11-NEXT:    v_sub_f32_e32 v3, v4, v2
1079; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1080; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
1081; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1082; GFX11-NEXT:    buffer_gl1_inv
1083; GFX11-NEXT:    buffer_gl0_inv
1084; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
1085; GFX11-NEXT:    v_mov_b32_e32 v4, v3
1086; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
1087; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1088; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
1089; GFX11-NEXT:    s_cbranch_execnz .LBB5_1
1090; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
1091; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1092; GFX11-NEXT:    s_setpc_b64 s[30:31]
1093;
1094; GFX10-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg:
1095; GFX10:       ; %bb.0:
1096; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1097; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
1098; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
1099; GFX10-NEXT:    s_mov_b32 s4, 0
1100; GFX10-NEXT:    flat_load_dword v4, v[0:1]
1101; GFX10-NEXT:  .LBB5_1: ; %atomicrmw.start
1102; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
1103; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1104; GFX10-NEXT:    v_sub_f32_e32 v3, v4, v2
1105; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1106; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1107; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1108; GFX10-NEXT:    buffer_gl1_inv
1109; GFX10-NEXT:    buffer_gl0_inv
1110; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
1111; GFX10-NEXT:    v_mov_b32_e32 v4, v3
1112; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
1113; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
1114; GFX10-NEXT:    s_cbranch_execnz .LBB5_1
1115; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
1116; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1117; GFX10-NEXT:    s_setpc_b64 s[30:31]
1118;
1119; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg:
1120; GFX90A:       ; %bb.0:
1121; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1122; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
1123; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
1124; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
1125; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
1126; GFX90A-NEXT:    flat_load_dword v1, v[0:1]
1127; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
1128; GFX90A-NEXT:  .LBB5_1: ; %atomicrmw.start
1129; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
1130; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1131; GFX90A-NEXT:    v_sub_f32_e32 v0, v1, v2
1132; GFX90A-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
1133; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1134; GFX90A-NEXT:    buffer_wbinvl1
1135; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
1136; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1137; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
1138; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1139; GFX90A-NEXT:    s_cbranch_execnz .LBB5_1
1140; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
1141; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
1142; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1143;
1144; GFX908-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg:
1145; GFX908:       ; %bb.0:
1146; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1147; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
1148; GFX908-NEXT:    v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
1149; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
1150; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
1151; GFX908-NEXT:    flat_load_dword v1, v[0:1]
1152; GFX908-NEXT:    s_mov_b64 s[4:5], 0
1153; GFX908-NEXT:  .LBB5_1: ; %atomicrmw.start
1154; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
1155; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1156; GFX908-NEXT:    v_sub_f32_e32 v0, v1, v2
1157; GFX908-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
1158; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1159; GFX908-NEXT:    buffer_wbinvl1
1160; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
1161; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1162; GFX908-NEXT:    v_mov_b32_e32 v1, v0
1163; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1164; GFX908-NEXT:    s_cbranch_execnz .LBB5_1
1165; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
1166; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
1167; GFX908-NEXT:    s_setpc_b64 s[30:31]
1168;
1169; GFX8-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg:
1170; GFX8:       ; %bb.0:
1171; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1172; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
1173; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
1174; GFX8-NEXT:    flat_load_dword v4, v[0:1]
1175; GFX8-NEXT:    s_mov_b64 s[4:5], 0
1176; GFX8-NEXT:  .LBB5_1: ; %atomicrmw.start
1177; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
1178; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1179; GFX8-NEXT:    v_sub_f32_e32 v3, v4, v2
1180; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1181; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1182; GFX8-NEXT:    buffer_wbinvl1
1183; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
1184; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1185; GFX8-NEXT:    v_mov_b32_e32 v4, v3
1186; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1187; GFX8-NEXT:    s_cbranch_execnz .LBB5_1
1188; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
1189; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1190; GFX8-NEXT:    s_setpc_b64 s[30:31]
1191;
1192; GFX7-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg:
1193; GFX7:       ; %bb.0:
1194; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1195; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0xfffff800, v0
1196; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
1197; GFX7-NEXT:    flat_load_dword v4, v[0:1]
1198; GFX7-NEXT:    s_mov_b64 s[4:5], 0
1199; GFX7-NEXT:  .LBB5_1: ; %atomicrmw.start
1200; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
1201; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1202; GFX7-NEXT:    v_sub_f32_e32 v3, v4, v2
1203; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1204; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1205; GFX7-NEXT:    buffer_wbinvl1
1206; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
1207; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1208; GFX7-NEXT:    v_mov_b32_e32 v4, v3
1209; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1210; GFX7-NEXT:    s_cbranch_execnz .LBB5_1
1211; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
1212; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
1213; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; Index -512 of a float element type is a -2048 byte displacement.
1214  %gep = getelementptr float, ptr %ptr, i64 -512
  ; The old value returned by the atomicrmw is never read (the noret case).
1215  %unused = atomicrmw fsub ptr %gep, float %val syncscope("agent") seq_cst
1216  ret void
1217}
1218
1219define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val) #0 {
1220; GFX12-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos:
1221; GFX12:       ; %bb.0:
1222; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1223; GFX12-NEXT:    s_wait_expcnt 0x0
1224; GFX12-NEXT:    s_wait_samplecnt 0x0
1225; GFX12-NEXT:    s_wait_bvhcnt 0x0
1226; GFX12-NEXT:    s_wait_kmcnt 0x0
1227; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
1228; GFX12-NEXT:    s_mov_b32 s0, 0
1229; GFX12-NEXT:  .LBB6_1: ; %atomicrmw.start
1230; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
1231; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1232; GFX12-NEXT:    v_mov_b32_e32 v4, v3
1233; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1234; GFX12-NEXT:    v_sub_f32_e32 v3, v4, v2
1235; GFX12-NEXT:    global_wb scope:SCOPE_SYS
1236; GFX12-NEXT:    s_wait_storecnt 0x0
1237; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
1238; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1239; GFX12-NEXT:    global_inv scope:SCOPE_SYS
1240; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
1241; GFX12-NEXT:    s_wait_alu 0xfffe
1242; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
1243; GFX12-NEXT:    s_wait_alu 0xfffe
1244; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
1245; GFX12-NEXT:    s_cbranch_execnz .LBB6_1
1246; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
1247; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1248; GFX12-NEXT:    v_mov_b32_e32 v0, v3
1249; GFX12-NEXT:    s_wait_alu 0xfffe
1250; GFX12-NEXT:    s_setpc_b64 s[30:31]
1251;
1252; GFX940-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos:
1253; GFX940:       ; %bb.0:
1254; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1255; GFX940-NEXT:    flat_load_dword v3, v[0:1] offset:2044
1256; GFX940-NEXT:    s_mov_b64 s[0:1], 0
1257; GFX940-NEXT:  .LBB6_1: ; %atomicrmw.start
1258; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
1259; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1260; GFX940-NEXT:    v_mov_b32_e32 v5, v3
1261; GFX940-NEXT:    v_sub_f32_e32 v4, v5, v2
1262; GFX940-NEXT:    buffer_wbl2 sc0 sc1
1263; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1
1264; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1265; GFX940-NEXT:    buffer_inv sc0 sc1
1266; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
1267; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1268; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
1269; GFX940-NEXT:    s_cbranch_execnz .LBB6_1
1270; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
1271; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
1272; GFX940-NEXT:    v_mov_b32_e32 v0, v3
1273; GFX940-NEXT:    s_setpc_b64 s[30:31]
1274;
1275; GFX11-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos:
1276; GFX11:       ; %bb.0:
1277; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1278; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
1279; GFX11-NEXT:    s_mov_b32 s0, 0
1280; GFX11-NEXT:  .LBB6_1: ; %atomicrmw.start
1281; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
1282; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1283; GFX11-NEXT:    v_mov_b32_e32 v4, v3
1284; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1285; GFX11-NEXT:    v_sub_f32_e32 v3, v4, v2
1286; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1287; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
1288; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1289; GFX11-NEXT:    buffer_gl1_inv
1290; GFX11-NEXT:    buffer_gl0_inv
1291; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
1292; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
1293; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1294; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
1295; GFX11-NEXT:    s_cbranch_execnz .LBB6_1
1296; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
1297; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1298; GFX11-NEXT:    v_mov_b32_e32 v0, v3
1299; GFX11-NEXT:    s_setpc_b64 s[30:31]
1300;
1301; GFX10-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos:
1302; GFX10:       ; %bb.0:
1303; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1304; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fc, v0
1305; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
1306; GFX10-NEXT:    s_mov_b32 s4, 0
1307; GFX10-NEXT:    flat_load_dword v0, v[3:4]
1308; GFX10-NEXT:  .LBB6_1: ; %atomicrmw.start
1309; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
1310; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1311; GFX10-NEXT:    v_mov_b32_e32 v1, v0
1312; GFX10-NEXT:    v_sub_f32_e32 v0, v1, v2
1313; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1314; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
1315; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1316; GFX10-NEXT:    buffer_gl1_inv
1317; GFX10-NEXT:    buffer_gl0_inv
1318; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
1319; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
1320; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
1321; GFX10-NEXT:    s_cbranch_execnz .LBB6_1
1322; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
1323; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1324; GFX10-NEXT:    s_setpc_b64 s[30:31]
1325;
1326; GFX90A-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos:
1327; GFX90A:       ; %bb.0:
1328; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1329; GFX90A-NEXT:    flat_load_dword v3, v[0:1] offset:2044
1330; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
1331; GFX90A-NEXT:  .LBB6_1: ; %atomicrmw.start
1332; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
1333; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1334; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
1335; GFX90A-NEXT:    v_sub_f32_e32 v4, v5, v2
1336; GFX90A-NEXT:    buffer_wbl2
1337; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
1338; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1339; GFX90A-NEXT:    buffer_invl2
1340; GFX90A-NEXT:    buffer_wbinvl1
1341; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
1342; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1343; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1344; GFX90A-NEXT:    s_cbranch_execnz .LBB6_1
1345; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
1346; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
1347; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
1348; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1349;
1350; GFX908-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos:
1351; GFX908:       ; %bb.0:
1352; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1353; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2044
1354; GFX908-NEXT:    s_mov_b64 s[4:5], 0
1355; GFX908-NEXT:  .LBB6_1: ; %atomicrmw.start
1356; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
1357; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1358; GFX908-NEXT:    v_mov_b32_e32 v4, v3
1359; GFX908-NEXT:    v_sub_f32_e32 v3, v4, v2
1360; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
1361; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1362; GFX908-NEXT:    buffer_wbinvl1
1363; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
1364; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1365; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1366; GFX908-NEXT:    s_cbranch_execnz .LBB6_1
1367; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
1368; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
1369; GFX908-NEXT:    v_mov_b32_e32 v0, v3
1370; GFX908-NEXT:    s_setpc_b64 s[30:31]
1371;
1372; GFX8-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos:
1373; GFX8:       ; %bb.0:
1374; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1375; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
1376; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
1377; GFX8-NEXT:    flat_load_dword v0, v[3:4]
1378; GFX8-NEXT:    s_mov_b64 s[4:5], 0
1379; GFX8-NEXT:  .LBB6_1: ; %atomicrmw.start
1380; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
1381; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1382; GFX8-NEXT:    v_mov_b32_e32 v1, v0
1383; GFX8-NEXT:    v_sub_f32_e32 v0, v1, v2
1384; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
1385; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1386; GFX8-NEXT:    buffer_wbinvl1
1387; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
1388; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1389; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1390; GFX8-NEXT:    s_cbranch_execnz .LBB6_1
1391; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
1392; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1393; GFX8-NEXT:    s_setpc_b64 s[30:31]
1394;
1395; GFX7-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos:
1396; GFX7:       ; %bb.0:
1397; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1398; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 0x7fc, v0
1399; GFX7-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
1400; GFX7-NEXT:    flat_load_dword v0, v[3:4]
1401; GFX7-NEXT:    s_mov_b64 s[4:5], 0
1402; GFX7-NEXT:  .LBB6_1: ; %atomicrmw.start
1403; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
1404; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1405; GFX7-NEXT:    v_mov_b32_e32 v1, v0
1406; GFX7-NEXT:    v_sub_f32_e32 v0, v1, v2
1407; GFX7-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
1408; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1409; GFX7-NEXT:    buffer_wbinvl1
1410; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
1411; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1412; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1413; GFX7-NEXT:    s_cbranch_execnz .LBB6_1
1414; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
1415; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
1416; GFX7-NEXT:    s_setpc_b64 s[30:31]
1417  %gep = getelementptr float, ptr %ptr, i64 511
1418  %result = atomicrmw fsub ptr %gep, float %val seq_cst
1419  ret float %result
1420}
1421
; Test: seq_cst `atomicrmw fsub` on a float through a flat pointer, with no
; syncscope given on the instruction and the result left unused. The address is
; %ptr advanced by 511 floats, i.e. a byte offset of 2044 (0x7fc).
;
; What the assertions below pin down:
;   * every target implements the operation as a flat_atomic_cmpswap retry
;     loop that branches back to .LBB7_1 until exec is empty;
;   * gfx1200, gfx940, gfx1100, gfx90a and gfx908 encode the offset in the
;     instruction's offset:2044 field;
;   * gfx1010, tonga and hawaii add 0x7fc to the 64-bit address up front and
;     use offset-less flat instructions.
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; do not edit them by hand.
1422define void @flat_system_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %val) #0 {
1423; GFX12-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos:
1424; GFX12:       ; %bb.0:
1425; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1426; GFX12-NEXT:    s_wait_expcnt 0x0
1427; GFX12-NEXT:    s_wait_samplecnt 0x0
1428; GFX12-NEXT:    s_wait_bvhcnt 0x0
1429; GFX12-NEXT:    s_wait_kmcnt 0x0
1430; GFX12-NEXT:    flat_load_b32 v4, v[0:1] offset:2044
1431; GFX12-NEXT:    s_mov_b32 s0, 0
1432; GFX12-NEXT:  .LBB7_1: ; %atomicrmw.start
1433; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
1434; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1435; GFX12-NEXT:    v_sub_f32_e32 v3, v4, v2
1436; GFX12-NEXT:    global_wb scope:SCOPE_SYS
1437; GFX12-NEXT:    s_wait_storecnt 0x0
1438; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
1439; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1440; GFX12-NEXT:    global_inv scope:SCOPE_SYS
1441; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
1442; GFX12-NEXT:    v_mov_b32_e32 v4, v3
1443; GFX12-NEXT:    s_wait_alu 0xfffe
1444; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
1445; GFX12-NEXT:    s_wait_alu 0xfffe
1446; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
1447; GFX12-NEXT:    s_cbranch_execnz .LBB7_1
1448; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
1449; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1450; GFX12-NEXT:    s_wait_alu 0xfffe
1451; GFX12-NEXT:    s_setpc_b64 s[30:31]
1452;
1453; GFX940-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos:
1454; GFX940:       ; %bb.0:
1455; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1456; GFX940-NEXT:    flat_load_dword v5, v[0:1] offset:2044
1457; GFX940-NEXT:    s_mov_b64 s[0:1], 0
1458; GFX940-NEXT:  .LBB7_1: ; %atomicrmw.start
1459; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
1460; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1461; GFX940-NEXT:    v_sub_f32_e32 v4, v5, v2
1462; GFX940-NEXT:    buffer_wbl2 sc0 sc1
1463; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1
1464; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1465; GFX940-NEXT:    buffer_inv sc0 sc1
1466; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
1467; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1468; GFX940-NEXT:    v_mov_b32_e32 v5, v3
1469; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
1470; GFX940-NEXT:    s_cbranch_execnz .LBB7_1
1471; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
1472; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
1473; GFX940-NEXT:    s_setpc_b64 s[30:31]
1474;
1475; GFX11-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos:
1476; GFX11:       ; %bb.0:
1477; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1478; GFX11-NEXT:    flat_load_b32 v4, v[0:1] offset:2044
1479; GFX11-NEXT:    s_mov_b32 s0, 0
1480; GFX11-NEXT:  .LBB7_1: ; %atomicrmw.start
1481; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
1482; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1483; GFX11-NEXT:    v_sub_f32_e32 v3, v4, v2
1484; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1485; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
1486; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1487; GFX11-NEXT:    buffer_gl1_inv
1488; GFX11-NEXT:    buffer_gl0_inv
1489; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
1490; GFX11-NEXT:    v_mov_b32_e32 v4, v3
1491; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
1492; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1493; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
1494; GFX11-NEXT:    s_cbranch_execnz .LBB7_1
1495; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
1496; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1497; GFX11-NEXT:    s_setpc_b64 s[30:31]
1498;
1499; GFX10-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos:
1500; GFX10:       ; %bb.0:
1501; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1502; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
1503; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1504; GFX10-NEXT:    s_mov_b32 s4, 0
1505; GFX10-NEXT:    flat_load_dword v4, v[0:1]
1506; GFX10-NEXT:  .LBB7_1: ; %atomicrmw.start
1507; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
1508; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1509; GFX10-NEXT:    v_sub_f32_e32 v3, v4, v2
1510; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1511; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1512; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1513; GFX10-NEXT:    buffer_gl1_inv
1514; GFX10-NEXT:    buffer_gl0_inv
1515; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
1516; GFX10-NEXT:    v_mov_b32_e32 v4, v3
1517; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
1518; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
1519; GFX10-NEXT:    s_cbranch_execnz .LBB7_1
1520; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
1521; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1522; GFX10-NEXT:    s_setpc_b64 s[30:31]
1523;
1524; GFX90A-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos:
1525; GFX90A:       ; %bb.0:
1526; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1527; GFX90A-NEXT:    flat_load_dword v5, v[0:1] offset:2044
1528; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
1529; GFX90A-NEXT:  .LBB7_1: ; %atomicrmw.start
1530; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
1531; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1532; GFX90A-NEXT:    v_sub_f32_e32 v4, v5, v2
1533; GFX90A-NEXT:    buffer_wbl2
1534; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
1535; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1536; GFX90A-NEXT:    buffer_invl2
1537; GFX90A-NEXT:    buffer_wbinvl1
1538; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
1539; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1540; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
1541; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1542; GFX90A-NEXT:    s_cbranch_execnz .LBB7_1
1543; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
1544; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
1545; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1546;
1547; GFX908-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos:
1548; GFX908:       ; %bb.0:
1549; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1550; GFX908-NEXT:    flat_load_dword v4, v[0:1] offset:2044
1551; GFX908-NEXT:    s_mov_b64 s[4:5], 0
1552; GFX908-NEXT:  .LBB7_1: ; %atomicrmw.start
1553; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
1554; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1555; GFX908-NEXT:    v_sub_f32_e32 v3, v4, v2
1556; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
1557; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1558; GFX908-NEXT:    buffer_wbinvl1
1559; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
1560; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1561; GFX908-NEXT:    v_mov_b32_e32 v4, v3
1562; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1563; GFX908-NEXT:    s_cbranch_execnz .LBB7_1
1564; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
1565; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
1566; GFX908-NEXT:    s_setpc_b64 s[30:31]
1567;
1568; GFX8-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos:
1569; GFX8:       ; %bb.0:
1570; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1571; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
1572; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1573; GFX8-NEXT:    flat_load_dword v4, v[0:1]
1574; GFX8-NEXT:    s_mov_b64 s[4:5], 0
1575; GFX8-NEXT:  .LBB7_1: ; %atomicrmw.start
1576; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
1577; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1578; GFX8-NEXT:    v_sub_f32_e32 v3, v4, v2
1579; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1580; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1581; GFX8-NEXT:    buffer_wbinvl1
1582; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
1583; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1584; GFX8-NEXT:    v_mov_b32_e32 v4, v3
1585; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1586; GFX8-NEXT:    s_cbranch_execnz .LBB7_1
1587; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
1588; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1589; GFX8-NEXT:    s_setpc_b64 s[30:31]
1590;
1591; GFX7-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos:
1592; GFX7:       ; %bb.0:
1593; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1594; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
1595; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1596; GFX7-NEXT:    flat_load_dword v4, v[0:1]
1597; GFX7-NEXT:    s_mov_b64 s[4:5], 0
1598; GFX7-NEXT:  .LBB7_1: ; %atomicrmw.start
1599; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
1600; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1601; GFX7-NEXT:    v_sub_f32_e32 v3, v4, v2
1602; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1603; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1604; GFX7-NEXT:    buffer_wbinvl1
1605; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
1606; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1607; GFX7-NEXT:    v_mov_b32_e32 v4, v3
1608; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1609; GFX7-NEXT:    s_cbranch_execnz .LBB7_1
1610; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
1611; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
1612; GFX7-NEXT:    s_setpc_b64 s[30:31]
; 511 floats * 4 bytes = 2044 (0x7fc), the constant seen in the assertions above.
1613  %gep = getelementptr float, ptr %ptr, i64 511
1614  %unused = atomicrmw fsub ptr %gep, float %val seq_cst
1615  ret void
1616}
1617
1618; --------------------------------------------------------------------
1619; float with ftz/daz
1620; --------------------------------------------------------------------
1621
; Test: seq_cst `atomicrmw fsub` at syncscope("agent") on a float through a
; flat pointer, applied directly to %ptr (no offset), returning the value the
; atomicrmw produces. The function uses attribute group #1; its definition is
; not in this part of the file, but the section banner above labels these
; tests "float with ftz/daz".
;
; What the assertions below pin down:
;   * every target implements the operation as a flat_atomic_cmpswap retry
;     loop that branches back to .LBB8_1 until exec is empty;
;   * after the loop, the value returned by the last cmpswap (v3) is copied
;     into v0 before the s_setpc_b64 return.
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; do not edit them by hand.
1622define float @flat_agent_atomic_fsub_ret_f32__ftz(ptr %ptr, float %val) #1 {
1623; GFX12-LABEL: flat_agent_atomic_fsub_ret_f32__ftz:
1624; GFX12:       ; %bb.0:
1625; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1626; GFX12-NEXT:    s_wait_expcnt 0x0
1627; GFX12-NEXT:    s_wait_samplecnt 0x0
1628; GFX12-NEXT:    s_wait_bvhcnt 0x0
1629; GFX12-NEXT:    s_wait_kmcnt 0x0
1630; GFX12-NEXT:    flat_load_b32 v3, v[0:1]
1631; GFX12-NEXT:    s_mov_b32 s0, 0
1632; GFX12-NEXT:  .LBB8_1: ; %atomicrmw.start
1633; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
1634; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1635; GFX12-NEXT:    v_mov_b32_e32 v4, v3
1636; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1637; GFX12-NEXT:    v_sub_f32_e32 v3, v4, v2
1638; GFX12-NEXT:    s_wait_storecnt 0x0
1639; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1640; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1641; GFX12-NEXT:    global_inv scope:SCOPE_DEV
1642; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
1643; GFX12-NEXT:    s_wait_alu 0xfffe
1644; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
1645; GFX12-NEXT:    s_wait_alu 0xfffe
1646; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
1647; GFX12-NEXT:    s_cbranch_execnz .LBB8_1
1648; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
1649; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1650; GFX12-NEXT:    v_mov_b32_e32 v0, v3
1651; GFX12-NEXT:    s_wait_alu 0xfffe
1652; GFX12-NEXT:    s_setpc_b64 s[30:31]
1653;
1654; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__ftz:
1655; GFX940:       ; %bb.0:
1656; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1657; GFX940-NEXT:    flat_load_dword v3, v[0:1]
1658; GFX940-NEXT:    s_mov_b64 s[0:1], 0
1659; GFX940-NEXT:  .LBB8_1: ; %atomicrmw.start
1660; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
1661; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1662; GFX940-NEXT:    v_mov_b32_e32 v5, v3
1663; GFX940-NEXT:    v_sub_f32_e32 v4, v5, v2
1664; GFX940-NEXT:    buffer_wbl2 sc1
1665; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
1666; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1667; GFX940-NEXT:    buffer_inv sc1
1668; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
1669; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1670; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
1671; GFX940-NEXT:    s_cbranch_execnz .LBB8_1
1672; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
1673; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
1674; GFX940-NEXT:    v_mov_b32_e32 v0, v3
1675; GFX940-NEXT:    s_setpc_b64 s[30:31]
1676;
1677; GFX11-LABEL: flat_agent_atomic_fsub_ret_f32__ftz:
1678; GFX11:       ; %bb.0:
1679; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1680; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
1681; GFX11-NEXT:    s_mov_b32 s0, 0
1682; GFX11-NEXT:  .LBB8_1: ; %atomicrmw.start
1683; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
1684; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1685; GFX11-NEXT:    v_mov_b32_e32 v4, v3
1686; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1687; GFX11-NEXT:    v_sub_f32_e32 v3, v4, v2
1688; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1689; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
1690; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1691; GFX11-NEXT:    buffer_gl1_inv
1692; GFX11-NEXT:    buffer_gl0_inv
1693; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
1694; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
1695; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1696; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
1697; GFX11-NEXT:    s_cbranch_execnz .LBB8_1
1698; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
1699; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1700; GFX11-NEXT:    v_mov_b32_e32 v0, v3
1701; GFX11-NEXT:    s_setpc_b64 s[30:31]
1702;
1703; GFX10-LABEL: flat_agent_atomic_fsub_ret_f32__ftz:
1704; GFX10:       ; %bb.0:
1705; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1706; GFX10-NEXT:    flat_load_dword v3, v[0:1]
1707; GFX10-NEXT:    s_mov_b32 s4, 0
1708; GFX10-NEXT:  .LBB8_1: ; %atomicrmw.start
1709; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
1710; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1711; GFX10-NEXT:    v_mov_b32_e32 v4, v3
1712; GFX10-NEXT:    v_sub_f32_e32 v3, v4, v2
1713; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1714; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1715; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1716; GFX10-NEXT:    buffer_gl1_inv
1717; GFX10-NEXT:    buffer_gl0_inv
1718; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
1719; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
1720; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
1721; GFX10-NEXT:    s_cbranch_execnz .LBB8_1
1722; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
1723; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1724; GFX10-NEXT:    v_mov_b32_e32 v0, v3
1725; GFX10-NEXT:    s_setpc_b64 s[30:31]
1726;
1727; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f32__ftz:
1728; GFX90A:       ; %bb.0:
1729; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1730; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
1731; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
1732; GFX90A-NEXT:  .LBB8_1: ; %atomicrmw.start
1733; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
1734; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1735; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
1736; GFX90A-NEXT:    v_sub_f32_e32 v4, v5, v2
1737; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
1738; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1739; GFX90A-NEXT:    buffer_wbinvl1
1740; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
1741; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1742; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1743; GFX90A-NEXT:    s_cbranch_execnz .LBB8_1
1744; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
1745; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
1746; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
1747; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1748;
1749; GFX908-LABEL: flat_agent_atomic_fsub_ret_f32__ftz:
1750; GFX908:       ; %bb.0:
1751; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1752; GFX908-NEXT:    flat_load_dword v3, v[0:1]
1753; GFX908-NEXT:    s_mov_b64 s[4:5], 0
1754; GFX908-NEXT:  .LBB8_1: ; %atomicrmw.start
1755; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
1756; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1757; GFX908-NEXT:    v_mov_b32_e32 v4, v3
1758; GFX908-NEXT:    v_sub_f32_e32 v3, v4, v2
1759; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1760; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1761; GFX908-NEXT:    buffer_wbinvl1
1762; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
1763; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1764; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1765; GFX908-NEXT:    s_cbranch_execnz .LBB8_1
1766; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
1767; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
1768; GFX908-NEXT:    v_mov_b32_e32 v0, v3
1769; GFX908-NEXT:    s_setpc_b64 s[30:31]
1770;
1771; GFX8-LABEL: flat_agent_atomic_fsub_ret_f32__ftz:
1772; GFX8:       ; %bb.0:
1773; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1774; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1775; GFX8-NEXT:    s_mov_b64 s[4:5], 0
1776; GFX8-NEXT:  .LBB8_1: ; %atomicrmw.start
1777; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
1778; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1779; GFX8-NEXT:    v_mov_b32_e32 v4, v3
1780; GFX8-NEXT:    v_sub_f32_e32 v3, v4, v2
1781; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1782; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1783; GFX8-NEXT:    buffer_wbinvl1
1784; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
1785; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1786; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1787; GFX8-NEXT:    s_cbranch_execnz .LBB8_1
1788; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
1789; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1790; GFX8-NEXT:    v_mov_b32_e32 v0, v3
1791; GFX8-NEXT:    s_setpc_b64 s[30:31]
1792;
1793; GFX7-LABEL: flat_agent_atomic_fsub_ret_f32__ftz:
1794; GFX7:       ; %bb.0:
1795; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1796; GFX7-NEXT:    flat_load_dword v3, v[0:1]
1797; GFX7-NEXT:    s_mov_b64 s[4:5], 0
1798; GFX7-NEXT:  .LBB8_1: ; %atomicrmw.start
1799; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
1800; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1801; GFX7-NEXT:    v_mov_b32_e32 v4, v3
1802; GFX7-NEXT:    v_sub_f32_e32 v3, v4, v2
1803; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1804; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1805; GFX7-NEXT:    buffer_wbinvl1
1806; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
1807; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1808; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1809; GFX7-NEXT:    s_cbranch_execnz .LBB8_1
1810; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
1811; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
1812; GFX7-NEXT:    v_mov_b32_e32 v0, v3
1813; GFX7-NEXT:    s_setpc_b64 s[30:31]
1814  %result = atomicrmw fsub ptr %ptr, float %val syncscope("agent") seq_cst
1815  ret float %result
1816}
1817
; Test: seq_cst `atomicrmw fsub` at syncscope("agent") on a float through a
; flat pointer, returning the value the atomicrmw produces. The address is
; %ptr advanced by 511 floats, i.e. a byte offset of 2044 (0x7fc). The function
; uses attribute group #1; its definition is not in this part of the file, but
; the section banner above labels these tests "float with ftz/daz".
;
; What the assertions below pin down:
;   * every target implements the operation as a flat_atomic_cmpswap retry
;     loop that branches back to .LBB9_1 until exec is empty;
;   * gfx1200, gfx940, gfx1100, gfx90a and gfx908 encode the offset in the
;     instruction's offset:2044 field and copy the result from v3 to v0 after
;     the loop;
;   * gfx1010, tonga and hawaii add 0x7fc to the 64-bit address up front
;     (into v[3:4]) and have the cmpswap write its result to v0, so no copy
;     is needed after the loop.
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; do not edit them by hand.
1818define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float %val) #1 {
1819; GFX12-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz:
1820; GFX12:       ; %bb.0:
1821; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1822; GFX12-NEXT:    s_wait_expcnt 0x0
1823; GFX12-NEXT:    s_wait_samplecnt 0x0
1824; GFX12-NEXT:    s_wait_bvhcnt 0x0
1825; GFX12-NEXT:    s_wait_kmcnt 0x0
1826; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
1827; GFX12-NEXT:    s_mov_b32 s0, 0
1828; GFX12-NEXT:  .LBB9_1: ; %atomicrmw.start
1829; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
1830; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1831; GFX12-NEXT:    v_mov_b32_e32 v4, v3
1832; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1833; GFX12-NEXT:    v_sub_f32_e32 v3, v4, v2
1834; GFX12-NEXT:    s_wait_storecnt 0x0
1835; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1836; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1837; GFX12-NEXT:    global_inv scope:SCOPE_DEV
1838; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
1839; GFX12-NEXT:    s_wait_alu 0xfffe
1840; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
1841; GFX12-NEXT:    s_wait_alu 0xfffe
1842; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
1843; GFX12-NEXT:    s_cbranch_execnz .LBB9_1
1844; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
1845; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1846; GFX12-NEXT:    v_mov_b32_e32 v0, v3
1847; GFX12-NEXT:    s_wait_alu 0xfffe
1848; GFX12-NEXT:    s_setpc_b64 s[30:31]
1849;
1850; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz:
1851; GFX940:       ; %bb.0:
1852; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1853; GFX940-NEXT:    flat_load_dword v3, v[0:1] offset:2044
1854; GFX940-NEXT:    s_mov_b64 s[0:1], 0
1855; GFX940-NEXT:  .LBB9_1: ; %atomicrmw.start
1856; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
1857; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1858; GFX940-NEXT:    v_mov_b32_e32 v5, v3
1859; GFX940-NEXT:    v_sub_f32_e32 v4, v5, v2
1860; GFX940-NEXT:    buffer_wbl2 sc1
1861; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0
1862; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1863; GFX940-NEXT:    buffer_inv sc1
1864; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
1865; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1866; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
1867; GFX940-NEXT:    s_cbranch_execnz .LBB9_1
1868; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
1869; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
1870; GFX940-NEXT:    v_mov_b32_e32 v0, v3
1871; GFX940-NEXT:    s_setpc_b64 s[30:31]
1872;
1873; GFX11-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz:
1874; GFX11:       ; %bb.0:
1875; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1876; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
1877; GFX11-NEXT:    s_mov_b32 s0, 0
1878; GFX11-NEXT:  .LBB9_1: ; %atomicrmw.start
1879; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
1880; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1881; GFX11-NEXT:    v_mov_b32_e32 v4, v3
1882; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1883; GFX11-NEXT:    v_sub_f32_e32 v3, v4, v2
1884; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1885; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
1886; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1887; GFX11-NEXT:    buffer_gl1_inv
1888; GFX11-NEXT:    buffer_gl0_inv
1889; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
1890; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
1891; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1892; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
1893; GFX11-NEXT:    s_cbranch_execnz .LBB9_1
1894; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
1895; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1896; GFX11-NEXT:    v_mov_b32_e32 v0, v3
1897; GFX11-NEXT:    s_setpc_b64 s[30:31]
1898;
1899; GFX10-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz:
1900; GFX10:       ; %bb.0:
1901; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1902; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fc, v0
1903; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
1904; GFX10-NEXT:    s_mov_b32 s4, 0
1905; GFX10-NEXT:    flat_load_dword v0, v[3:4]
1906; GFX10-NEXT:  .LBB9_1: ; %atomicrmw.start
1907; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
1908; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1909; GFX10-NEXT:    v_mov_b32_e32 v1, v0
1910; GFX10-NEXT:    v_sub_f32_e32 v0, v1, v2
1911; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1912; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
1913; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1914; GFX10-NEXT:    buffer_gl1_inv
1915; GFX10-NEXT:    buffer_gl0_inv
1916; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
1917; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
1918; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
1919; GFX10-NEXT:    s_cbranch_execnz .LBB9_1
1920; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
1921; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1922; GFX10-NEXT:    s_setpc_b64 s[30:31]
1923;
1924; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz:
1925; GFX90A:       ; %bb.0:
1926; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1927; GFX90A-NEXT:    flat_load_dword v3, v[0:1] offset:2044
1928; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
1929; GFX90A-NEXT:  .LBB9_1: ; %atomicrmw.start
1930; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
1931; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1932; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
1933; GFX90A-NEXT:    v_sub_f32_e32 v4, v5, v2
1934; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
1935; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1936; GFX90A-NEXT:    buffer_wbinvl1
1937; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
1938; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1939; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1940; GFX90A-NEXT:    s_cbranch_execnz .LBB9_1
1941; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
1942; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
1943; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
1944; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1945;
1946; GFX908-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz:
1947; GFX908:       ; %bb.0:
1948; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1949; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2044
1950; GFX908-NEXT:    s_mov_b64 s[4:5], 0
1951; GFX908-NEXT:  .LBB9_1: ; %atomicrmw.start
1952; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
1953; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1954; GFX908-NEXT:    v_mov_b32_e32 v4, v3
1955; GFX908-NEXT:    v_sub_f32_e32 v3, v4, v2
1956; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
1957; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1958; GFX908-NEXT:    buffer_wbinvl1
1959; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
1960; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1961; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1962; GFX908-NEXT:    s_cbranch_execnz .LBB9_1
1963; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
1964; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
1965; GFX908-NEXT:    v_mov_b32_e32 v0, v3
1966; GFX908-NEXT:    s_setpc_b64 s[30:31]
1967;
1968; GFX8-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz:
1969; GFX8:       ; %bb.0:
1970; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1971; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
1972; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
1973; GFX8-NEXT:    flat_load_dword v0, v[3:4]
1974; GFX8-NEXT:    s_mov_b64 s[4:5], 0
1975; GFX8-NEXT:  .LBB9_1: ; %atomicrmw.start
1976; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
1977; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1978; GFX8-NEXT:    v_mov_b32_e32 v1, v0
1979; GFX8-NEXT:    v_sub_f32_e32 v0, v1, v2
1980; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
1981; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1982; GFX8-NEXT:    buffer_wbinvl1
1983; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
1984; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1985; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1986; GFX8-NEXT:    s_cbranch_execnz .LBB9_1
1987; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
1988; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1989; GFX8-NEXT:    s_setpc_b64 s[30:31]
1990;
1991; GFX7-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz:
1992; GFX7:       ; %bb.0:
1993; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1994; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 0x7fc, v0
1995; GFX7-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
1996; GFX7-NEXT:    flat_load_dword v0, v[3:4]
1997; GFX7-NEXT:    s_mov_b64 s[4:5], 0
1998; GFX7-NEXT:  .LBB9_1: ; %atomicrmw.start
1999; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
2000; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2001; GFX7-NEXT:    v_mov_b32_e32 v1, v0
2002; GFX7-NEXT:    v_sub_f32_e32 v0, v1, v2
2003; GFX7-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
2004; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2005; GFX7-NEXT:    buffer_wbinvl1
2006; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
2007; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2008; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2009; GFX7-NEXT:    s_cbranch_execnz .LBB9_1
2010; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
2011; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
2012; GFX7-NEXT:    s_setpc_b64 s[30:31]
; 511 floats * 4 bytes = 2044 (0x7fc), the constant seen in the assertions above.
2013  %gep = getelementptr float, ptr %ptr, i64 511
2014  %result = atomicrmw fsub ptr %gep, float %val syncscope("agent") seq_cst
2015  ret float %result
2016}
2017
; Returning variant with a negative offset: the atomicrmw fsub (agent scope,
; seq_cst) addresses %ptr - 512 floats, i.e. -2048 bytes, and its result is
; returned to the caller.
; For every run line the expected output is an initial load followed by a
; compare-and-swap retry loop (%atomicrmw.start). Only the gfx1200 checks fold
; the -2048 into the offset field of the load and cmpswap; all other targets
; are expected to build the address with explicit 64-bit adds first.
; NOTE(review): attribute group #1 is defined outside this section; going by
; the __ftz suffix it presumably selects flush-to-zero f32 denormals - confirm.
2018define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float %val) #1 {
2019; GFX12-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz:
2020; GFX12:       ; %bb.0:
2021; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2022; GFX12-NEXT:    s_wait_expcnt 0x0
2023; GFX12-NEXT:    s_wait_samplecnt 0x0
2024; GFX12-NEXT:    s_wait_bvhcnt 0x0
2025; GFX12-NEXT:    s_wait_kmcnt 0x0
2026; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:-2048
2027; GFX12-NEXT:    s_mov_b32 s0, 0
2028; GFX12-NEXT:  .LBB10_1: ; %atomicrmw.start
2029; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
2030; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2031; GFX12-NEXT:    v_mov_b32_e32 v4, v3
2032; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2033; GFX12-NEXT:    v_sub_f32_e32 v3, v4, v2
2034; GFX12-NEXT:    s_wait_storecnt 0x0
2035; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
2036; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2037; GFX12-NEXT:    global_inv scope:SCOPE_DEV
2038; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
2039; GFX12-NEXT:    s_wait_alu 0xfffe
2040; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
2041; GFX12-NEXT:    s_wait_alu 0xfffe
2042; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
2043; GFX12-NEXT:    s_cbranch_execnz .LBB10_1
2044; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
2045; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
2046; GFX12-NEXT:    v_mov_b32_e32 v0, v3
2047; GFX12-NEXT:    s_wait_alu 0xfffe
2048; GFX12-NEXT:    s_setpc_b64 s[30:31]
2049;
2050; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz:
2051; GFX940:       ; %bb.0:
2052; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2053; GFX940-NEXT:    v_mov_b32_e32 v4, v0
2054; GFX940-NEXT:    v_mov_b32_e32 v5, v1
2055; GFX940-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v4
2056; GFX940-NEXT:    s_movk_i32 s0, 0xf800
2057; GFX940-NEXT:    s_nop 0
2058; GFX940-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v5, vcc
2059; GFX940-NEXT:    flat_load_dword v0, v[0:1]
2060; GFX940-NEXT:    s_mov_b32 s1, -1
2061; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1]
2062; GFX940-NEXT:    s_mov_b64 s[0:1], 0
2063; GFX940-NEXT:  .LBB10_1: ; %atomicrmw.start
2064; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
2065; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2066; GFX940-NEXT:    v_mov_b32_e32 v1, v0
2067; GFX940-NEXT:    v_sub_f32_e32 v0, v1, v2
2068; GFX940-NEXT:    buffer_wbl2 sc1
2069; GFX940-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0
2070; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2071; GFX940-NEXT:    buffer_inv sc1
2072; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
2073; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2074; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
2075; GFX940-NEXT:    s_cbranch_execnz .LBB10_1
2076; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
2077; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
2078; GFX940-NEXT:    s_setpc_b64 s[30:31]
2079;
2080; GFX11-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz:
2081; GFX11:       ; %bb.0:
2082; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2083; GFX11-NEXT:    v_mov_b32_e32 v3, v0
2084; GFX11-NEXT:    s_mov_b32 s0, 0
2085; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2086; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
2087; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
2088; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
2089; GFX11-NEXT:    flat_load_b32 v0, v[4:5]
2090; GFX11-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
2091; GFX11-NEXT:  .LBB10_1: ; %atomicrmw.start
2092; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
2093; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2094; GFX11-NEXT:    v_mov_b32_e32 v1, v0
2095; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2096; GFX11-NEXT:    v_sub_f32_e32 v0, v1, v2
2097; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2098; GFX11-NEXT:    flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc
2099; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2100; GFX11-NEXT:    buffer_gl1_inv
2101; GFX11-NEXT:    buffer_gl0_inv
2102; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
2103; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
2104; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2105; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
2106; GFX11-NEXT:    s_cbranch_execnz .LBB10_1
2107; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
2108; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
2109; GFX11-NEXT:    s_setpc_b64 s[30:31]
2110;
2111; GFX10-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz:
2112; GFX10:       ; %bb.0:
2113; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2114; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
2115; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
2116; GFX10-NEXT:    s_mov_b32 s4, 0
2117; GFX10-NEXT:    flat_load_dword v0, v[3:4]
2118; GFX10-NEXT:  .LBB10_1: ; %atomicrmw.start
2119; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
2120; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2121; GFX10-NEXT:    v_mov_b32_e32 v1, v0
2122; GFX10-NEXT:    v_sub_f32_e32 v0, v1, v2
2123; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2124; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
2125; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2126; GFX10-NEXT:    buffer_gl1_inv
2127; GFX10-NEXT:    buffer_gl0_inv
2128; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
2129; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
2130; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
2131; GFX10-NEXT:    s_cbranch_execnz .LBB10_1
2132; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
2133; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2134; GFX10-NEXT:    s_setpc_b64 s[30:31]
2135;
2136; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz:
2137; GFX90A:       ; %bb.0:
2138; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2139; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
2140; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
2141; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
2142; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
2143; GFX90A-NEXT:    flat_load_dword v0, v[0:1]
2144; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
2145; GFX90A-NEXT:  .LBB10_1: ; %atomicrmw.start
2146; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
2147; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2148; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
2149; GFX90A-NEXT:    v_sub_f32_e32 v0, v1, v2
2150; GFX90A-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
2151; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2152; GFX90A-NEXT:    buffer_wbinvl1
2153; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
2154; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2155; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2156; GFX90A-NEXT:    s_cbranch_execnz .LBB10_1
2157; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
2158; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
2159; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2160;
2161; GFX908-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz:
2162; GFX908:       ; %bb.0:
2163; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2164; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
2165; GFX908-NEXT:    v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
2166; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
2167; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
2168; GFX908-NEXT:    flat_load_dword v0, v[0:1]
2169; GFX908-NEXT:    s_mov_b64 s[4:5], 0
2170; GFX908-NEXT:  .LBB10_1: ; %atomicrmw.start
2171; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
2172; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2173; GFX908-NEXT:    v_mov_b32_e32 v1, v0
2174; GFX908-NEXT:    v_sub_f32_e32 v0, v1, v2
2175; GFX908-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
2176; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2177; GFX908-NEXT:    buffer_wbinvl1
2178; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
2179; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2180; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2181; GFX908-NEXT:    s_cbranch_execnz .LBB10_1
2182; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
2183; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
2184; GFX908-NEXT:    s_setpc_b64 s[30:31]
2185;
2186; GFX8-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz:
2187; GFX8:       ; %bb.0:
2188; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2189; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
2190; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, -1, v1, vcc
2191; GFX8-NEXT:    flat_load_dword v0, v[3:4]
2192; GFX8-NEXT:    s_mov_b64 s[4:5], 0
2193; GFX8-NEXT:  .LBB10_1: ; %atomicrmw.start
2194; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
2195; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2196; GFX8-NEXT:    v_mov_b32_e32 v1, v0
2197; GFX8-NEXT:    v_sub_f32_e32 v0, v1, v2
2198; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
2199; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2200; GFX8-NEXT:    buffer_wbinvl1
2201; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
2202; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2203; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2204; GFX8-NEXT:    s_cbranch_execnz .LBB10_1
2205; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
2206; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2207; GFX8-NEXT:    s_setpc_b64 s[30:31]
2208;
2209; GFX7-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz:
2210; GFX7:       ; %bb.0:
2211; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2212; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 0xfffff800, v0
2213; GFX7-NEXT:    v_addc_u32_e32 v4, vcc, -1, v1, vcc
2214; GFX7-NEXT:    flat_load_dword v0, v[3:4]
2215; GFX7-NEXT:    s_mov_b64 s[4:5], 0
2216; GFX7-NEXT:  .LBB10_1: ; %atomicrmw.start
2217; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
2218; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2219; GFX7-NEXT:    v_mov_b32_e32 v1, v0
2220; GFX7-NEXT:    v_sub_f32_e32 v0, v1, v2
2221; GFX7-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
2222; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2223; GFX7-NEXT:    buffer_wbinvl1
2224; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
2225; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2226; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2227; GFX7-NEXT:    s_cbranch_execnz .LBB10_1
2228; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
2229; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
2230; GFX7-NEXT:    s_setpc_b64 s[30:31]
2231  %gep = getelementptr float, ptr %ptr, i64 -512
2232  %result = atomicrmw fsub ptr %gep, float %val syncscope("agent") seq_cst
2233  ret float %result
2234}
2235
; No-return variant with no offset: the atomicrmw fsub (agent scope, seq_cst)
; is applied directly to %ptr, its result (%unused) is dropped and the
; function returns void.
; Every run line is still expected to emit an initial load plus a
; compare-and-swap retry loop (%atomicrmw.start): the value produced by the
; cmpswap is compared with the expected one (v_cmp_eq_u32) to decide whether
; another iteration is needed, and is copied back as the next expected value.
; NOTE(review): attribute group #1 is defined outside this section; going by
; the __ftz suffix it presumably selects flush-to-zero f32 denormals - confirm.
2236define void @flat_agent_atomic_fsub_noret_f32__ftz(ptr %ptr, float %val) #1 {
2237; GFX12-LABEL: flat_agent_atomic_fsub_noret_f32__ftz:
2238; GFX12:       ; %bb.0:
2239; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2240; GFX12-NEXT:    s_wait_expcnt 0x0
2241; GFX12-NEXT:    s_wait_samplecnt 0x0
2242; GFX12-NEXT:    s_wait_bvhcnt 0x0
2243; GFX12-NEXT:    s_wait_kmcnt 0x0
2244; GFX12-NEXT:    flat_load_b32 v4, v[0:1]
2245; GFX12-NEXT:    s_mov_b32 s0, 0
2246; GFX12-NEXT:  .LBB11_1: ; %atomicrmw.start
2247; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
2248; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2249; GFX12-NEXT:    v_sub_f32_e32 v3, v4, v2
2250; GFX12-NEXT:    s_wait_storecnt 0x0
2251; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
2252; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2253; GFX12-NEXT:    global_inv scope:SCOPE_DEV
2254; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
2255; GFX12-NEXT:    v_mov_b32_e32 v4, v3
2256; GFX12-NEXT:    s_wait_alu 0xfffe
2257; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
2258; GFX12-NEXT:    s_wait_alu 0xfffe
2259; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
2260; GFX12-NEXT:    s_cbranch_execnz .LBB11_1
2261; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
2262; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
2263; GFX12-NEXT:    s_wait_alu 0xfffe
2264; GFX12-NEXT:    s_setpc_b64 s[30:31]
2265;
2266; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__ftz:
2267; GFX940:       ; %bb.0:
2268; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2269; GFX940-NEXT:    flat_load_dword v5, v[0:1]
2270; GFX940-NEXT:    s_mov_b64 s[0:1], 0
2271; GFX940-NEXT:  .LBB11_1: ; %atomicrmw.start
2272; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
2273; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2274; GFX940-NEXT:    v_sub_f32_e32 v4, v5, v2
2275; GFX940-NEXT:    buffer_wbl2 sc1
2276; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
2277; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2278; GFX940-NEXT:    buffer_inv sc1
2279; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
2280; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2281; GFX940-NEXT:    v_mov_b32_e32 v5, v3
2282; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
2283; GFX940-NEXT:    s_cbranch_execnz .LBB11_1
2284; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
2285; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
2286; GFX940-NEXT:    s_setpc_b64 s[30:31]
2287;
2288; GFX11-LABEL: flat_agent_atomic_fsub_noret_f32__ftz:
2289; GFX11:       ; %bb.0:
2290; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2291; GFX11-NEXT:    flat_load_b32 v4, v[0:1]
2292; GFX11-NEXT:    s_mov_b32 s0, 0
2293; GFX11-NEXT:  .LBB11_1: ; %atomicrmw.start
2294; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
2295; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2296; GFX11-NEXT:    v_sub_f32_e32 v3, v4, v2
2297; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2298; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
2299; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2300; GFX11-NEXT:    buffer_gl1_inv
2301; GFX11-NEXT:    buffer_gl0_inv
2302; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
2303; GFX11-NEXT:    v_mov_b32_e32 v4, v3
2304; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
2305; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2306; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
2307; GFX11-NEXT:    s_cbranch_execnz .LBB11_1
2308; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
2309; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
2310; GFX11-NEXT:    s_setpc_b64 s[30:31]
2311;
2312; GFX10-LABEL: flat_agent_atomic_fsub_noret_f32__ftz:
2313; GFX10:       ; %bb.0:
2314; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2315; GFX10-NEXT:    flat_load_dword v4, v[0:1]
2316; GFX10-NEXT:    s_mov_b32 s4, 0
2317; GFX10-NEXT:  .LBB11_1: ; %atomicrmw.start
2318; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
2319; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2320; GFX10-NEXT:    v_sub_f32_e32 v3, v4, v2
2321; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2322; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
2323; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2324; GFX10-NEXT:    buffer_gl1_inv
2325; GFX10-NEXT:    buffer_gl0_inv
2326; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
2327; GFX10-NEXT:    v_mov_b32_e32 v4, v3
2328; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
2329; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
2330; GFX10-NEXT:    s_cbranch_execnz .LBB11_1
2331; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
2332; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2333; GFX10-NEXT:    s_setpc_b64 s[30:31]
2334;
2335; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f32__ftz:
2336; GFX90A:       ; %bb.0:
2337; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2338; GFX90A-NEXT:    flat_load_dword v5, v[0:1]
2339; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
2340; GFX90A-NEXT:  .LBB11_1: ; %atomicrmw.start
2341; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
2342; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2343; GFX90A-NEXT:    v_sub_f32_e32 v4, v5, v2
2344; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
2345; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2346; GFX90A-NEXT:    buffer_wbinvl1
2347; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
2348; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2349; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
2350; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2351; GFX90A-NEXT:    s_cbranch_execnz .LBB11_1
2352; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
2353; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
2354; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2355;
2356; GFX908-LABEL: flat_agent_atomic_fsub_noret_f32__ftz:
2357; GFX908:       ; %bb.0:
2358; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2359; GFX908-NEXT:    flat_load_dword v4, v[0:1]
2360; GFX908-NEXT:    s_mov_b64 s[4:5], 0
2361; GFX908-NEXT:  .LBB11_1: ; %atomicrmw.start
2362; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
2363; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2364; GFX908-NEXT:    v_sub_f32_e32 v3, v4, v2
2365; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
2366; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2367; GFX908-NEXT:    buffer_wbinvl1
2368; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
2369; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2370; GFX908-NEXT:    v_mov_b32_e32 v4, v3
2371; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2372; GFX908-NEXT:    s_cbranch_execnz .LBB11_1
2373; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
2374; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
2375; GFX908-NEXT:    s_setpc_b64 s[30:31]
2376;
2377; GFX8-LABEL: flat_agent_atomic_fsub_noret_f32__ftz:
2378; GFX8:       ; %bb.0:
2379; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2380; GFX8-NEXT:    flat_load_dword v4, v[0:1]
2381; GFX8-NEXT:    s_mov_b64 s[4:5], 0
2382; GFX8-NEXT:  .LBB11_1: ; %atomicrmw.start
2383; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
2384; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2385; GFX8-NEXT:    v_sub_f32_e32 v3, v4, v2
2386; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
2387; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2388; GFX8-NEXT:    buffer_wbinvl1
2389; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
2390; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2391; GFX8-NEXT:    v_mov_b32_e32 v4, v3
2392; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2393; GFX8-NEXT:    s_cbranch_execnz .LBB11_1
2394; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
2395; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2396; GFX8-NEXT:    s_setpc_b64 s[30:31]
2397;
2398; GFX7-LABEL: flat_agent_atomic_fsub_noret_f32__ftz:
2399; GFX7:       ; %bb.0:
2400; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2401; GFX7-NEXT:    flat_load_dword v4, v[0:1]
2402; GFX7-NEXT:    s_mov_b64 s[4:5], 0
2403; GFX7-NEXT:  .LBB11_1: ; %atomicrmw.start
2404; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
2405; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2406; GFX7-NEXT:    v_sub_f32_e32 v3, v4, v2
2407; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
2408; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2409; GFX7-NEXT:    buffer_wbinvl1
2410; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
2411; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2412; GFX7-NEXT:    v_mov_b32_e32 v4, v3
2413; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2414; GFX7-NEXT:    s_cbranch_execnz .LBB11_1
2415; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
2416; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
2417; GFX7-NEXT:    s_setpc_b64 s[30:31]
2418  %unused = atomicrmw fsub ptr %ptr, float %val syncscope("agent") seq_cst
2419  ret void
2420}
2421
; No-return variant with a positive offset: the atomicrmw fsub (agent scope,
; seq_cst) addresses %ptr + 511 floats, i.e. 2044 bytes (0x7fc), and its
; result (%unused) is dropped.
; The gfx1200, gfx940, gfx1100, gfx90a and gfx908 checks expect offset:2044 on
; both the initial load and the cmpswap inside the retry loop
; (%atomicrmw.start); the gfx1010, tonga and hawaii checks instead expect
; 0x7fc to be added to the 64-bit address before the load.
; NOTE(review): attribute group #1 is defined outside this section; going by
; the __ftz suffix it presumably selects flush-to-zero f32 denormals - confirm.
2422define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, float %val) #1 {
2423; GFX12-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz:
2424; GFX12:       ; %bb.0:
2425; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2426; GFX12-NEXT:    s_wait_expcnt 0x0
2427; GFX12-NEXT:    s_wait_samplecnt 0x0
2428; GFX12-NEXT:    s_wait_bvhcnt 0x0
2429; GFX12-NEXT:    s_wait_kmcnt 0x0
2430; GFX12-NEXT:    flat_load_b32 v4, v[0:1] offset:2044
2431; GFX12-NEXT:    s_mov_b32 s0, 0
2432; GFX12-NEXT:  .LBB12_1: ; %atomicrmw.start
2433; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
2434; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2435; GFX12-NEXT:    v_sub_f32_e32 v3, v4, v2
2436; GFX12-NEXT:    s_wait_storecnt 0x0
2437; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
2438; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2439; GFX12-NEXT:    global_inv scope:SCOPE_DEV
2440; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
2441; GFX12-NEXT:    v_mov_b32_e32 v4, v3
2442; GFX12-NEXT:    s_wait_alu 0xfffe
2443; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
2444; GFX12-NEXT:    s_wait_alu 0xfffe
2445; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
2446; GFX12-NEXT:    s_cbranch_execnz .LBB12_1
2447; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
2448; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
2449; GFX12-NEXT:    s_wait_alu 0xfffe
2450; GFX12-NEXT:    s_setpc_b64 s[30:31]
2451;
2452; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz:
2453; GFX940:       ; %bb.0:
2454; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2455; GFX940-NEXT:    flat_load_dword v5, v[0:1] offset:2044
2456; GFX940-NEXT:    s_mov_b64 s[0:1], 0
2457; GFX940-NEXT:  .LBB12_1: ; %atomicrmw.start
2458; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
2459; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2460; GFX940-NEXT:    v_sub_f32_e32 v4, v5, v2
2461; GFX940-NEXT:    buffer_wbl2 sc1
2462; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0
2463; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2464; GFX940-NEXT:    buffer_inv sc1
2465; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
2466; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2467; GFX940-NEXT:    v_mov_b32_e32 v5, v3
2468; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
2469; GFX940-NEXT:    s_cbranch_execnz .LBB12_1
2470; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
2471; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
2472; GFX940-NEXT:    s_setpc_b64 s[30:31]
2473;
2474; GFX11-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz:
2475; GFX11:       ; %bb.0:
2476; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2477; GFX11-NEXT:    flat_load_b32 v4, v[0:1] offset:2044
2478; GFX11-NEXT:    s_mov_b32 s0, 0
2479; GFX11-NEXT:  .LBB12_1: ; %atomicrmw.start
2480; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
2481; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2482; GFX11-NEXT:    v_sub_f32_e32 v3, v4, v2
2483; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2484; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
2485; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2486; GFX11-NEXT:    buffer_gl1_inv
2487; GFX11-NEXT:    buffer_gl0_inv
2488; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
2489; GFX11-NEXT:    v_mov_b32_e32 v4, v3
2490; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
2491; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2492; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
2493; GFX11-NEXT:    s_cbranch_execnz .LBB12_1
2494; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
2495; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
2496; GFX11-NEXT:    s_setpc_b64 s[30:31]
2497;
2498; GFX10-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz:
2499; GFX10:       ; %bb.0:
2500; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2501; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
2502; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2503; GFX10-NEXT:    s_mov_b32 s4, 0
2504; GFX10-NEXT:    flat_load_dword v4, v[0:1]
2505; GFX10-NEXT:  .LBB12_1: ; %atomicrmw.start
2506; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
2507; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2508; GFX10-NEXT:    v_sub_f32_e32 v3, v4, v2
2509; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2510; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
2511; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2512; GFX10-NEXT:    buffer_gl1_inv
2513; GFX10-NEXT:    buffer_gl0_inv
2514; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
2515; GFX10-NEXT:    v_mov_b32_e32 v4, v3
2516; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
2517; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
2518; GFX10-NEXT:    s_cbranch_execnz .LBB12_1
2519; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
2520; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2521; GFX10-NEXT:    s_setpc_b64 s[30:31]
2522;
2523; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz:
2524; GFX90A:       ; %bb.0:
2525; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2526; GFX90A-NEXT:    flat_load_dword v5, v[0:1] offset:2044
2527; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
2528; GFX90A-NEXT:  .LBB12_1: ; %atomicrmw.start
2529; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
2530; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2531; GFX90A-NEXT:    v_sub_f32_e32 v4, v5, v2
2532; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
2533; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2534; GFX90A-NEXT:    buffer_wbinvl1
2535; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
2536; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2537; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
2538; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2539; GFX90A-NEXT:    s_cbranch_execnz .LBB12_1
2540; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
2541; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
2542; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2543;
2544; GFX908-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz:
2545; GFX908:       ; %bb.0:
2546; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2547; GFX908-NEXT:    flat_load_dword v4, v[0:1] offset:2044
2548; GFX908-NEXT:    s_mov_b64 s[4:5], 0
2549; GFX908-NEXT:  .LBB12_1: ; %atomicrmw.start
2550; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
2551; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2552; GFX908-NEXT:    v_sub_f32_e32 v3, v4, v2
2553; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
2554; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2555; GFX908-NEXT:    buffer_wbinvl1
2556; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
2557; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2558; GFX908-NEXT:    v_mov_b32_e32 v4, v3
2559; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2560; GFX908-NEXT:    s_cbranch_execnz .LBB12_1
2561; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
2562; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
2563; GFX908-NEXT:    s_setpc_b64 s[30:31]
2564;
2565; GFX8-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz:
2566; GFX8:       ; %bb.0:
2567; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2568; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
2569; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2570; GFX8-NEXT:    flat_load_dword v4, v[0:1]
2571; GFX8-NEXT:    s_mov_b64 s[4:5], 0
2572; GFX8-NEXT:  .LBB12_1: ; %atomicrmw.start
2573; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
2574; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2575; GFX8-NEXT:    v_sub_f32_e32 v3, v4, v2
2576; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
2577; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2578; GFX8-NEXT:    buffer_wbinvl1
2579; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
2580; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2581; GFX8-NEXT:    v_mov_b32_e32 v4, v3
2582; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2583; GFX8-NEXT:    s_cbranch_execnz .LBB12_1
2584; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
2585; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2586; GFX8-NEXT:    s_setpc_b64 s[30:31]
2587;
2588; GFX7-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz:
2589; GFX7:       ; %bb.0:
2590; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2591; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
2592; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2593; GFX7-NEXT:    flat_load_dword v4, v[0:1]
2594; GFX7-NEXT:    s_mov_b64 s[4:5], 0
2595; GFX7-NEXT:  .LBB12_1: ; %atomicrmw.start
2596; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
2597; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2598; GFX7-NEXT:    v_sub_f32_e32 v3, v4, v2
2599; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
2600; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2601; GFX7-NEXT:    buffer_wbinvl1
2602; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
2603; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2604; GFX7-NEXT:    v_mov_b32_e32 v4, v3
2605; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2606; GFX7-NEXT:    s_cbranch_execnz .LBB12_1
2607; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
2608; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
2609; GFX7-NEXT:    s_setpc_b64 s[30:31]
2610  %gep = getelementptr float, ptr %ptr, i64 511
2611  %unused = atomicrmw fsub ptr %gep, float %val syncscope("agent") seq_cst
2612  ret void
2613}
2614
2615define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr %ptr, float %val) #1 {
2616; GFX12-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz:
2617; GFX12:       ; %bb.0:
2618; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2619; GFX12-NEXT:    s_wait_expcnt 0x0
2620; GFX12-NEXT:    s_wait_samplecnt 0x0
2621; GFX12-NEXT:    s_wait_bvhcnt 0x0
2622; GFX12-NEXT:    s_wait_kmcnt 0x0
2623; GFX12-NEXT:    flat_load_b32 v4, v[0:1] offset:-2048
2624; GFX12-NEXT:    s_mov_b32 s0, 0
2625; GFX12-NEXT:  .LBB13_1: ; %atomicrmw.start
2626; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
2627; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2628; GFX12-NEXT:    v_sub_f32_e32 v3, v4, v2
2629; GFX12-NEXT:    s_wait_storecnt 0x0
2630; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
2631; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2632; GFX12-NEXT:    global_inv scope:SCOPE_DEV
2633; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
2634; GFX12-NEXT:    v_mov_b32_e32 v4, v3
2635; GFX12-NEXT:    s_wait_alu 0xfffe
2636; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
2637; GFX12-NEXT:    s_wait_alu 0xfffe
2638; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
2639; GFX12-NEXT:    s_cbranch_execnz .LBB13_1
2640; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
2641; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
2642; GFX12-NEXT:    s_wait_alu 0xfffe
2643; GFX12-NEXT:    s_setpc_b64 s[30:31]
2644;
2645; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz:
2646; GFX940:       ; %bb.0:
2647; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2648; GFX940-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
2649; GFX940-NEXT:    s_movk_i32 s0, 0xf800
2650; GFX940-NEXT:    s_nop 0
2651; GFX940-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
2652; GFX940-NEXT:    flat_load_dword v5, v[4:5]
2653; GFX940-NEXT:    s_mov_b32 s1, -1
2654; GFX940-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
2655; GFX940-NEXT:    s_mov_b64 s[0:1], 0
2656; GFX940-NEXT:  .LBB13_1: ; %atomicrmw.start
2657; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
2658; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2659; GFX940-NEXT:    v_sub_f32_e32 v4, v5, v2
2660; GFX940-NEXT:    buffer_wbl2 sc1
2661; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
2662; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2663; GFX940-NEXT:    buffer_inv sc1
2664; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
2665; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2666; GFX940-NEXT:    v_mov_b32_e32 v5, v3
2667; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
2668; GFX940-NEXT:    s_cbranch_execnz .LBB13_1
2669; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
2670; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
2671; GFX940-NEXT:    s_setpc_b64 s[30:31]
2672;
2673; GFX11-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz:
2674; GFX11:       ; %bb.0:
2675; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2676; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
2677; GFX11-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
2678; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
2679; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
2680; GFX11-NEXT:    flat_load_b32 v4, v[3:4]
2681; GFX11-NEXT:    s_mov_b32 s0, 0
2682; GFX11-NEXT:  .LBB13_1: ; %atomicrmw.start
2683; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
2684; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2685; GFX11-NEXT:    v_sub_f32_e32 v3, v4, v2
2686; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2687; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
2688; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2689; GFX11-NEXT:    buffer_gl1_inv
2690; GFX11-NEXT:    buffer_gl0_inv
2691; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
2692; GFX11-NEXT:    v_mov_b32_e32 v4, v3
2693; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
2694; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2695; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
2696; GFX11-NEXT:    s_cbranch_execnz .LBB13_1
2697; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
2698; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
2699; GFX11-NEXT:    s_setpc_b64 s[30:31]
2700;
2701; GFX10-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz:
2702; GFX10:       ; %bb.0:
2703; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2704; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
2705; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
2706; GFX10-NEXT:    s_mov_b32 s4, 0
2707; GFX10-NEXT:    flat_load_dword v4, v[0:1]
2708; GFX10-NEXT:  .LBB13_1: ; %atomicrmw.start
2709; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
2710; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2711; GFX10-NEXT:    v_sub_f32_e32 v3, v4, v2
2712; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2713; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
2714; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2715; GFX10-NEXT:    buffer_gl1_inv
2716; GFX10-NEXT:    buffer_gl0_inv
2717; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
2718; GFX10-NEXT:    v_mov_b32_e32 v4, v3
2719; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
2720; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
2721; GFX10-NEXT:    s_cbranch_execnz .LBB13_1
2722; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
2723; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2724; GFX10-NEXT:    s_setpc_b64 s[30:31]
2725;
2726; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz:
2727; GFX90A:       ; %bb.0:
2728; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2729; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
2730; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
2731; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
2732; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
2733; GFX90A-NEXT:    flat_load_dword v1, v[0:1]
2734; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
2735; GFX90A-NEXT:  .LBB13_1: ; %atomicrmw.start
2736; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
2737; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2738; GFX90A-NEXT:    v_sub_f32_e32 v0, v1, v2
2739; GFX90A-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
2740; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2741; GFX90A-NEXT:    buffer_wbinvl1
2742; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
2743; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2744; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
2745; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2746; GFX90A-NEXT:    s_cbranch_execnz .LBB13_1
2747; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
2748; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
2749; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2750;
2751; GFX908-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz:
2752; GFX908:       ; %bb.0:
2753; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2754; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
2755; GFX908-NEXT:    v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
2756; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
2757; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
2758; GFX908-NEXT:    flat_load_dword v1, v[0:1]
2759; GFX908-NEXT:    s_mov_b64 s[4:5], 0
2760; GFX908-NEXT:  .LBB13_1: ; %atomicrmw.start
2761; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
2762; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2763; GFX908-NEXT:    v_sub_f32_e32 v0, v1, v2
2764; GFX908-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
2765; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2766; GFX908-NEXT:    buffer_wbinvl1
2767; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
2768; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2769; GFX908-NEXT:    v_mov_b32_e32 v1, v0
2770; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2771; GFX908-NEXT:    s_cbranch_execnz .LBB13_1
2772; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
2773; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
2774; GFX908-NEXT:    s_setpc_b64 s[30:31]
2775;
2776; GFX8-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz:
2777; GFX8:       ; %bb.0:
2778; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2779; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
2780; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
2781; GFX8-NEXT:    flat_load_dword v4, v[0:1]
2782; GFX8-NEXT:    s_mov_b64 s[4:5], 0
2783; GFX8-NEXT:  .LBB13_1: ; %atomicrmw.start
2784; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
2785; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2786; GFX8-NEXT:    v_sub_f32_e32 v3, v4, v2
2787; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
2788; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2789; GFX8-NEXT:    buffer_wbinvl1
2790; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
2791; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2792; GFX8-NEXT:    v_mov_b32_e32 v4, v3
2793; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2794; GFX8-NEXT:    s_cbranch_execnz .LBB13_1
2795; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
2796; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2797; GFX8-NEXT:    s_setpc_b64 s[30:31]
2798;
2799; GFX7-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz:
2800; GFX7:       ; %bb.0:
2801; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2802; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0xfffff800, v0
2803; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
2804; GFX7-NEXT:    flat_load_dword v4, v[0:1]
2805; GFX7-NEXT:    s_mov_b64 s[4:5], 0
2806; GFX7-NEXT:  .LBB13_1: ; %atomicrmw.start
2807; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
2808; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2809; GFX7-NEXT:    v_sub_f32_e32 v3, v4, v2
2810; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
2811; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2812; GFX7-NEXT:    buffer_wbinvl1
2813; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
2814; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2815; GFX7-NEXT:    v_mov_b32_e32 v4, v3
2816; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2817; GFX7-NEXT:    s_cbranch_execnz .LBB13_1
2818; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
2819; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
2820; GFX7-NEXT:    s_setpc_b64 s[30:31]
2821  %gep = getelementptr float, ptr %ptr, i64 -512
2822  %unused = atomicrmw fsub ptr %gep, float %val syncscope("agent") seq_cst
2823  ret void
2824}
2825
; Returning f32 atomicrmw fsub with no syncscope (the checks show system-scope
; cache control, e.g. SCOPE_SYS / sc0 sc1) at a positive offset of 511 floats,
; i.e. 2044 bytes (0x7fc). Every checked target expands the operation into a
; flat_atomic_cmpswap retry loop (%atomicrmw.start) and returns the value that
; was in memory before the successful swap. Some targets encode the offset as
; offset:2044 on the load and cmpswap; the others add 0x7fc to the 64-bit
; pointer up front.
; Attribute group #1 is defined outside this excerpt - presumably it selects
; the flush-denormals mode the "__ftz" suffix refers to; TODO confirm.
2826define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float %val) #1 {
2827; GFX12-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz:
2828; GFX12:       ; %bb.0:
2829; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2830; GFX12-NEXT:    s_wait_expcnt 0x0
2831; GFX12-NEXT:    s_wait_samplecnt 0x0
2832; GFX12-NEXT:    s_wait_bvhcnt 0x0
2833; GFX12-NEXT:    s_wait_kmcnt 0x0
2834; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
2835; GFX12-NEXT:    s_mov_b32 s0, 0
2836; GFX12-NEXT:  .LBB14_1: ; %atomicrmw.start
2837; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
2838; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2839; GFX12-NEXT:    v_mov_b32_e32 v4, v3
2840; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2841; GFX12-NEXT:    v_sub_f32_e32 v3, v4, v2
2842; GFX12-NEXT:    global_wb scope:SCOPE_SYS
2843; GFX12-NEXT:    s_wait_storecnt 0x0
2844; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
2845; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2846; GFX12-NEXT:    global_inv scope:SCOPE_SYS
2847; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
2848; GFX12-NEXT:    s_wait_alu 0xfffe
2849; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
2850; GFX12-NEXT:    s_wait_alu 0xfffe
2851; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
2852; GFX12-NEXT:    s_cbranch_execnz .LBB14_1
2853; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
2854; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
2855; GFX12-NEXT:    v_mov_b32_e32 v0, v3
2856; GFX12-NEXT:    s_wait_alu 0xfffe
2857; GFX12-NEXT:    s_setpc_b64 s[30:31]
2858;
2859; GFX940-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz:
2860; GFX940:       ; %bb.0:
2861; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2862; GFX940-NEXT:    flat_load_dword v3, v[0:1] offset:2044
2863; GFX940-NEXT:    s_mov_b64 s[0:1], 0
2864; GFX940-NEXT:  .LBB14_1: ; %atomicrmw.start
2865; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
2866; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2867; GFX940-NEXT:    v_mov_b32_e32 v5, v3
2868; GFX940-NEXT:    v_sub_f32_e32 v4, v5, v2
2869; GFX940-NEXT:    buffer_wbl2 sc0 sc1
2870; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1
2871; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2872; GFX940-NEXT:    buffer_inv sc0 sc1
2873; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
2874; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2875; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
2876; GFX940-NEXT:    s_cbranch_execnz .LBB14_1
2877; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
2878; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
2879; GFX940-NEXT:    v_mov_b32_e32 v0, v3
2880; GFX940-NEXT:    s_setpc_b64 s[30:31]
2881;
2882; GFX11-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz:
2883; GFX11:       ; %bb.0:
2884; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2885; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
2886; GFX11-NEXT:    s_mov_b32 s0, 0
2887; GFX11-NEXT:  .LBB14_1: ; %atomicrmw.start
2888; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
2889; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2890; GFX11-NEXT:    v_mov_b32_e32 v4, v3
2891; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2892; GFX11-NEXT:    v_sub_f32_e32 v3, v4, v2
2893; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2894; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
2895; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2896; GFX11-NEXT:    buffer_gl1_inv
2897; GFX11-NEXT:    buffer_gl0_inv
2898; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
2899; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
2900; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2901; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
2902; GFX11-NEXT:    s_cbranch_execnz .LBB14_1
2903; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
2904; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
2905; GFX11-NEXT:    v_mov_b32_e32 v0, v3
2906; GFX11-NEXT:    s_setpc_b64 s[30:31]
2907;
2908; GFX10-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz:
2909; GFX10:       ; %bb.0:
2910; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2911; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fc, v0
2912; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
2913; GFX10-NEXT:    s_mov_b32 s4, 0
2914; GFX10-NEXT:    flat_load_dword v0, v[3:4]
2915; GFX10-NEXT:  .LBB14_1: ; %atomicrmw.start
2916; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
2917; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2918; GFX10-NEXT:    v_mov_b32_e32 v1, v0
2919; GFX10-NEXT:    v_sub_f32_e32 v0, v1, v2
2920; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2921; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
2922; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2923; GFX10-NEXT:    buffer_gl1_inv
2924; GFX10-NEXT:    buffer_gl0_inv
2925; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
2926; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
2927; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
2928; GFX10-NEXT:    s_cbranch_execnz .LBB14_1
2929; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
2930; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2931; GFX10-NEXT:    s_setpc_b64 s[30:31]
2932;
2933; GFX90A-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz:
2934; GFX90A:       ; %bb.0:
2935; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2936; GFX90A-NEXT:    flat_load_dword v3, v[0:1] offset:2044
2937; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
2938; GFX90A-NEXT:  .LBB14_1: ; %atomicrmw.start
2939; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
2940; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2941; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
2942; GFX90A-NEXT:    v_sub_f32_e32 v4, v5, v2
2943; GFX90A-NEXT:    buffer_wbl2
2944; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
2945; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2946; GFX90A-NEXT:    buffer_invl2
2947; GFX90A-NEXT:    buffer_wbinvl1
2948; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
2949; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2950; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2951; GFX90A-NEXT:    s_cbranch_execnz .LBB14_1
2952; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
2953; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
2954; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
2955; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2956;
2957; GFX908-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz:
2958; GFX908:       ; %bb.0:
2959; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2960; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2044
2961; GFX908-NEXT:    s_mov_b64 s[4:5], 0
2962; GFX908-NEXT:  .LBB14_1: ; %atomicrmw.start
2963; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
2964; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2965; GFX908-NEXT:    v_mov_b32_e32 v4, v3
2966; GFX908-NEXT:    v_sub_f32_e32 v3, v4, v2
2967; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
2968; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2969; GFX908-NEXT:    buffer_wbinvl1
2970; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
2971; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2972; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2973; GFX908-NEXT:    s_cbranch_execnz .LBB14_1
2974; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
2975; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
2976; GFX908-NEXT:    v_mov_b32_e32 v0, v3
2977; GFX908-NEXT:    s_setpc_b64 s[30:31]
2978;
2979; GFX8-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz:
2980; GFX8:       ; %bb.0:
2981; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2982; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
2983; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
2984; GFX8-NEXT:    flat_load_dword v0, v[3:4]
2985; GFX8-NEXT:    s_mov_b64 s[4:5], 0
2986; GFX8-NEXT:  .LBB14_1: ; %atomicrmw.start
2987; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
2988; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2989; GFX8-NEXT:    v_mov_b32_e32 v1, v0
2990; GFX8-NEXT:    v_sub_f32_e32 v0, v1, v2
2991; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
2992; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2993; GFX8-NEXT:    buffer_wbinvl1
2994; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
2995; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2996; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2997; GFX8-NEXT:    s_cbranch_execnz .LBB14_1
2998; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
2999; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
3000; GFX8-NEXT:    s_setpc_b64 s[30:31]
3001;
3002; GFX7-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz:
3003; GFX7:       ; %bb.0:
3004; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3005; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 0x7fc, v0
3006; GFX7-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
3007; GFX7-NEXT:    flat_load_dword v0, v[3:4]
3008; GFX7-NEXT:    s_mov_b64 s[4:5], 0
3009; GFX7-NEXT:  .LBB14_1: ; %atomicrmw.start
3010; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
3011; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3012; GFX7-NEXT:    v_mov_b32_e32 v1, v0
3013; GFX7-NEXT:    v_sub_f32_e32 v0, v1, v2
3014; GFX7-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
3015; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3016; GFX7-NEXT:    buffer_wbinvl1
3017; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
3018; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3019; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3020; GFX7-NEXT:    s_cbranch_execnz .LBB14_1
3021; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
3022; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
3023; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; Index 511 of a float array is byte offset 511 * 4 = 2044 (0x7fc), the
  ; constant that appears in the assertions above.
3024  %gep = getelementptr float, ptr %ptr, i64 511
  ; No syncscope operand on the atomicrmw; %result (the previous memory
  ; contents) is the function's return value.
3025  %result = atomicrmw fsub ptr %gep, float %val seq_cst
3026  ret float %result
3027}
3028
; No-return variant of the system-scope f32 atomicrmw fsub at a positive
; offset of 511 floats, i.e. 2044 bytes (0x7fc). The atomicrmw result is
; unused and the function returns void, yet every checked target still emits
; the returning form of flat_atomic_cmpswap (glc / sc0 / TH_ATOMIC_RETURN)
; inside the %atomicrmw.start loop, because the loop compares the value read
; back against the expected one (v_cmp_eq_u32) to decide whether to retry.
; Some targets encode the offset as offset:2044; the others add 0x7fc to the
; 64-bit pointer up front.
; Attribute group #1 is defined outside this excerpt - presumably it selects
; the flush-denormals mode the "__ftz" suffix refers to; TODO confirm.
3029define void @flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, float %val) #1 {
3030; GFX12-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz:
3031; GFX12:       ; %bb.0:
3032; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
3033; GFX12-NEXT:    s_wait_expcnt 0x0
3034; GFX12-NEXT:    s_wait_samplecnt 0x0
3035; GFX12-NEXT:    s_wait_bvhcnt 0x0
3036; GFX12-NEXT:    s_wait_kmcnt 0x0
3037; GFX12-NEXT:    flat_load_b32 v4, v[0:1] offset:2044
3038; GFX12-NEXT:    s_mov_b32 s0, 0
3039; GFX12-NEXT:  .LBB15_1: ; %atomicrmw.start
3040; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
3041; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
3042; GFX12-NEXT:    v_sub_f32_e32 v3, v4, v2
3043; GFX12-NEXT:    global_wb scope:SCOPE_SYS
3044; GFX12-NEXT:    s_wait_storecnt 0x0
3045; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
3046; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
3047; GFX12-NEXT:    global_inv scope:SCOPE_SYS
3048; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
3049; GFX12-NEXT:    v_mov_b32_e32 v4, v3
3050; GFX12-NEXT:    s_wait_alu 0xfffe
3051; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
3052; GFX12-NEXT:    s_wait_alu 0xfffe
3053; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
3054; GFX12-NEXT:    s_cbranch_execnz .LBB15_1
3055; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
3056; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
3057; GFX12-NEXT:    s_wait_alu 0xfffe
3058; GFX12-NEXT:    s_setpc_b64 s[30:31]
3059;
3060; GFX940-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz:
3061; GFX940:       ; %bb.0:
3062; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3063; GFX940-NEXT:    flat_load_dword v5, v[0:1] offset:2044
3064; GFX940-NEXT:    s_mov_b64 s[0:1], 0
3065; GFX940-NEXT:  .LBB15_1: ; %atomicrmw.start
3066; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
3067; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3068; GFX940-NEXT:    v_sub_f32_e32 v4, v5, v2
3069; GFX940-NEXT:    buffer_wbl2 sc0 sc1
3070; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1
3071; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3072; GFX940-NEXT:    buffer_inv sc0 sc1
3073; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
3074; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
3075; GFX940-NEXT:    v_mov_b32_e32 v5, v3
3076; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
3077; GFX940-NEXT:    s_cbranch_execnz .LBB15_1
3078; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
3079; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
3080; GFX940-NEXT:    s_setpc_b64 s[30:31]
3081;
3082; GFX11-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz:
3083; GFX11:       ; %bb.0:
3084; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3085; GFX11-NEXT:    flat_load_b32 v4, v[0:1] offset:2044
3086; GFX11-NEXT:    s_mov_b32 s0, 0
3087; GFX11-NEXT:  .LBB15_1: ; %atomicrmw.start
3088; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
3089; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3090; GFX11-NEXT:    v_sub_f32_e32 v3, v4, v2
3091; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3092; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
3093; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3094; GFX11-NEXT:    buffer_gl1_inv
3095; GFX11-NEXT:    buffer_gl0_inv
3096; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
3097; GFX11-NEXT:    v_mov_b32_e32 v4, v3
3098; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
3099; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
3100; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
3101; GFX11-NEXT:    s_cbranch_execnz .LBB15_1
3102; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
3103; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
3104; GFX11-NEXT:    s_setpc_b64 s[30:31]
3105;
3106; GFX10-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz:
3107; GFX10:       ; %bb.0:
3108; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3109; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
3110; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
3111; GFX10-NEXT:    s_mov_b32 s4, 0
3112; GFX10-NEXT:    flat_load_dword v4, v[0:1]
3113; GFX10-NEXT:  .LBB15_1: ; %atomicrmw.start
3114; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
3115; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3116; GFX10-NEXT:    v_sub_f32_e32 v3, v4, v2
3117; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3118; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
3119; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3120; GFX10-NEXT:    buffer_gl1_inv
3121; GFX10-NEXT:    buffer_gl0_inv
3122; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
3123; GFX10-NEXT:    v_mov_b32_e32 v4, v3
3124; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
3125; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
3126; GFX10-NEXT:    s_cbranch_execnz .LBB15_1
3127; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
3128; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3129; GFX10-NEXT:    s_setpc_b64 s[30:31]
3130;
3131; GFX90A-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz:
3132; GFX90A:       ; %bb.0:
3133; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3134; GFX90A-NEXT:    flat_load_dword v5, v[0:1] offset:2044
3135; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
3136; GFX90A-NEXT:  .LBB15_1: ; %atomicrmw.start
3137; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
3138; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3139; GFX90A-NEXT:    v_sub_f32_e32 v4, v5, v2
3140; GFX90A-NEXT:    buffer_wbl2
3141; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
3142; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3143; GFX90A-NEXT:    buffer_invl2
3144; GFX90A-NEXT:    buffer_wbinvl1
3145; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
3146; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3147; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
3148; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3149; GFX90A-NEXT:    s_cbranch_execnz .LBB15_1
3150; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
3151; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
3152; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3153;
3154; GFX908-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz:
3155; GFX908:       ; %bb.0:
3156; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3157; GFX908-NEXT:    flat_load_dword v4, v[0:1] offset:2044
3158; GFX908-NEXT:    s_mov_b64 s[4:5], 0
3159; GFX908-NEXT:  .LBB15_1: ; %atomicrmw.start
3160; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
3161; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3162; GFX908-NEXT:    v_sub_f32_e32 v3, v4, v2
3163; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
3164; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3165; GFX908-NEXT:    buffer_wbinvl1
3166; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
3167; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3168; GFX908-NEXT:    v_mov_b32_e32 v4, v3
3169; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3170; GFX908-NEXT:    s_cbranch_execnz .LBB15_1
3171; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
3172; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
3173; GFX908-NEXT:    s_setpc_b64 s[30:31]
3174;
3175; GFX8-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz:
3176; GFX8:       ; %bb.0:
3177; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3178; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
3179; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3180; GFX8-NEXT:    flat_load_dword v4, v[0:1]
3181; GFX8-NEXT:    s_mov_b64 s[4:5], 0
3182; GFX8-NEXT:  .LBB15_1: ; %atomicrmw.start
3183; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
3184; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3185; GFX8-NEXT:    v_sub_f32_e32 v3, v4, v2
3186; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
3187; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3188; GFX8-NEXT:    buffer_wbinvl1
3189; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
3190; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3191; GFX8-NEXT:    v_mov_b32_e32 v4, v3
3192; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3193; GFX8-NEXT:    s_cbranch_execnz .LBB15_1
3194; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
3195; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
3196; GFX8-NEXT:    s_setpc_b64 s[30:31]
3197;
3198; GFX7-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz:
3199; GFX7:       ; %bb.0:
3200; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3201; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
3202; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3203; GFX7-NEXT:    flat_load_dword v4, v[0:1]
3204; GFX7-NEXT:    s_mov_b64 s[4:5], 0
3205; GFX7-NEXT:  .LBB15_1: ; %atomicrmw.start
3206; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
3207; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3208; GFX7-NEXT:    v_sub_f32_e32 v3, v4, v2
3209; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
3210; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3211; GFX7-NEXT:    buffer_wbinvl1
3212; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
3213; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3214; GFX7-NEXT:    v_mov_b32_e32 v4, v3
3215; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3216; GFX7-NEXT:    s_cbranch_execnz .LBB15_1
3217; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
3218; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
3219; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; Index 511 of a float array is byte offset 511 * 4 = 2044 (0x7fc), the
  ; constant that appears in the assertions above.
3220  %gep = getelementptr float, ptr %ptr, i64 511
  ; The old value is deliberately left unused; this is the no-return form.
3221  %unused = atomicrmw fsub ptr %gep, float %val seq_cst
3222  ret void
3223}
3224
3225; --------------------------------------------------------------------
3226; double
3227; --------------------------------------------------------------------
3228
3229define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 {
3230; GFX12-LABEL: flat_agent_atomic_fsub_ret_f64:
3231; GFX12:       ; %bb.0:
3232; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
3233; GFX12-NEXT:    s_wait_expcnt 0x0
3234; GFX12-NEXT:    s_wait_samplecnt 0x0
3235; GFX12-NEXT:    s_wait_bvhcnt 0x0
3236; GFX12-NEXT:    s_wait_kmcnt 0x0
3237; GFX12-NEXT:    s_mov_b64 s[0:1], src_private_base
3238; GFX12-NEXT:    s_mov_b32 s0, exec_lo
3239; GFX12-NEXT:    ; implicit-def: $vgpr4_vgpr5
3240; GFX12-NEXT:    s_wait_alu 0xfffe
3241; GFX12-NEXT:    v_cmpx_ne_u32_e64 s1, v1
3242; GFX12-NEXT:    s_xor_b32 s0, exec_lo, s0
3243; GFX12-NEXT:    s_cbranch_execz .LBB16_4
3244; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
3245; GFX12-NEXT:    flat_load_b64 v[4:5], v[0:1]
3246; GFX12-NEXT:    s_mov_b32 s1, 0
3247; GFX12-NEXT:  .LBB16_2: ; %atomicrmw.start
3248; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
3249; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
3250; GFX12-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
3251; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3252; GFX12-NEXT:    v_add_f64_e64 v[4:5], v[6:7], -v[2:3]
3253; GFX12-NEXT:    s_wait_storecnt 0x0
3254; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
3255; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
3256; GFX12-NEXT:    global_inv scope:SCOPE_DEV
3257; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
3258; GFX12-NEXT:    s_wait_alu 0xfffe
3259; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
3260; GFX12-NEXT:    s_wait_alu 0xfffe
3261; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
3262; GFX12-NEXT:    s_cbranch_execnz .LBB16_2
3263; GFX12-NEXT:  ; %bb.3: ; %Flow
3264; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
3265; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
3266; GFX12-NEXT:    ; implicit-def: $vgpr2_vgpr3
3267; GFX12-NEXT:  .LBB16_4: ; %Flow3
3268; GFX12-NEXT:    s_wait_alu 0xfffe
3269; GFX12-NEXT:    s_and_not1_saveexec_b32 s0, s0
3270; GFX12-NEXT:    s_cbranch_execz .LBB16_6
3271; GFX12-NEXT:  ; %bb.5: ; %atomicrmw.private
3272; GFX12-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
3273; GFX12-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
3274; GFX12-NEXT:    scratch_load_b64 v[4:5], v6, off
3275; GFX12-NEXT:    s_wait_loadcnt 0x0
3276; GFX12-NEXT:    v_add_f64_e64 v[0:1], v[4:5], -v[2:3]
3277; GFX12-NEXT:    scratch_store_b64 v6, v[0:1], off
3278; GFX12-NEXT:  .LBB16_6: ; %atomicrmw.phi
3279; GFX12-NEXT:    s_wait_alu 0xfffe
3280; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
3281; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
3282; GFX12-NEXT:    s_wait_alu 0xfffe
3283; GFX12-NEXT:    s_setpc_b64 s[30:31]
3284;
3285; GFX940-LABEL: flat_agent_atomic_fsub_ret_f64:
3286; GFX940:       ; %bb.0:
3287; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3288; GFX940-NEXT:    s_mov_b64 s[0:1], src_private_base
3289; GFX940-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
3290; GFX940-NEXT:    ; implicit-def: $vgpr4_vgpr5
3291; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], vcc
3292; GFX940-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
3293; GFX940-NEXT:    s_cbranch_execz .LBB16_4
3294; GFX940-NEXT:  ; %bb.1: ; %atomicrmw.global
3295; GFX940-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
3296; GFX940-NEXT:    s_mov_b64 s[2:3], 0
3297; GFX940-NEXT:  .LBB16_2: ; %atomicrmw.start
3298; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
3299; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3300; GFX940-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
3301; GFX940-NEXT:    v_add_f64 v[4:5], v[6:7], -v[2:3]
3302; GFX940-NEXT:    buffer_wbl2 sc1
3303; GFX940-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0
3304; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3305; GFX940-NEXT:    buffer_inv sc1
3306; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
3307; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
3308; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
3309; GFX940-NEXT:    s_cbranch_execnz .LBB16_2
3310; GFX940-NEXT:  ; %bb.3: ; %Flow
3311; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
3312; GFX940-NEXT:    ; implicit-def: $vgpr0_vgpr1
3313; GFX940-NEXT:    ; implicit-def: $vgpr2_vgpr3
3314; GFX940-NEXT:  .LBB16_4: ; %Flow3
3315; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
3316; GFX940-NEXT:    s_cbranch_execz .LBB16_6
3317; GFX940-NEXT:  ; %bb.5: ; %atomicrmw.private
3318; GFX940-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
3319; GFX940-NEXT:    s_nop 1
3320; GFX940-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc
3321; GFX940-NEXT:    scratch_load_dwordx2 v[4:5], v6, off
3322; GFX940-NEXT:    s_waitcnt vmcnt(0)
3323; GFX940-NEXT:    v_add_f64 v[0:1], v[4:5], -v[2:3]
3324; GFX940-NEXT:    scratch_store_dwordx2 v6, v[0:1], off sc0 sc1
3325; GFX940-NEXT:  .LBB16_6: ; %atomicrmw.phi
3326; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
3327; GFX940-NEXT:    v_mov_b32_e32 v0, v4
3328; GFX940-NEXT:    v_mov_b32_e32 v1, v5
3329; GFX940-NEXT:    s_waitcnt vmcnt(0)
3330; GFX940-NEXT:    s_setpc_b64 s[30:31]
3331;
3332; GFX11-LABEL: flat_agent_atomic_fsub_ret_f64:
3333; GFX11:       ; %bb.0:
3334; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3335; GFX11-NEXT:    s_mov_b64 s[0:1], src_private_base
3336; GFX11-NEXT:    s_mov_b32 s0, exec_lo
3337; GFX11-NEXT:    ; implicit-def: $vgpr4_vgpr5
3338; GFX11-NEXT:    v_cmpx_ne_u32_e64 s1, v1
3339; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
3340; GFX11-NEXT:    s_cbranch_execz .LBB16_4
3341; GFX11-NEXT:  ; %bb.1: ; %atomicrmw.global
3342; GFX11-NEXT:    flat_load_b64 v[4:5], v[0:1]
3343; GFX11-NEXT:    s_mov_b32 s1, 0
3344; GFX11-NEXT:  .LBB16_2: ; %atomicrmw.start
3345; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
3346; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3347; GFX11-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
3348; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3349; GFX11-NEXT:    v_add_f64 v[4:5], v[6:7], -v[2:3]
3350; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3351; GFX11-NEXT:    flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc
3352; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3353; GFX11-NEXT:    buffer_gl1_inv
3354; GFX11-NEXT:    buffer_gl0_inv
3355; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
3356; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
3357; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
3358; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
3359; GFX11-NEXT:    s_cbranch_execnz .LBB16_2
3360; GFX11-NEXT:  ; %bb.3: ; %Flow
3361; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
3362; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1
3363; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3
3364; GFX11-NEXT:  .LBB16_4: ; %Flow3
3365; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
3366; GFX11-NEXT:    s_cbranch_execz .LBB16_6
3367; GFX11-NEXT:  ; %bb.5: ; %atomicrmw.private
3368; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
3369; GFX11-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
3370; GFX11-NEXT:    scratch_load_b64 v[4:5], v6, off
3371; GFX11-NEXT:    s_waitcnt vmcnt(0)
3372; GFX11-NEXT:    v_add_f64 v[0:1], v[4:5], -v[2:3]
3373; GFX11-NEXT:    scratch_store_b64 v6, v[0:1], off
3374; GFX11-NEXT:  .LBB16_6: ; %atomicrmw.phi
3375; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
3376; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
3377; GFX11-NEXT:    s_setpc_b64 s[30:31]
3378;
3379; GFX10-LABEL: flat_agent_atomic_fsub_ret_f64:
3380; GFX10:       ; %bb.0:
3381; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3382; GFX10-NEXT:    s_mov_b64 s[4:5], src_private_base
3383; GFX10-NEXT:    ; implicit-def: $vgpr4_vgpr5
3384; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v1
3385; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
3386; GFX10-NEXT:    s_xor_b32 s4, exec_lo, s4
3387; GFX10-NEXT:    s_cbranch_execz .LBB16_4
3388; GFX10-NEXT:  ; %bb.1: ; %atomicrmw.global
3389; GFX10-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
3390; GFX10-NEXT:    s_mov_b32 s5, 0
3391; GFX10-NEXT:  .LBB16_2: ; %atomicrmw.start
3392; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
3393; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3394; GFX10-NEXT:    v_mov_b32_e32 v7, v5
3395; GFX10-NEXT:    v_mov_b32_e32 v6, v4
3396; GFX10-NEXT:    v_add_f64 v[4:5], v[6:7], -v[2:3]
3397; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3398; GFX10-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
3399; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3400; GFX10-NEXT:    buffer_gl1_inv
3401; GFX10-NEXT:    buffer_gl0_inv
3402; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
3403; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
3404; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
3405; GFX10-NEXT:    s_cbranch_execnz .LBB16_2
3406; GFX10-NEXT:  ; %bb.3: ; %Flow
3407; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
3408; GFX10-NEXT:    ; implicit-def: $vgpr0_vgpr1
3409; GFX10-NEXT:    ; implicit-def: $vgpr2_vgpr3
3410; GFX10-NEXT:  .LBB16_4: ; %Flow3
3411; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
3412; GFX10-NEXT:    s_cbranch_execz .LBB16_6
3413; GFX10-NEXT:  ; %bb.5: ; %atomicrmw.private
3414; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
3415; GFX10-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
3416; GFX10-NEXT:    s_clause 0x1
3417; GFX10-NEXT:    buffer_load_dword v4, v6, s[0:3], 0 offen
3418; GFX10-NEXT:    buffer_load_dword v5, v6, s[0:3], 0 offen offset:4
3419; GFX10-NEXT:    s_waitcnt vmcnt(0)
3420; GFX10-NEXT:    v_add_f64 v[0:1], v[4:5], -v[2:3]
3421; GFX10-NEXT:    buffer_store_dword v0, v6, s[0:3], 0 offen
3422; GFX10-NEXT:    buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
3423; GFX10-NEXT:  .LBB16_6: ; %atomicrmw.phi
3424; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
3425; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3426; GFX10-NEXT:    v_mov_b32_e32 v0, v4
3427; GFX10-NEXT:    v_mov_b32_e32 v1, v5
3428; GFX10-NEXT:    s_setpc_b64 s[30:31]
3429;
3430; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f64:
3431; GFX90A:       ; %bb.0:
3432; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3433; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
3434; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
3435; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
3436; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3437; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
3438; GFX90A-NEXT:    s_cbranch_execz .LBB16_4
3439; GFX90A-NEXT:  ; %bb.1: ; %atomicrmw.global
3440; GFX90A-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
3441; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
3442; GFX90A-NEXT:  .LBB16_2: ; %atomicrmw.start
3443; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
3444; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3445; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
3446; GFX90A-NEXT:    v_add_f64 v[4:5], v[6:7], -v[2:3]
3447; GFX90A-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
3448; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3449; GFX90A-NEXT:    buffer_wbinvl1
3450; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
3451; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
3452; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
3453; GFX90A-NEXT:    s_cbranch_execnz .LBB16_2
3454; GFX90A-NEXT:  ; %bb.3: ; %Flow
3455; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
3456; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
3457; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
3458; GFX90A-NEXT:  .LBB16_4: ; %Flow3
3459; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3460; GFX90A-NEXT:    s_cbranch_execz .LBB16_6
3461; GFX90A-NEXT:  ; %bb.5: ; %atomicrmw.private
3462; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
3463; GFX90A-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc
3464; GFX90A-NEXT:    buffer_load_dword v4, v6, s[0:3], 0 offen
3465; GFX90A-NEXT:    buffer_load_dword v5, v6, s[0:3], 0 offen offset:4
3466; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3467; GFX90A-NEXT:    v_add_f64 v[0:1], v[4:5], -v[2:3]
3468; GFX90A-NEXT:    buffer_store_dword v0, v6, s[0:3], 0 offen
3469; GFX90A-NEXT:    buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
3470; GFX90A-NEXT:  .LBB16_6: ; %atomicrmw.phi
3471; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
3472; GFX90A-NEXT:    v_mov_b32_e32 v0, v4
3473; GFX90A-NEXT:    v_mov_b32_e32 v1, v5
3474; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3475; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3476;
3477; GFX908-LABEL: flat_agent_atomic_fsub_ret_f64:
3478; GFX908:       ; %bb.0:
3479; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3480; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
3481; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
3482; GFX908-NEXT:    ; implicit-def: $vgpr4_vgpr5
3483; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3484; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
3485; GFX908-NEXT:    s_cbranch_execz .LBB16_4
3486; GFX908-NEXT:  ; %bb.1: ; %atomicrmw.global
3487; GFX908-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
3488; GFX908-NEXT:    s_mov_b64 s[6:7], 0
3489; GFX908-NEXT:  .LBB16_2: ; %atomicrmw.start
3490; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
3491; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3492; GFX908-NEXT:    v_mov_b32_e32 v7, v5
3493; GFX908-NEXT:    v_mov_b32_e32 v6, v4
3494; GFX908-NEXT:    v_add_f64 v[4:5], v[6:7], -v[2:3]
3495; GFX908-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
3496; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3497; GFX908-NEXT:    buffer_wbinvl1
3498; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
3499; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
3500; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
3501; GFX908-NEXT:    s_cbranch_execnz .LBB16_2
3502; GFX908-NEXT:  ; %bb.3: ; %Flow
3503; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
3504; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
3505; GFX908-NEXT:    ; implicit-def: $vgpr2_vgpr3
3506; GFX908-NEXT:  .LBB16_4: ; %Flow3
3507; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3508; GFX908-NEXT:    s_cbranch_execz .LBB16_6
3509; GFX908-NEXT:  ; %bb.5: ; %atomicrmw.private
3510; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
3511; GFX908-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc
3512; GFX908-NEXT:    buffer_load_dword v4, v6, s[0:3], 0 offen
3513; GFX908-NEXT:    buffer_load_dword v5, v6, s[0:3], 0 offen offset:4
3514; GFX908-NEXT:    s_waitcnt vmcnt(0)
3515; GFX908-NEXT:    v_add_f64 v[0:1], v[4:5], -v[2:3]
3516; GFX908-NEXT:    buffer_store_dword v0, v6, s[0:3], 0 offen
3517; GFX908-NEXT:    buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
3518; GFX908-NEXT:  .LBB16_6: ; %atomicrmw.phi
3519; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
3520; GFX908-NEXT:    v_mov_b32_e32 v0, v4
3521; GFX908-NEXT:    v_mov_b32_e32 v1, v5
3522; GFX908-NEXT:    s_waitcnt vmcnt(0)
3523; GFX908-NEXT:    s_setpc_b64 s[30:31]
3524;
3525; GFX8-LABEL: flat_agent_atomic_fsub_ret_f64:
3526; GFX8:       ; %bb.0:
3527; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3528; GFX8-NEXT:    s_mov_b64 s[4:5], 0xc0
3529; GFX8-NEXT:    s_load_dword s4, s[4:5], 0x0
3530; GFX8-NEXT:    ; implicit-def: $vgpr4_vgpr5
3531; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3532; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v1
3533; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3534; GFX8-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
3535; GFX8-NEXT:    s_cbranch_execz .LBB16_4
3536; GFX8-NEXT:  ; %bb.1: ; %atomicrmw.global
3537; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
3538; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
3539; GFX8-NEXT:    flat_load_dword v5, v[4:5]
3540; GFX8-NEXT:    flat_load_dword v4, v[0:1]
3541; GFX8-NEXT:    s_mov_b64 s[6:7], 0
3542; GFX8-NEXT:  .LBB16_2: ; %atomicrmw.start
3543; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
3544; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3545; GFX8-NEXT:    v_mov_b32_e32 v7, v5
3546; GFX8-NEXT:    v_mov_b32_e32 v6, v4
3547; GFX8-NEXT:    v_add_f64 v[4:5], v[6:7], -v[2:3]
3548; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
3549; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3550; GFX8-NEXT:    buffer_wbinvl1
3551; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
3552; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
3553; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
3554; GFX8-NEXT:    s_cbranch_execnz .LBB16_2
3555; GFX8-NEXT:  ; %bb.3: ; %Flow
3556; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
3557; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
3558; GFX8-NEXT:    ; implicit-def: $vgpr2_vgpr3
3559; GFX8-NEXT:  .LBB16_4: ; %Flow3
3560; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3561; GFX8-NEXT:    s_cbranch_execz .LBB16_6
3562; GFX8-NEXT:  ; %bb.5: ; %atomicrmw.private
3563; GFX8-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
3564; GFX8-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc
3565; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 4, v6
3566; GFX8-NEXT:    buffer_load_dword v4, v6, s[0:3], 0 offen
3567; GFX8-NEXT:    buffer_load_dword v5, v7, s[0:3], 0 offen
3568; GFX8-NEXT:    s_waitcnt vmcnt(0)
3569; GFX8-NEXT:    v_add_f64 v[0:1], v[4:5], -v[2:3]
3570; GFX8-NEXT:    buffer_store_dword v0, v6, s[0:3], 0 offen
3571; GFX8-NEXT:    buffer_store_dword v1, v7, s[0:3], 0 offen
3572; GFX8-NEXT:  .LBB16_6: ; %atomicrmw.phi
3573; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
3574; GFX8-NEXT:    v_mov_b32_e32 v0, v4
3575; GFX8-NEXT:    v_mov_b32_e32 v1, v5
3576; GFX8-NEXT:    s_waitcnt vmcnt(0)
3577; GFX8-NEXT:    s_setpc_b64 s[30:31]
3578;
3579; GFX7-LABEL: flat_agent_atomic_fsub_ret_f64:
3580; GFX7:       ; %bb.0:
3581; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3582; GFX7-NEXT:    s_mov_b64 s[4:5], 0xc0
3583; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
3584; GFX7-NEXT:    ; implicit-def: $vgpr4_vgpr5
3585; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3586; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v1
3587; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3588; GFX7-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
3589; GFX7-NEXT:    s_cbranch_execz .LBB16_4
3590; GFX7-NEXT:  ; %bb.1: ; %atomicrmw.global
3591; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 4, v0
3592; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
3593; GFX7-NEXT:    flat_load_dword v5, v[4:5]
3594; GFX7-NEXT:    flat_load_dword v4, v[0:1]
3595; GFX7-NEXT:    s_mov_b64 s[6:7], 0
3596; GFX7-NEXT:  .LBB16_2: ; %atomicrmw.start
3597; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
3598; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3599; GFX7-NEXT:    v_mov_b32_e32 v7, v5
3600; GFX7-NEXT:    v_mov_b32_e32 v6, v4
3601; GFX7-NEXT:    v_add_f64 v[4:5], v[6:7], -v[2:3]
3602; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
3603; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3604; GFX7-NEXT:    buffer_wbinvl1
3605; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
3606; GFX7-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
3607; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
3608; GFX7-NEXT:    s_cbranch_execnz .LBB16_2
3609; GFX7-NEXT:  ; %bb.3: ; %Flow
3610; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
3611; GFX7-NEXT:    ; implicit-def: $vgpr0_vgpr1
3612; GFX7-NEXT:    ; implicit-def: $vgpr2_vgpr3
3613; GFX7-NEXT:  .LBB16_4: ; %Flow3
3614; GFX7-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3615; GFX7-NEXT:    s_cbranch_execz .LBB16_6
3616; GFX7-NEXT:  ; %bb.5: ; %atomicrmw.private
3617; GFX7-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
3618; GFX7-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc
3619; GFX7-NEXT:    v_add_i32_e32 v7, vcc, 4, v6
3620; GFX7-NEXT:    buffer_load_dword v4, v6, s[0:3], 0 offen
3621; GFX7-NEXT:    buffer_load_dword v5, v7, s[0:3], 0 offen
3622; GFX7-NEXT:    s_waitcnt vmcnt(0)
3623; GFX7-NEXT:    v_add_f64 v[0:1], v[4:5], -v[2:3]
3624; GFX7-NEXT:    buffer_store_dword v0, v6, s[0:3], 0 offen
3625; GFX7-NEXT:    buffer_store_dword v1, v7, s[0:3], 0 offen
3626; GFX7-NEXT:  .LBB16_6: ; %atomicrmw.phi
3627; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
3628; GFX7-NEXT:    v_mov_b32_e32 v0, v4
3629; GFX7-NEXT:    v_mov_b32_e32 v1, v5
3630; GFX7-NEXT:    s_waitcnt vmcnt(0)
3631; GFX7-NEXT:    s_setpc_b64 s[30:31]
3632  %result = atomicrmw fsub ptr %ptr, double %val syncscope("agent") seq_cst
3633  ret double %result
3634}
3635
; Value-returning, agent-scope, seq_cst `atomicrmw fsub` of a double through a
; flat pointer addressed 255 doubles (2040 bytes = 0x7f8) past %ptr.
;
; The assertions inside this function are the expected llc output for each
; target named by the RUN lines at the top of the file. In every target the
; checked code has the same overall shape:
;   * 0x7f8 is added to the incoming 64-bit pointer;
;   * the high dword of that address is compared against a private-base value
;     (src_private_base, or a dword fetched with s_load_dword on the two
;     oldest targets) to choose one of two paths;
;   * %atomicrmw.global / %atomicrmw.start load the old value once and then
;     retry a flat_atomic_cmpswap of (old - %val) until the value returned by
;     the cmpswap equals the expected one;
;   * %atomicrmw.private does a plain scratch/buffer load, a v_add_f64 with
;     the negated %val, and a store back.
; The check lines are autogenerated (see the note on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
3636define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %val) #0 {
3637; GFX12-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos:
3638; GFX12:       ; %bb.0:
3639; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
3640; GFX12-NEXT:    s_wait_expcnt 0x0
3641; GFX12-NEXT:    s_wait_samplecnt 0x0
3642; GFX12-NEXT:    s_wait_bvhcnt 0x0
3643; GFX12-NEXT:    s_wait_kmcnt 0x0
3644; GFX12-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7f8, v0
3645; GFX12-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
3646; GFX12-NEXT:    s_mov_b64 s[0:1], src_private_base
3647; GFX12-NEXT:    s_mov_b32 s0, exec_lo
3648; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
3649; GFX12-NEXT:    s_wait_alu 0xfffe
3650; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3651; GFX12-NEXT:    v_cmpx_ne_u32_e64 s1, v5
3652; GFX12-NEXT:    s_xor_b32 s0, exec_lo, s0
3653; GFX12-NEXT:    s_cbranch_execnz .LBB17_3
3654; GFX12-NEXT:  ; %bb.1: ; %Flow3
3655; GFX12-NEXT:    s_wait_alu 0xfffe
3656; GFX12-NEXT:    s_and_not1_saveexec_b32 s0, s0
3657; GFX12-NEXT:    s_cbranch_execnz .LBB17_6
3658; GFX12-NEXT:  .LBB17_2: ; %atomicrmw.phi
3659; GFX12-NEXT:    s_wait_alu 0xfffe
3660; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
3661; GFX12-NEXT:    s_wait_alu 0xfffe
3662; GFX12-NEXT:    s_setpc_b64 s[30:31]
3663; GFX12-NEXT:  .LBB17_3: ; %atomicrmw.global
3664; GFX12-NEXT:    flat_load_b64 v[0:1], v[4:5]
3665; GFX12-NEXT:    s_mov_b32 s1, 0
3666; GFX12-NEXT:  .LBB17_4: ; %atomicrmw.start
3667; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
3668; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
3669; GFX12-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
3670; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3671; GFX12-NEXT:    v_add_f64_e64 v[6:7], v[8:9], -v[2:3]
3672; GFX12-NEXT:    s_wait_storecnt 0x0
3673; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
3674; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
3675; GFX12-NEXT:    global_inv scope:SCOPE_DEV
3676; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
3677; GFX12-NEXT:    s_wait_alu 0xfffe
3678; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
3679; GFX12-NEXT:    s_wait_alu 0xfffe
3680; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
3681; GFX12-NEXT:    s_cbranch_execnz .LBB17_4
3682; GFX12-NEXT:  ; %bb.5: ; %Flow
3683; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
3684; GFX12-NEXT:    ; implicit-def: $vgpr4_vgpr5
3685; GFX12-NEXT:    ; implicit-def: $vgpr2_vgpr3
3686; GFX12-NEXT:    s_and_not1_saveexec_b32 s0, s0
3687; GFX12-NEXT:    s_cbranch_execz .LBB17_2
3688; GFX12-NEXT:  .LBB17_6: ; %atomicrmw.private
3689; GFX12-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
3690; GFX12-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc_lo
3691; GFX12-NEXT:    scratch_load_b64 v[0:1], v4, off
3692; GFX12-NEXT:    s_wait_loadcnt 0x0
3693; GFX12-NEXT:    v_add_f64_e64 v[2:3], v[0:1], -v[2:3]
3694; GFX12-NEXT:    scratch_store_b64 v4, v[2:3], off
3695; GFX12-NEXT:    s_wait_alu 0xfffe
3696; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
3697; GFX12-NEXT:    s_wait_alu 0xfffe
3698; GFX12-NEXT:    s_setpc_b64 s[30:31]
3699;
3700; GFX940-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos:
3701; GFX940:       ; %bb.0:
3702; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3703; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7f8
3704; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
3705; GFX940-NEXT:    s_mov_b64 s[0:1], src_private_base
3706; GFX940-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
3707; GFX940-NEXT:    ; implicit-def: $vgpr0_vgpr1
3708; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], vcc
3709; GFX940-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
3710; GFX940-NEXT:    s_cbranch_execnz .LBB17_3
3711; GFX940-NEXT:  ; %bb.1: ; %Flow3
3712; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
3713; GFX940-NEXT:    s_cbranch_execnz .LBB17_6
3714; GFX940-NEXT:  .LBB17_2: ; %atomicrmw.phi
3715; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
3716; GFX940-NEXT:    s_setpc_b64 s[30:31]
3717; GFX940-NEXT:  .LBB17_3: ; %atomicrmw.global
3718; GFX940-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
3719; GFX940-NEXT:    s_mov_b64 s[2:3], 0
3720; GFX940-NEXT:  .LBB17_4: ; %atomicrmw.start
3721; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
3722; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3723; GFX940-NEXT:    v_mov_b64_e32 v[8:9], v[0:1]
3724; GFX940-NEXT:    v_add_f64 v[6:7], v[8:9], -v[2:3]
3725; GFX940-NEXT:    buffer_wbl2 sc1
3726; GFX940-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] sc0
3727; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3728; GFX940-NEXT:    buffer_inv sc1
3729; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
3730; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
3731; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
3732; GFX940-NEXT:    s_cbranch_execnz .LBB17_4
3733; GFX940-NEXT:  ; %bb.5: ; %Flow
3734; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
3735; GFX940-NEXT:    ; implicit-def: $vgpr4_vgpr5
3736; GFX940-NEXT:    ; implicit-def: $vgpr2_vgpr3
3737; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
3738; GFX940-NEXT:    s_cbranch_execz .LBB17_2
3739; GFX940-NEXT:  .LBB17_6: ; %atomicrmw.private
3740; GFX940-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
3741; GFX940-NEXT:    s_nop 1
3742; GFX940-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
3743; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v4, off
3744; GFX940-NEXT:    s_waitcnt vmcnt(0)
3745; GFX940-NEXT:    v_add_f64 v[2:3], v[0:1], -v[2:3]
3746; GFX940-NEXT:    scratch_store_dwordx2 v4, v[2:3], off sc0 sc1
3747; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
3748; GFX940-NEXT:    s_waitcnt vmcnt(0)
3749; GFX940-NEXT:    s_setpc_b64 s[30:31]
3750;
3751; GFX11-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos:
3752; GFX11:       ; %bb.0:
3753; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3754; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7f8, v0
3755; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
3756; GFX11-NEXT:    s_mov_b64 s[0:1], src_private_base
3757; GFX11-NEXT:    s_mov_b32 s0, exec_lo
3758; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1
3759; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3760; GFX11-NEXT:    v_cmpx_ne_u32_e64 s1, v5
3761; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
3762; GFX11-NEXT:    s_cbranch_execnz .LBB17_3
3763; GFX11-NEXT:  ; %bb.1: ; %Flow3
3764; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
3765; GFX11-NEXT:    s_cbranch_execnz .LBB17_6
3766; GFX11-NEXT:  .LBB17_2: ; %atomicrmw.phi
3767; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
3768; GFX11-NEXT:    s_setpc_b64 s[30:31]
3769; GFX11-NEXT:  .LBB17_3: ; %atomicrmw.global
3770; GFX11-NEXT:    flat_load_b64 v[0:1], v[4:5]
3771; GFX11-NEXT:    s_mov_b32 s1, 0
3772; GFX11-NEXT:  .LBB17_4: ; %atomicrmw.start
3773; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
3774; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3775; GFX11-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
3776; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3777; GFX11-NEXT:    v_add_f64 v[6:7], v[8:9], -v[2:3]
3778; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3779; GFX11-NEXT:    flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc
3780; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3781; GFX11-NEXT:    buffer_gl1_inv
3782; GFX11-NEXT:    buffer_gl0_inv
3783; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
3784; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
3785; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
3786; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
3787; GFX11-NEXT:    s_cbranch_execnz .LBB17_4
3788; GFX11-NEXT:  ; %bb.5: ; %Flow
3789; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
3790; GFX11-NEXT:    ; implicit-def: $vgpr4_vgpr5
3791; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3
3792; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
3793; GFX11-NEXT:    s_cbranch_execz .LBB17_2
3794; GFX11-NEXT:  .LBB17_6: ; %atomicrmw.private
3795; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
3796; GFX11-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc_lo
3797; GFX11-NEXT:    scratch_load_b64 v[0:1], v4, off
3798; GFX11-NEXT:    s_waitcnt vmcnt(0)
3799; GFX11-NEXT:    v_add_f64 v[2:3], v[0:1], -v[2:3]
3800; GFX11-NEXT:    scratch_store_b64 v4, v[2:3], off
3801; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
3802; GFX11-NEXT:    s_setpc_b64 s[30:31]
3803;
3804; GFX10-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos:
3805; GFX10:       ; %bb.0:
3806; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3807; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7f8, v0
3808; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
3809; GFX10-NEXT:    s_mov_b64 s[4:5], src_private_base
3810; GFX10-NEXT:    ; implicit-def: $vgpr0_vgpr1
3811; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v5
3812; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
3813; GFX10-NEXT:    s_xor_b32 s4, exec_lo, s4
3814; GFX10-NEXT:    s_cbranch_execnz .LBB17_3
3815; GFX10-NEXT:  ; %bb.1: ; %Flow3
3816; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
3817; GFX10-NEXT:    s_cbranch_execnz .LBB17_6
3818; GFX10-NEXT:  .LBB17_2: ; %atomicrmw.phi
3819; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3820; GFX10-NEXT:    s_setpc_b64 s[30:31]
3821; GFX10-NEXT:  .LBB17_3: ; %atomicrmw.global
3822; GFX10-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
3823; GFX10-NEXT:    s_mov_b32 s5, 0
3824; GFX10-NEXT:  .LBB17_4: ; %atomicrmw.start
3825; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
3826; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3827; GFX10-NEXT:    v_mov_b32_e32 v9, v1
3828; GFX10-NEXT:    v_mov_b32_e32 v8, v0
3829; GFX10-NEXT:    v_add_f64 v[6:7], v[8:9], -v[2:3]
3830; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3831; GFX10-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
3832; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3833; GFX10-NEXT:    buffer_gl1_inv
3834; GFX10-NEXT:    buffer_gl0_inv
3835; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
3836; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
3837; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
3838; GFX10-NEXT:    s_cbranch_execnz .LBB17_4
3839; GFX10-NEXT:  ; %bb.5: ; %Flow
3840; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
3841; GFX10-NEXT:    ; implicit-def: $vgpr4_vgpr5
3842; GFX10-NEXT:    ; implicit-def: $vgpr2_vgpr3
3843; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
3844; GFX10-NEXT:    s_cbranch_execz .LBB17_2
3845; GFX10-NEXT:  .LBB17_6: ; %atomicrmw.private
3846; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
3847; GFX10-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc_lo
3848; GFX10-NEXT:    s_clause 0x1
3849; GFX10-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
3850; GFX10-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
3851; GFX10-NEXT:    s_waitcnt vmcnt(0)
3852; GFX10-NEXT:    v_add_f64 v[2:3], v[0:1], -v[2:3]
3853; GFX10-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
3854; GFX10-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
3855; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
3856; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3857; GFX10-NEXT:    s_setpc_b64 s[30:31]
3858;
3859; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos:
3860; GFX90A:       ; %bb.0:
3861; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3862; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7f8, v0
3863; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
3864; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
3865; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
3866; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
3867; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3868; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
3869; GFX90A-NEXT:    s_cbranch_execnz .LBB17_3
3870; GFX90A-NEXT:  ; %bb.1: ; %Flow3
3871; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3872; GFX90A-NEXT:    s_cbranch_execnz .LBB17_6
3873; GFX90A-NEXT:  .LBB17_2: ; %atomicrmw.phi
3874; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
3875; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3876; GFX90A-NEXT:  .LBB17_3: ; %atomicrmw.global
3877; GFX90A-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
3878; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
3879; GFX90A-NEXT:  .LBB17_4: ; %atomicrmw.start
3880; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
3881; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3882; GFX90A-NEXT:    v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
3883; GFX90A-NEXT:    v_add_f64 v[6:7], v[8:9], -v[2:3]
3884; GFX90A-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
3885; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3886; GFX90A-NEXT:    buffer_wbinvl1
3887; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
3888; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
3889; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
3890; GFX90A-NEXT:    s_cbranch_execnz .LBB17_4
3891; GFX90A-NEXT:  ; %bb.5: ; %Flow
3892; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
3893; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
3894; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
3895; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3896; GFX90A-NEXT:    s_cbranch_execz .LBB17_2
3897; GFX90A-NEXT:  .LBB17_6: ; %atomicrmw.private
3898; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
3899; GFX90A-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
3900; GFX90A-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
3901; GFX90A-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
3902; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3903; GFX90A-NEXT:    v_add_f64 v[2:3], v[0:1], -v[2:3]
3904; GFX90A-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
3905; GFX90A-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
3906; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
3907; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3908; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3909;
3910; GFX908-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos:
3911; GFX908:       ; %bb.0:
3912; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3913; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7f8, v0
3914; GFX908-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
3915; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
3916; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
3917; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
3918; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3919; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
3920; GFX908-NEXT:    s_cbranch_execnz .LBB17_3
3921; GFX908-NEXT:  ; %bb.1: ; %Flow3
3922; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3923; GFX908-NEXT:    s_cbranch_execnz .LBB17_6
3924; GFX908-NEXT:  .LBB17_2: ; %atomicrmw.phi
3925; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
3926; GFX908-NEXT:    s_setpc_b64 s[30:31]
3927; GFX908-NEXT:  .LBB17_3: ; %atomicrmw.global
3928; GFX908-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
3929; GFX908-NEXT:    s_mov_b64 s[6:7], 0
3930; GFX908-NEXT:  .LBB17_4: ; %atomicrmw.start
3931; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
3932; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3933; GFX908-NEXT:    v_mov_b32_e32 v9, v1
3934; GFX908-NEXT:    v_mov_b32_e32 v8, v0
3935; GFX908-NEXT:    v_add_f64 v[6:7], v[8:9], -v[2:3]
3936; GFX908-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
3937; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3938; GFX908-NEXT:    buffer_wbinvl1
3939; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
3940; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
3941; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
3942; GFX908-NEXT:    s_cbranch_execnz .LBB17_4
3943; GFX908-NEXT:  ; %bb.5: ; %Flow
3944; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
3945; GFX908-NEXT:    ; implicit-def: $vgpr4_vgpr5
3946; GFX908-NEXT:    ; implicit-def: $vgpr2_vgpr3
3947; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3948; GFX908-NEXT:    s_cbranch_execz .LBB17_2
3949; GFX908-NEXT:  .LBB17_6: ; %atomicrmw.private
3950; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
3951; GFX908-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
3952; GFX908-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
3953; GFX908-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
3954; GFX908-NEXT:    s_waitcnt vmcnt(0)
3955; GFX908-NEXT:    v_add_f64 v[2:3], v[0:1], -v[2:3]
3956; GFX908-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
3957; GFX908-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
3958; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
3959; GFX908-NEXT:    s_waitcnt vmcnt(0)
3960; GFX908-NEXT:    s_setpc_b64 s[30:31]
3961;
3962; GFX8-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos:
3963; GFX8:       ; %bb.0:
3964; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3965; GFX8-NEXT:    s_mov_b64 s[4:5], 0xc0
3966; GFX8-NEXT:    s_load_dword s4, s[4:5], 0x0
3967; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7f8, v0
3968; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
3969; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3970; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v5
3971; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
3972; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3973; GFX8-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
3974; GFX8-NEXT:    s_cbranch_execnz .LBB17_3
3975; GFX8-NEXT:  ; %bb.1: ; %Flow3
3976; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3977; GFX8-NEXT:    s_cbranch_execnz .LBB17_6
3978; GFX8-NEXT:  .LBB17_2: ; %atomicrmw.phi
3979; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
3980; GFX8-NEXT:    s_setpc_b64 s[30:31]
3981; GFX8-NEXT:  .LBB17_3: ; %atomicrmw.global
3982; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v4
3983; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
3984; GFX8-NEXT:    flat_load_dword v1, v[0:1]
3985; GFX8-NEXT:    flat_load_dword v0, v[4:5]
3986; GFX8-NEXT:    s_mov_b64 s[6:7], 0
3987; GFX8-NEXT:  .LBB17_4: ; %atomicrmw.start
3988; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
3989; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3990; GFX8-NEXT:    v_mov_b32_e32 v9, v1
3991; GFX8-NEXT:    v_mov_b32_e32 v8, v0
3992; GFX8-NEXT:    v_add_f64 v[6:7], v[8:9], -v[2:3]
3993; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
3994; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3995; GFX8-NEXT:    buffer_wbinvl1
3996; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
3997; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
3998; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
3999; GFX8-NEXT:    s_cbranch_execnz .LBB17_4
4000; GFX8-NEXT:  ; %bb.5: ; %Flow
4001; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
4002; GFX8-NEXT:    ; implicit-def: $vgpr4_vgpr5
4003; GFX8-NEXT:    ; implicit-def: $vgpr2_vgpr3
4004; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
4005; GFX8-NEXT:    s_cbranch_execz .LBB17_2
4006; GFX8-NEXT:  .LBB17_6: ; %atomicrmw.private
4007; GFX8-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
4008; GFX8-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
4009; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 4, v4
4010; GFX8-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
4011; GFX8-NEXT:    buffer_load_dword v1, v5, s[0:3], 0 offen
4012; GFX8-NEXT:    s_waitcnt vmcnt(0)
4013; GFX8-NEXT:    v_add_f64 v[2:3], v[0:1], -v[2:3]
4014; GFX8-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
4015; GFX8-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
4016; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
4017; GFX8-NEXT:    s_waitcnt vmcnt(0)
4018; GFX8-NEXT:    s_setpc_b64 s[30:31]
4019;
4020; GFX7-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos:
4021; GFX7:       ; %bb.0:
4022; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4023; GFX7-NEXT:    s_mov_b64 s[4:5], 0xc0
4024; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
4025; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7f8, v0
4026; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
4027; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4028; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v5
4029; GFX7-NEXT:    ; implicit-def: $vgpr0_vgpr1
4030; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4031; GFX7-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
4032; GFX7-NEXT:    s_cbranch_execnz .LBB17_3
4033; GFX7-NEXT:  ; %bb.1: ; %Flow3
4034; GFX7-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
4035; GFX7-NEXT:    s_cbranch_execnz .LBB17_6
4036; GFX7-NEXT:  .LBB17_2: ; %atomicrmw.phi
4037; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
4038; GFX7-NEXT:    s_setpc_b64 s[30:31]
4039; GFX7-NEXT:  .LBB17_3: ; %atomicrmw.global
4040; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 4, v4
4041; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
4042; GFX7-NEXT:    flat_load_dword v1, v[0:1]
4043; GFX7-NEXT:    flat_load_dword v0, v[4:5]
4044; GFX7-NEXT:    s_mov_b64 s[6:7], 0
4045; GFX7-NEXT:  .LBB17_4: ; %atomicrmw.start
4046; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
4047; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4048; GFX7-NEXT:    v_mov_b32_e32 v9, v1
4049; GFX7-NEXT:    v_mov_b32_e32 v8, v0
4050; GFX7-NEXT:    v_add_f64 v[6:7], v[8:9], -v[2:3]
4051; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
4052; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4053; GFX7-NEXT:    buffer_wbinvl1
4054; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
4055; GFX7-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
4056; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
4057; GFX7-NEXT:    s_cbranch_execnz .LBB17_4
4058; GFX7-NEXT:  ; %bb.5: ; %Flow
4059; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
4060; GFX7-NEXT:    ; implicit-def: $vgpr4_vgpr5
4061; GFX7-NEXT:    ; implicit-def: $vgpr2_vgpr3
4062; GFX7-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
4063; GFX7-NEXT:    s_cbranch_execz .LBB17_2
4064; GFX7-NEXT:  .LBB17_6: ; %atomicrmw.private
4065; GFX7-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
4066; GFX7-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
4067; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 4, v4
4068; GFX7-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
4069; GFX7-NEXT:    buffer_load_dword v1, v5, s[0:3], 0 offen
4070; GFX7-NEXT:    s_waitcnt vmcnt(0)
4071; GFX7-NEXT:    v_add_f64 v[2:3], v[0:1], -v[2:3]
4072; GFX7-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
4073; GFX7-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
4074; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
4075; GFX7-NEXT:    s_waitcnt vmcnt(0)
4076; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; Index 255 on a double pointer is a byte offset of 255 * 8 = 2040 (0x7f8),
  ; the constant every target's checks above add to the incoming address.
4077  %gep = getelementptr double, ptr %ptr, i64 255
  ; atomicrmw yields the value that was in memory before the subtraction; that
  ; old value is what the function returns.
4078  %result = atomicrmw fsub ptr %gep, double %val syncscope("agent") seq_cst
4079  ret double %result
4080}
4081
; Returning f64 fsub at agent scope through a flat pointer whose address is
; formed by a getelementptr of -256 doubles (-2048 bytes) from %ptr.
;
; What the autogenerated checks below pin down for every listed target:
;  * The -2048 byte displacement is applied with explicit 64-bit address
;    arithmetic (low word 0xfffff800, high word -1; the gfx940 output builds
;    the same pair with s_movk_i32 0xf800 / s_mov_b32 -1 and adds it with
;    v_lshl_add_u64). The flat load and flat cmpswap are then issued on the
;    resulting register pair with no offset operand.
;  * The high address dword is compared against src_private_base (the GFX7 and
;    GFX8 outputs instead compare against a value fetched with s_load_dword)
;    to choose between the two paths below.
;  * %atomicrmw.global is a flat_atomic_cmpswap loop that repeats until the
;    value returned by the cmpswap equals the value that was expected.
;  * %atomicrmw.private is a plain load / add / store sequence using scratch
;    or buffer instructions.
;  * The subtraction appears as v_add_f64 with a negated second source
;    operand (-v[2:3]).
;
; NOTE(review): -2048 is the minimum of a signed 12-bit range, which is
; presumably what the "offset12b_neg" suffix refers to -- confirm.
; The check lines are generated by update_llc_test_checks.py (see the note at
; the top of the file); regenerate them rather than editing by hand.
4082define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %val) #0 {
4083; GFX12-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg:
4084; GFX12:       ; %bb.0:
4085; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
4086; GFX12-NEXT:    s_wait_expcnt 0x0
4087; GFX12-NEXT:    s_wait_samplecnt 0x0
4088; GFX12-NEXT:    s_wait_bvhcnt 0x0
4089; GFX12-NEXT:    s_wait_kmcnt 0x0
4090; GFX12-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
4091; GFX12-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
4092; GFX12-NEXT:    s_mov_b64 s[0:1], src_private_base
4093; GFX12-NEXT:    s_mov_b32 s0, exec_lo
4094; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
4095; GFX12-NEXT:    s_wait_alu 0xfffe
4096; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4097; GFX12-NEXT:    v_cmpx_ne_u32_e64 s1, v5
4098; GFX12-NEXT:    s_xor_b32 s0, exec_lo, s0
4099; GFX12-NEXT:    s_cbranch_execnz .LBB18_3
4100; GFX12-NEXT:  ; %bb.1: ; %Flow3
4101; GFX12-NEXT:    s_wait_alu 0xfffe
4102; GFX12-NEXT:    s_and_not1_saveexec_b32 s0, s0
4103; GFX12-NEXT:    s_cbranch_execnz .LBB18_6
4104; GFX12-NEXT:  .LBB18_2: ; %atomicrmw.phi
4105; GFX12-NEXT:    s_wait_alu 0xfffe
4106; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
4107; GFX12-NEXT:    s_wait_alu 0xfffe
4108; GFX12-NEXT:    s_setpc_b64 s[30:31]
4109; GFX12-NEXT:  .LBB18_3: ; %atomicrmw.global
4110; GFX12-NEXT:    flat_load_b64 v[0:1], v[4:5]
4111; GFX12-NEXT:    s_mov_b32 s1, 0
4112; GFX12-NEXT:  .LBB18_4: ; %atomicrmw.start
4113; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
4114; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
4115; GFX12-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
4116; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4117; GFX12-NEXT:    v_add_f64_e64 v[6:7], v[8:9], -v[2:3]
4118; GFX12-NEXT:    s_wait_storecnt 0x0
4119; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
4120; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
4121; GFX12-NEXT:    global_inv scope:SCOPE_DEV
4122; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
4123; GFX12-NEXT:    s_wait_alu 0xfffe
4124; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
4125; GFX12-NEXT:    s_wait_alu 0xfffe
4126; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
4127; GFX12-NEXT:    s_cbranch_execnz .LBB18_4
4128; GFX12-NEXT:  ; %bb.5: ; %Flow
4129; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
4130; GFX12-NEXT:    ; implicit-def: $vgpr4_vgpr5
4131; GFX12-NEXT:    ; implicit-def: $vgpr2_vgpr3
4132; GFX12-NEXT:    s_and_not1_saveexec_b32 s0, s0
4133; GFX12-NEXT:    s_cbranch_execz .LBB18_2
4134; GFX12-NEXT:  .LBB18_6: ; %atomicrmw.private
4135; GFX12-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
4136; GFX12-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc_lo
4137; GFX12-NEXT:    scratch_load_b64 v[0:1], v4, off
4138; GFX12-NEXT:    s_wait_loadcnt 0x0
4139; GFX12-NEXT:    v_add_f64_e64 v[2:3], v[0:1], -v[2:3]
4140; GFX12-NEXT:    scratch_store_b64 v4, v[2:3], off
4141; GFX12-NEXT:    s_wait_alu 0xfffe
4142; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
4143; GFX12-NEXT:    s_wait_alu 0xfffe
4144; GFX12-NEXT:    s_setpc_b64 s[30:31]
4145;
4146; GFX940-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg:
4147; GFX940:       ; %bb.0:
4148; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4149; GFX940-NEXT:    s_movk_i32 s0, 0xf800
4150; GFX940-NEXT:    s_mov_b32 s1, -1
4151; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
4152; GFX940-NEXT:    s_mov_b64 s[0:1], src_private_base
4153; GFX940-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
4154; GFX940-NEXT:    ; implicit-def: $vgpr0_vgpr1
4155; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], vcc
4156; GFX940-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
4157; GFX940-NEXT:    s_cbranch_execnz .LBB18_3
4158; GFX940-NEXT:  ; %bb.1: ; %Flow3
4159; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
4160; GFX940-NEXT:    s_cbranch_execnz .LBB18_6
4161; GFX940-NEXT:  .LBB18_2: ; %atomicrmw.phi
4162; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
4163; GFX940-NEXT:    s_setpc_b64 s[30:31]
4164; GFX940-NEXT:  .LBB18_3: ; %atomicrmw.global
4165; GFX940-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
4166; GFX940-NEXT:    s_mov_b64 s[2:3], 0
4167; GFX940-NEXT:  .LBB18_4: ; %atomicrmw.start
4168; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
4169; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4170; GFX940-NEXT:    v_mov_b64_e32 v[8:9], v[0:1]
4171; GFX940-NEXT:    v_add_f64 v[6:7], v[8:9], -v[2:3]
4172; GFX940-NEXT:    buffer_wbl2 sc1
4173; GFX940-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] sc0
4174; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4175; GFX940-NEXT:    buffer_inv sc1
4176; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
4177; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
4178; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
4179; GFX940-NEXT:    s_cbranch_execnz .LBB18_4
4180; GFX940-NEXT:  ; %bb.5: ; %Flow
4181; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
4182; GFX940-NEXT:    ; implicit-def: $vgpr4_vgpr5
4183; GFX940-NEXT:    ; implicit-def: $vgpr2_vgpr3
4184; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
4185; GFX940-NEXT:    s_cbranch_execz .LBB18_2
4186; GFX940-NEXT:  .LBB18_6: ; %atomicrmw.private
4187; GFX940-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
4188; GFX940-NEXT:    s_nop 1
4189; GFX940-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
4190; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v4, off
4191; GFX940-NEXT:    s_waitcnt vmcnt(0)
4192; GFX940-NEXT:    v_add_f64 v[2:3], v[0:1], -v[2:3]
4193; GFX940-NEXT:    scratch_store_dwordx2 v4, v[2:3], off sc0 sc1
4194; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
4195; GFX940-NEXT:    s_waitcnt vmcnt(0)
4196; GFX940-NEXT:    s_setpc_b64 s[30:31]
4197;
4198; GFX11-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg:
4199; GFX11:       ; %bb.0:
4200; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4201; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
4202; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
4203; GFX11-NEXT:    s_mov_b64 s[0:1], src_private_base
4204; GFX11-NEXT:    s_mov_b32 s0, exec_lo
4205; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1
4206; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4207; GFX11-NEXT:    v_cmpx_ne_u32_e64 s1, v5
4208; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
4209; GFX11-NEXT:    s_cbranch_execnz .LBB18_3
4210; GFX11-NEXT:  ; %bb.1: ; %Flow3
4211; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
4212; GFX11-NEXT:    s_cbranch_execnz .LBB18_6
4213; GFX11-NEXT:  .LBB18_2: ; %atomicrmw.phi
4214; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
4215; GFX11-NEXT:    s_setpc_b64 s[30:31]
4216; GFX11-NEXT:  .LBB18_3: ; %atomicrmw.global
4217; GFX11-NEXT:    flat_load_b64 v[0:1], v[4:5]
4218; GFX11-NEXT:    s_mov_b32 s1, 0
4219; GFX11-NEXT:  .LBB18_4: ; %atomicrmw.start
4220; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
4221; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4222; GFX11-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
4223; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4224; GFX11-NEXT:    v_add_f64 v[6:7], v[8:9], -v[2:3]
4225; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
4226; GFX11-NEXT:    flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc
4227; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4228; GFX11-NEXT:    buffer_gl1_inv
4229; GFX11-NEXT:    buffer_gl0_inv
4230; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
4231; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
4232; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
4233; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
4234; GFX11-NEXT:    s_cbranch_execnz .LBB18_4
4235; GFX11-NEXT:  ; %bb.5: ; %Flow
4236; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
4237; GFX11-NEXT:    ; implicit-def: $vgpr4_vgpr5
4238; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3
4239; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
4240; GFX11-NEXT:    s_cbranch_execz .LBB18_2
4241; GFX11-NEXT:  .LBB18_6: ; %atomicrmw.private
4242; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
4243; GFX11-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc_lo
4244; GFX11-NEXT:    scratch_load_b64 v[0:1], v4, off
4245; GFX11-NEXT:    s_waitcnt vmcnt(0)
4246; GFX11-NEXT:    v_add_f64 v[2:3], v[0:1], -v[2:3]
4247; GFX11-NEXT:    scratch_store_b64 v4, v[2:3], off
4248; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
4249; GFX11-NEXT:    s_setpc_b64 s[30:31]
4250;
4251; GFX10-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg:
4252; GFX10:       ; %bb.0:
4253; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4254; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
4255; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
4256; GFX10-NEXT:    s_mov_b64 s[4:5], src_private_base
4257; GFX10-NEXT:    ; implicit-def: $vgpr0_vgpr1
4258; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v5
4259; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
4260; GFX10-NEXT:    s_xor_b32 s4, exec_lo, s4
4261; GFX10-NEXT:    s_cbranch_execnz .LBB18_3
4262; GFX10-NEXT:  ; %bb.1: ; %Flow3
4263; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
4264; GFX10-NEXT:    s_cbranch_execnz .LBB18_6
4265; GFX10-NEXT:  .LBB18_2: ; %atomicrmw.phi
4266; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
4267; GFX10-NEXT:    s_setpc_b64 s[30:31]
4268; GFX10-NEXT:  .LBB18_3: ; %atomicrmw.global
4269; GFX10-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
4270; GFX10-NEXT:    s_mov_b32 s5, 0
4271; GFX10-NEXT:  .LBB18_4: ; %atomicrmw.start
4272; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
4273; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4274; GFX10-NEXT:    v_mov_b32_e32 v9, v1
4275; GFX10-NEXT:    v_mov_b32_e32 v8, v0
4276; GFX10-NEXT:    v_add_f64 v[6:7], v[8:9], -v[2:3]
4277; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4278; GFX10-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
4279; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4280; GFX10-NEXT:    buffer_gl1_inv
4281; GFX10-NEXT:    buffer_gl0_inv
4282; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
4283; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
4284; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
4285; GFX10-NEXT:    s_cbranch_execnz .LBB18_4
4286; GFX10-NEXT:  ; %bb.5: ; %Flow
4287; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
4288; GFX10-NEXT:    ; implicit-def: $vgpr4_vgpr5
4289; GFX10-NEXT:    ; implicit-def: $vgpr2_vgpr3
4290; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
4291; GFX10-NEXT:    s_cbranch_execz .LBB18_2
4292; GFX10-NEXT:  .LBB18_6: ; %atomicrmw.private
4293; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
4294; GFX10-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc_lo
4295; GFX10-NEXT:    s_clause 0x1
4296; GFX10-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
4297; GFX10-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
4298; GFX10-NEXT:    s_waitcnt vmcnt(0)
4299; GFX10-NEXT:    v_add_f64 v[2:3], v[0:1], -v[2:3]
4300; GFX10-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
4301; GFX10-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
4302; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
4303; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
4304; GFX10-NEXT:    s_setpc_b64 s[30:31]
4305;
4306; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg:
4307; GFX90A:       ; %bb.0:
4308; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4309; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
4310; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
4311; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
4312; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
4313; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
4314; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4315; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
4316; GFX90A-NEXT:    s_cbranch_execnz .LBB18_3
4317; GFX90A-NEXT:  ; %bb.1: ; %Flow3
4318; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
4319; GFX90A-NEXT:    s_cbranch_execnz .LBB18_6
4320; GFX90A-NEXT:  .LBB18_2: ; %atomicrmw.phi
4321; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
4322; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4323; GFX90A-NEXT:  .LBB18_3: ; %atomicrmw.global
4324; GFX90A-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
4325; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
4326; GFX90A-NEXT:  .LBB18_4: ; %atomicrmw.start
4327; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
4328; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4329; GFX90A-NEXT:    v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
4330; GFX90A-NEXT:    v_add_f64 v[6:7], v[8:9], -v[2:3]
4331; GFX90A-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
4332; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4333; GFX90A-NEXT:    buffer_wbinvl1
4334; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
4335; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
4336; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
4337; GFX90A-NEXT:    s_cbranch_execnz .LBB18_4
4338; GFX90A-NEXT:  ; %bb.5: ; %Flow
4339; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
4340; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
4341; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
4342; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
4343; GFX90A-NEXT:    s_cbranch_execz .LBB18_2
4344; GFX90A-NEXT:  .LBB18_6: ; %atomicrmw.private
4345; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
4346; GFX90A-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
4347; GFX90A-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
4348; GFX90A-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
4349; GFX90A-NEXT:    s_waitcnt vmcnt(0)
4350; GFX90A-NEXT:    v_add_f64 v[2:3], v[0:1], -v[2:3]
4351; GFX90A-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
4352; GFX90A-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
4353; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
4354; GFX90A-NEXT:    s_waitcnt vmcnt(0)
4355; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4356;
4357; GFX908-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg:
4358; GFX908:       ; %bb.0:
4359; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4360; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
4361; GFX908-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
4362; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
4363; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
4364; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
4365; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4366; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
4367; GFX908-NEXT:    s_cbranch_execnz .LBB18_3
4368; GFX908-NEXT:  ; %bb.1: ; %Flow3
4369; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
4370; GFX908-NEXT:    s_cbranch_execnz .LBB18_6
4371; GFX908-NEXT:  .LBB18_2: ; %atomicrmw.phi
4372; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
4373; GFX908-NEXT:    s_setpc_b64 s[30:31]
4374; GFX908-NEXT:  .LBB18_3: ; %atomicrmw.global
4375; GFX908-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
4376; GFX908-NEXT:    s_mov_b64 s[6:7], 0
4377; GFX908-NEXT:  .LBB18_4: ; %atomicrmw.start
4378; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
4379; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4380; GFX908-NEXT:    v_mov_b32_e32 v9, v1
4381; GFX908-NEXT:    v_mov_b32_e32 v8, v0
4382; GFX908-NEXT:    v_add_f64 v[6:7], v[8:9], -v[2:3]
4383; GFX908-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
4384; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4385; GFX908-NEXT:    buffer_wbinvl1
4386; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
4387; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
4388; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
4389; GFX908-NEXT:    s_cbranch_execnz .LBB18_4
4390; GFX908-NEXT:  ; %bb.5: ; %Flow
4391; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
4392; GFX908-NEXT:    ; implicit-def: $vgpr4_vgpr5
4393; GFX908-NEXT:    ; implicit-def: $vgpr2_vgpr3
4394; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
4395; GFX908-NEXT:    s_cbranch_execz .LBB18_2
4396; GFX908-NEXT:  .LBB18_6: ; %atomicrmw.private
4397; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
4398; GFX908-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
4399; GFX908-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
4400; GFX908-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
4401; GFX908-NEXT:    s_waitcnt vmcnt(0)
4402; GFX908-NEXT:    v_add_f64 v[2:3], v[0:1], -v[2:3]
4403; GFX908-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
4404; GFX908-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
4405; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
4406; GFX908-NEXT:    s_waitcnt vmcnt(0)
4407; GFX908-NEXT:    s_setpc_b64 s[30:31]
4408;
4409; GFX8-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg:
4410; GFX8:       ; %bb.0:
4411; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4412; GFX8-NEXT:    s_mov_b64 s[4:5], 0xc0
4413; GFX8-NEXT:    s_load_dword s4, s[4:5], 0x0
4414; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xfffff800, v0
4415; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, -1, v1, vcc
4416; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4417; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v5
4418; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
4419; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4420; GFX8-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
4421; GFX8-NEXT:    s_cbranch_execnz .LBB18_3
4422; GFX8-NEXT:  ; %bb.1: ; %Flow3
4423; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
4424; GFX8-NEXT:    s_cbranch_execnz .LBB18_6
4425; GFX8-NEXT:  .LBB18_2: ; %atomicrmw.phi
4426; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
4427; GFX8-NEXT:    s_setpc_b64 s[30:31]
4428; GFX8-NEXT:  .LBB18_3: ; %atomicrmw.global
4429; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v4
4430; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
4431; GFX8-NEXT:    flat_load_dword v1, v[0:1]
4432; GFX8-NEXT:    flat_load_dword v0, v[4:5]
4433; GFX8-NEXT:    s_mov_b64 s[6:7], 0
4434; GFX8-NEXT:  .LBB18_4: ; %atomicrmw.start
4435; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
4436; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4437; GFX8-NEXT:    v_mov_b32_e32 v9, v1
4438; GFX8-NEXT:    v_mov_b32_e32 v8, v0
4439; GFX8-NEXT:    v_add_f64 v[6:7], v[8:9], -v[2:3]
4440; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
4441; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4442; GFX8-NEXT:    buffer_wbinvl1
4443; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
4444; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
4445; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
4446; GFX8-NEXT:    s_cbranch_execnz .LBB18_4
4447; GFX8-NEXT:  ; %bb.5: ; %Flow
4448; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
4449; GFX8-NEXT:    ; implicit-def: $vgpr4_vgpr5
4450; GFX8-NEXT:    ; implicit-def: $vgpr2_vgpr3
4451; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
4452; GFX8-NEXT:    s_cbranch_execz .LBB18_2
4453; GFX8-NEXT:  .LBB18_6: ; %atomicrmw.private
4454; GFX8-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
4455; GFX8-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
4456; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 4, v4
4457; GFX8-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
4458; GFX8-NEXT:    buffer_load_dword v1, v5, s[0:3], 0 offen
4459; GFX8-NEXT:    s_waitcnt vmcnt(0)
4460; GFX8-NEXT:    v_add_f64 v[2:3], v[0:1], -v[2:3]
4461; GFX8-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
4462; GFX8-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
4463; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
4464; GFX8-NEXT:    s_waitcnt vmcnt(0)
4465; GFX8-NEXT:    s_setpc_b64 s[30:31]
4466;
4467; GFX7-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg:
4468; GFX7:       ; %bb.0:
4469; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4470; GFX7-NEXT:    s_mov_b64 s[4:5], 0xc0
4471; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
4472; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
4473; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, -1, v1, vcc
4474; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4475; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v5
4476; GFX7-NEXT:    ; implicit-def: $vgpr0_vgpr1
4477; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4478; GFX7-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
4479; GFX7-NEXT:    s_cbranch_execnz .LBB18_3
4480; GFX7-NEXT:  ; %bb.1: ; %Flow3
4481; GFX7-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
4482; GFX7-NEXT:    s_cbranch_execnz .LBB18_6
4483; GFX7-NEXT:  .LBB18_2: ; %atomicrmw.phi
4484; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
4485; GFX7-NEXT:    s_setpc_b64 s[30:31]
4486; GFX7-NEXT:  .LBB18_3: ; %atomicrmw.global
4487; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 4, v4
4488; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
4489; GFX7-NEXT:    flat_load_dword v1, v[0:1]
4490; GFX7-NEXT:    flat_load_dword v0, v[4:5]
4491; GFX7-NEXT:    s_mov_b64 s[6:7], 0
4492; GFX7-NEXT:  .LBB18_4: ; %atomicrmw.start
4493; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
4494; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4495; GFX7-NEXT:    v_mov_b32_e32 v9, v1
4496; GFX7-NEXT:    v_mov_b32_e32 v8, v0
4497; GFX7-NEXT:    v_add_f64 v[6:7], v[8:9], -v[2:3]
4498; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
4499; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4500; GFX7-NEXT:    buffer_wbinvl1
4501; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
4502; GFX7-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
4503; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
4504; GFX7-NEXT:    s_cbranch_execnz .LBB18_4
4505; GFX7-NEXT:  ; %bb.5: ; %Flow
4506; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
4507; GFX7-NEXT:    ; implicit-def: $vgpr4_vgpr5
4508; GFX7-NEXT:    ; implicit-def: $vgpr2_vgpr3
4509; GFX7-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
4510; GFX7-NEXT:    s_cbranch_execz .LBB18_2
4511; GFX7-NEXT:  .LBB18_6: ; %atomicrmw.private
4512; GFX7-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
4513; GFX7-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
4514; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 4, v4
4515; GFX7-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
4516; GFX7-NEXT:    buffer_load_dword v1, v5, s[0:3], 0 offen
4517; GFX7-NEXT:    s_waitcnt vmcnt(0)
4518; GFX7-NEXT:    v_add_f64 v[2:3], v[0:1], -v[2:3]
4519; GFX7-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
4520; GFX7-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
4521; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
4522; GFX7-NEXT:    s_waitcnt vmcnt(0)
4523; GFX7-NEXT:    s_setpc_b64 s[30:31]
; -256 elements * 8 bytes per double = -2048 bytes; this is the
; 0xfffff800 (low) / -1 (high) addend pair seen in the checks above.
4524  %gep = getelementptr double, ptr %ptr, i64 -256
4525  %result = atomicrmw fsub ptr %gep, double %val syncscope("agent") seq_cst
4526  ret double %result
4527}
4528
4529define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 {
4530; GFX12-LABEL: flat_agent_atomic_fsub_noret_f64:
4531; GFX12:       ; %bb.0:
4532; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
4533; GFX12-NEXT:    s_wait_expcnt 0x0
4534; GFX12-NEXT:    s_wait_samplecnt 0x0
4535; GFX12-NEXT:    s_wait_bvhcnt 0x0
4536; GFX12-NEXT:    s_wait_kmcnt 0x0
4537; GFX12-NEXT:    s_mov_b64 s[0:1], src_private_base
4538; GFX12-NEXT:    s_mov_b32 s0, exec_lo
4539; GFX12-NEXT:    s_wait_alu 0xfffe
4540; GFX12-NEXT:    v_cmpx_ne_u32_e64 s1, v1
4541; GFX12-NEXT:    s_xor_b32 s0, exec_lo, s0
4542; GFX12-NEXT:    s_cbranch_execnz .LBB19_3
4543; GFX12-NEXT:  ; %bb.1: ; %Flow3
4544; GFX12-NEXT:    s_wait_alu 0xfffe
4545; GFX12-NEXT:    s_and_not1_saveexec_b32 s0, s0
4546; GFX12-NEXT:    s_cbranch_execnz .LBB19_6
4547; GFX12-NEXT:  .LBB19_2: ; %atomicrmw.phi
4548; GFX12-NEXT:    s_wait_alu 0xfffe
4549; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
4550; GFX12-NEXT:    s_wait_alu 0xfffe
4551; GFX12-NEXT:    s_setpc_b64 s[30:31]
4552; GFX12-NEXT:  .LBB19_3: ; %atomicrmw.global
4553; GFX12-NEXT:    flat_load_b64 v[6:7], v[0:1]
4554; GFX12-NEXT:    s_mov_b32 s1, 0
4555; GFX12-NEXT:  .LBB19_4: ; %atomicrmw.start
4556; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
4557; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
4558; GFX12-NEXT:    v_add_f64_e64 v[4:5], v[6:7], -v[2:3]
4559; GFX12-NEXT:    s_wait_storecnt 0x0
4560; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
4561; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
4562; GFX12-NEXT:    global_inv scope:SCOPE_DEV
4563; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
4564; GFX12-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
4565; GFX12-NEXT:    s_wait_alu 0xfffe
4566; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
4567; GFX12-NEXT:    s_wait_alu 0xfffe
4568; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
4569; GFX12-NEXT:    s_cbranch_execnz .LBB19_4
4570; GFX12-NEXT:  ; %bb.5: ; %Flow
4571; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
4572; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
4573; GFX12-NEXT:    ; implicit-def: $vgpr2_vgpr3
4574; GFX12-NEXT:    s_and_not1_saveexec_b32 s0, s0
4575; GFX12-NEXT:    s_cbranch_execz .LBB19_2
4576; GFX12-NEXT:  .LBB19_6: ; %atomicrmw.private
4577; GFX12-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
4578; GFX12-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
4579; GFX12-NEXT:    scratch_load_b64 v[0:1], v4, off
4580; GFX12-NEXT:    s_wait_loadcnt 0x0
4581; GFX12-NEXT:    v_add_f64_e64 v[0:1], v[0:1], -v[2:3]
4582; GFX12-NEXT:    scratch_store_b64 v4, v[0:1], off
4583; GFX12-NEXT:    s_wait_alu 0xfffe
4584; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
4585; GFX12-NEXT:    s_wait_alu 0xfffe
4586; GFX12-NEXT:    s_setpc_b64 s[30:31]
4587;
4588; GFX940-LABEL: flat_agent_atomic_fsub_noret_f64:
4589; GFX940:       ; %bb.0:
4590; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4591; GFX940-NEXT:    s_mov_b64 s[0:1], src_private_base
4592; GFX940-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
4593; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], vcc
4594; GFX940-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
4595; GFX940-NEXT:    s_cbranch_execnz .LBB19_3
4596; GFX940-NEXT:  ; %bb.1: ; %Flow3
4597; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
4598; GFX940-NEXT:    s_cbranch_execnz .LBB19_6
4599; GFX940-NEXT:  .LBB19_2: ; %atomicrmw.phi
4600; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
4601; GFX940-NEXT:    s_setpc_b64 s[30:31]
4602; GFX940-NEXT:  .LBB19_3: ; %atomicrmw.global
4603; GFX940-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
4604; GFX940-NEXT:    s_mov_b64 s[2:3], 0
4605; GFX940-NEXT:  .LBB19_4: ; %atomicrmw.start
4606; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
4607; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4608; GFX940-NEXT:    v_add_f64 v[4:5], v[6:7], -v[2:3]
4609; GFX940-NEXT:    buffer_wbl2 sc1
4610; GFX940-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0
4611; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4612; GFX940-NEXT:    buffer_inv sc1
4613; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
4614; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
4615; GFX940-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
4616; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
4617; GFX940-NEXT:    s_cbranch_execnz .LBB19_4
4618; GFX940-NEXT:  ; %bb.5: ; %Flow
4619; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
4620; GFX940-NEXT:    ; implicit-def: $vgpr0_vgpr1
4621; GFX940-NEXT:    ; implicit-def: $vgpr2_vgpr3
4622; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
4623; GFX940-NEXT:    s_cbranch_execz .LBB19_2
4624; GFX940-NEXT:  .LBB19_6: ; %atomicrmw.private
4625; GFX940-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
4626; GFX940-NEXT:    s_nop 1
4627; GFX940-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
4628; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v4, off
4629; GFX940-NEXT:    s_waitcnt vmcnt(0)
4630; GFX940-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
4631; GFX940-NEXT:    scratch_store_dwordx2 v4, v[0:1], off sc0 sc1
4632; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
4633; GFX940-NEXT:    s_waitcnt vmcnt(0)
4634; GFX940-NEXT:    s_setpc_b64 s[30:31]
4635;
4636; GFX11-LABEL: flat_agent_atomic_fsub_noret_f64:
4637; GFX11:       ; %bb.0:
4638; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4639; GFX11-NEXT:    s_mov_b64 s[0:1], src_private_base
4640; GFX11-NEXT:    s_mov_b32 s0, exec_lo
4641; GFX11-NEXT:    v_cmpx_ne_u32_e64 s1, v1
4642; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
4643; GFX11-NEXT:    s_cbranch_execnz .LBB19_3
4644; GFX11-NEXT:  ; %bb.1: ; %Flow3
4645; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
4646; GFX11-NEXT:    s_cbranch_execnz .LBB19_6
4647; GFX11-NEXT:  .LBB19_2: ; %atomicrmw.phi
4648; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
4649; GFX11-NEXT:    s_setpc_b64 s[30:31]
4650; GFX11-NEXT:  .LBB19_3: ; %atomicrmw.global
4651; GFX11-NEXT:    flat_load_b64 v[6:7], v[0:1]
4652; GFX11-NEXT:    s_mov_b32 s1, 0
4653; GFX11-NEXT:  .LBB19_4: ; %atomicrmw.start
4654; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
4655; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4656; GFX11-NEXT:    v_add_f64 v[4:5], v[6:7], -v[2:3]
4657; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
4658; GFX11-NEXT:    flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc
4659; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4660; GFX11-NEXT:    buffer_gl1_inv
4661; GFX11-NEXT:    buffer_gl0_inv
4662; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
4663; GFX11-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
4664; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
4665; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
4666; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
4667; GFX11-NEXT:    s_cbranch_execnz .LBB19_4
4668; GFX11-NEXT:  ; %bb.5: ; %Flow
4669; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
4670; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1
4671; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3
4672; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
4673; GFX11-NEXT:    s_cbranch_execz .LBB19_2
4674; GFX11-NEXT:  .LBB19_6: ; %atomicrmw.private
4675; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
4676; GFX11-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
4677; GFX11-NEXT:    scratch_load_b64 v[0:1], v4, off
4678; GFX11-NEXT:    s_waitcnt vmcnt(0)
4679; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
4680; GFX11-NEXT:    scratch_store_b64 v4, v[0:1], off
4681; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
4682; GFX11-NEXT:    s_setpc_b64 s[30:31]
4683;
4684; GFX10-LABEL: flat_agent_atomic_fsub_noret_f64:
4685; GFX10:       ; %bb.0:
4686; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4687; GFX10-NEXT:    s_mov_b64 s[4:5], src_private_base
4688; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v1
4689; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
4690; GFX10-NEXT:    s_xor_b32 s4, exec_lo, s4
4691; GFX10-NEXT:    s_cbranch_execnz .LBB19_3
4692; GFX10-NEXT:  ; %bb.1: ; %Flow3
4693; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
4694; GFX10-NEXT:    s_cbranch_execnz .LBB19_6
4695; GFX10-NEXT:  .LBB19_2: ; %atomicrmw.phi
4696; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
4697; GFX10-NEXT:    s_setpc_b64 s[30:31]
4698; GFX10-NEXT:  .LBB19_3: ; %atomicrmw.global
4699; GFX10-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
4700; GFX10-NEXT:    s_mov_b32 s5, 0
4701; GFX10-NEXT:  .LBB19_4: ; %atomicrmw.start
4702; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
4703; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4704; GFX10-NEXT:    v_add_f64 v[4:5], v[6:7], -v[2:3]
4705; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4706; GFX10-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
4707; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4708; GFX10-NEXT:    buffer_gl1_inv
4709; GFX10-NEXT:    buffer_gl0_inv
4710; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
4711; GFX10-NEXT:    v_mov_b32_e32 v7, v5
4712; GFX10-NEXT:    v_mov_b32_e32 v6, v4
4713; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
4714; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
4715; GFX10-NEXT:    s_cbranch_execnz .LBB19_4
4716; GFX10-NEXT:  ; %bb.5: ; %Flow
4717; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
4718; GFX10-NEXT:    ; implicit-def: $vgpr0_vgpr1
4719; GFX10-NEXT:    ; implicit-def: $vgpr2_vgpr3
4720; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
4721; GFX10-NEXT:    s_cbranch_execz .LBB19_2
4722; GFX10-NEXT:  .LBB19_6: ; %atomicrmw.private
4723; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
4724; GFX10-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
4725; GFX10-NEXT:    s_clause 0x1
4726; GFX10-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
4727; GFX10-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
4728; GFX10-NEXT:    s_waitcnt vmcnt(0)
4729; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
4730; GFX10-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
4731; GFX10-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
4732; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
4733; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
4734; GFX10-NEXT:    s_setpc_b64 s[30:31]
4735;
4736; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f64:
4737; GFX90A:       ; %bb.0:
4738; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4739; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
4740; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
4741; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4742; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
4743; GFX90A-NEXT:    s_cbranch_execnz .LBB19_3
4744; GFX90A-NEXT:  ; %bb.1: ; %Flow3
4745; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
4746; GFX90A-NEXT:    s_cbranch_execnz .LBB19_6
4747; GFX90A-NEXT:  .LBB19_2: ; %atomicrmw.phi
4748; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
4749; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4750; GFX90A-NEXT:  .LBB19_3: ; %atomicrmw.global
4751; GFX90A-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
4752; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
4753; GFX90A-NEXT:  .LBB19_4: ; %atomicrmw.start
4754; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
4755; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4756; GFX90A-NEXT:    v_add_f64 v[4:5], v[6:7], -v[2:3]
4757; GFX90A-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
4758; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4759; GFX90A-NEXT:    buffer_wbinvl1
4760; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
4761; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
4762; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
4763; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
4764; GFX90A-NEXT:    s_cbranch_execnz .LBB19_4
4765; GFX90A-NEXT:  ; %bb.5: ; %Flow
4766; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
4767; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
4768; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
4769; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
4770; GFX90A-NEXT:    s_cbranch_execz .LBB19_2
4771; GFX90A-NEXT:  .LBB19_6: ; %atomicrmw.private
4772; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
4773; GFX90A-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
4774; GFX90A-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
4775; GFX90A-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
4776; GFX90A-NEXT:    s_waitcnt vmcnt(0)
4777; GFX90A-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
4778; GFX90A-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
4779; GFX90A-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
4780; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
4781; GFX90A-NEXT:    s_waitcnt vmcnt(0)
4782; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4783;
4784; GFX908-LABEL: flat_agent_atomic_fsub_noret_f64:
4785; GFX908:       ; %bb.0:
4786; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4787; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
4788; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
4789; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4790; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
4791; GFX908-NEXT:    s_cbranch_execnz .LBB19_3
4792; GFX908-NEXT:  ; %bb.1: ; %Flow3
4793; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
4794; GFX908-NEXT:    s_cbranch_execnz .LBB19_6
4795; GFX908-NEXT:  .LBB19_2: ; %atomicrmw.phi
4796; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
4797; GFX908-NEXT:    s_setpc_b64 s[30:31]
4798; GFX908-NEXT:  .LBB19_3: ; %atomicrmw.global
4799; GFX908-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
4800; GFX908-NEXT:    s_mov_b64 s[6:7], 0
4801; GFX908-NEXT:  .LBB19_4: ; %atomicrmw.start
4802; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
4803; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4804; GFX908-NEXT:    v_add_f64 v[4:5], v[6:7], -v[2:3]
4805; GFX908-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
4806; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4807; GFX908-NEXT:    buffer_wbinvl1
4808; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
4809; GFX908-NEXT:    v_mov_b32_e32 v7, v5
4810; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
4811; GFX908-NEXT:    v_mov_b32_e32 v6, v4
4812; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
4813; GFX908-NEXT:    s_cbranch_execnz .LBB19_4
4814; GFX908-NEXT:  ; %bb.5: ; %Flow
4815; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
4816; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
4817; GFX908-NEXT:    ; implicit-def: $vgpr2_vgpr3
4818; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
4819; GFX908-NEXT:    s_cbranch_execz .LBB19_2
4820; GFX908-NEXT:  .LBB19_6: ; %atomicrmw.private
4821; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
4822; GFX908-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
4823; GFX908-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
4824; GFX908-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
4825; GFX908-NEXT:    s_waitcnt vmcnt(0)
4826; GFX908-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
4827; GFX908-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
4828; GFX908-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
4829; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
4830; GFX908-NEXT:    s_waitcnt vmcnt(0)
4831; GFX908-NEXT:    s_setpc_b64 s[30:31]
4832;
4833; GFX8-LABEL: flat_agent_atomic_fsub_noret_f64:
4834; GFX8:       ; %bb.0:
4835; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4836; GFX8-NEXT:    s_mov_b64 s[4:5], 0xc0
4837; GFX8-NEXT:    s_load_dword s4, s[4:5], 0x0
4838; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4839; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v1
4840; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4841; GFX8-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
4842; GFX8-NEXT:    s_cbranch_execnz .LBB19_3
4843; GFX8-NEXT:  ; %bb.1: ; %Flow3
4844; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
4845; GFX8-NEXT:    s_cbranch_execnz .LBB19_6
4846; GFX8-NEXT:  .LBB19_2: ; %atomicrmw.phi
4847; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
4848; GFX8-NEXT:    s_setpc_b64 s[30:31]
4849; GFX8-NEXT:  .LBB19_3: ; %atomicrmw.global
4850; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
4851; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
4852; GFX8-NEXT:    flat_load_dword v7, v[4:5]
4853; GFX8-NEXT:    flat_load_dword v6, v[0:1]
4854; GFX8-NEXT:    s_mov_b64 s[6:7], 0
4855; GFX8-NEXT:  .LBB19_4: ; %atomicrmw.start
4856; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
4857; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4858; GFX8-NEXT:    v_add_f64 v[4:5], v[6:7], -v[2:3]
4859; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
4860; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4861; GFX8-NEXT:    buffer_wbinvl1
4862; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
4863; GFX8-NEXT:    v_mov_b32_e32 v7, v5
4864; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
4865; GFX8-NEXT:    v_mov_b32_e32 v6, v4
4866; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
4867; GFX8-NEXT:    s_cbranch_execnz .LBB19_4
4868; GFX8-NEXT:  ; %bb.5: ; %Flow
4869; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
4870; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
4871; GFX8-NEXT:    ; implicit-def: $vgpr2_vgpr3
4872; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
4873; GFX8-NEXT:    s_cbranch_execz .LBB19_2
4874; GFX8-NEXT:  .LBB19_6: ; %atomicrmw.private
4875; GFX8-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
4876; GFX8-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
4877; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 4, v4
4878; GFX8-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
4879; GFX8-NEXT:    buffer_load_dword v1, v5, s[0:3], 0 offen
4880; GFX8-NEXT:    s_waitcnt vmcnt(0)
4881; GFX8-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
4882; GFX8-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
4883; GFX8-NEXT:    buffer_store_dword v1, v5, s[0:3], 0 offen
4884; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
4885; GFX8-NEXT:    s_waitcnt vmcnt(0)
4886; GFX8-NEXT:    s_setpc_b64 s[30:31]
4887;
4888; GFX7-LABEL: flat_agent_atomic_fsub_noret_f64:
4889; GFX7:       ; %bb.0:
4890; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4891; GFX7-NEXT:    s_mov_b64 s[4:5], 0xc0
4892; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
4893; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4894; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v1
4895; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4896; GFX7-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
4897; GFX7-NEXT:    s_cbranch_execnz .LBB19_3
4898; GFX7-NEXT:  ; %bb.1: ; %Flow3
4899; GFX7-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
4900; GFX7-NEXT:    s_cbranch_execnz .LBB19_6
4901; GFX7-NEXT:  .LBB19_2: ; %atomicrmw.phi
4902; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
4903; GFX7-NEXT:    s_setpc_b64 s[30:31]
4904; GFX7-NEXT:  .LBB19_3: ; %atomicrmw.global
4905; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 4, v0
4906; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
4907; GFX7-NEXT:    flat_load_dword v7, v[4:5]
4908; GFX7-NEXT:    flat_load_dword v6, v[0:1]
4909; GFX7-NEXT:    s_mov_b64 s[6:7], 0
4910; GFX7-NEXT:  .LBB19_4: ; %atomicrmw.start
4911; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
4912; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4913; GFX7-NEXT:    v_add_f64 v[4:5], v[6:7], -v[2:3]
4914; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
4915; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4916; GFX7-NEXT:    buffer_wbinvl1
4917; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
4918; GFX7-NEXT:    v_mov_b32_e32 v7, v5
4919; GFX7-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
4920; GFX7-NEXT:    v_mov_b32_e32 v6, v4
4921; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
4922; GFX7-NEXT:    s_cbranch_execnz .LBB19_4
4923; GFX7-NEXT:  ; %bb.5: ; %Flow
4924; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
4925; GFX7-NEXT:    ; implicit-def: $vgpr0_vgpr1
4926; GFX7-NEXT:    ; implicit-def: $vgpr2_vgpr3
4927; GFX7-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
4928; GFX7-NEXT:    s_cbranch_execz .LBB19_2
4929; GFX7-NEXT:  .LBB19_6: ; %atomicrmw.private
4930; GFX7-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
4931; GFX7-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
4932; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 4, v4
4933; GFX7-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
4934; GFX7-NEXT:    buffer_load_dword v1, v5, s[0:3], 0 offen
4935; GFX7-NEXT:    s_waitcnt vmcnt(0)
4936; GFX7-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
4937; GFX7-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
4938; GFX7-NEXT:    buffer_store_dword v1, v5, s[0:3], 0 offen
4939; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
4940; GFX7-NEXT:    s_waitcnt vmcnt(0)
4941; GFX7-NEXT:    s_setpc_b64 s[30:31]
4942  %unused = atomicrmw fsub ptr %ptr, double %val syncscope("agent") seq_cst
4943  ret void
4944}
4945
; Codegen test for `atomicrmw fsub` of a double through a flat pointer at
; agent syncscope with seq_cst ordering; the returned value is unused. The
; address is %ptr advanced by 255 doubles, which appears in the expected
; assembly below as an explicitly added 0x7f8 byte constant (no target in this
; function folds it into the memory instruction's offset field).
;
; What the expected assembly shows for each target checked here:
;   * The high dword of the address is compared against src_private_base (the
;     hawaii and tonga outputs instead compare against a dword loaded from
;     address 0xc0) and execution is split on the result.
;   * %atomicrmw.global / %atomicrmw.start - the current value is loaded, then
;     a loop around a 64-bit flat_atomic_cmpswap keeps lanes active until the
;     value returned by the cmpswap equals the value it was compared against.
;   * %atomicrmw.private - a plain load, v_add_f64 with the negated operand,
;     and a plain store, using scratch or buffer instructions.
;
; NOTE(review): the "offset12b_pos" suffix presumably means a positive offset
; that fits a 12-bit immediate field; that is inferred from the name only.
;
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
4946define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %val) #0 {
4947; GFX12-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos:
4948; GFX12:       ; %bb.0:
4949; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
4950; GFX12-NEXT:    s_wait_expcnt 0x0
4951; GFX12-NEXT:    s_wait_samplecnt 0x0
4952; GFX12-NEXT:    s_wait_bvhcnt 0x0
4953; GFX12-NEXT:    s_wait_kmcnt 0x0
4954; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7f8, v0
4955; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
4956; GFX12-NEXT:    s_mov_b64 s[0:1], src_private_base
4957; GFX12-NEXT:    s_mov_b32 s0, exec_lo
4958; GFX12-NEXT:    s_wait_alu 0xfffe
4959; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4960; GFX12-NEXT:    v_cmpx_ne_u32_e64 s1, v1
4961; GFX12-NEXT:    s_xor_b32 s0, exec_lo, s0
4962; GFX12-NEXT:    s_cbranch_execnz .LBB20_3
4963; GFX12-NEXT:  ; %bb.1: ; %Flow3
4964; GFX12-NEXT:    s_wait_alu 0xfffe
4965; GFX12-NEXT:    s_and_not1_saveexec_b32 s0, s0
4966; GFX12-NEXT:    s_cbranch_execnz .LBB20_6
4967; GFX12-NEXT:  .LBB20_2: ; %atomicrmw.phi
4968; GFX12-NEXT:    s_wait_alu 0xfffe
4969; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
4970; GFX12-NEXT:    s_wait_alu 0xfffe
4971; GFX12-NEXT:    s_setpc_b64 s[30:31]
4972; GFX12-NEXT:  .LBB20_3: ; %atomicrmw.global
4973; GFX12-NEXT:    flat_load_b64 v[6:7], v[0:1]
4974; GFX12-NEXT:    s_mov_b32 s1, 0
4975; GFX12-NEXT:  .LBB20_4: ; %atomicrmw.start
4976; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
4977; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
4978; GFX12-NEXT:    v_add_f64_e64 v[4:5], v[6:7], -v[2:3]
4979; GFX12-NEXT:    s_wait_storecnt 0x0
4980; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
4981; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
4982; GFX12-NEXT:    global_inv scope:SCOPE_DEV
4983; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
4984; GFX12-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
4985; GFX12-NEXT:    s_wait_alu 0xfffe
4986; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
4987; GFX12-NEXT:    s_wait_alu 0xfffe
4988; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
4989; GFX12-NEXT:    s_cbranch_execnz .LBB20_4
4990; GFX12-NEXT:  ; %bb.5: ; %Flow
4991; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
4992; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
4993; GFX12-NEXT:    ; implicit-def: $vgpr2_vgpr3
4994; GFX12-NEXT:    s_and_not1_saveexec_b32 s0, s0
4995; GFX12-NEXT:    s_cbranch_execz .LBB20_2
4996; GFX12-NEXT:  .LBB20_6: ; %atomicrmw.private
4997; GFX12-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
4998; GFX12-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
4999; GFX12-NEXT:    scratch_load_b64 v[0:1], v4, off
5000; GFX12-NEXT:    s_wait_loadcnt 0x0
5001; GFX12-NEXT:    v_add_f64_e64 v[0:1], v[0:1], -v[2:3]
5002; GFX12-NEXT:    scratch_store_b64 v4, v[0:1], off
5003; GFX12-NEXT:    s_wait_alu 0xfffe
5004; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
5005; GFX12-NEXT:    s_wait_alu 0xfffe
5006; GFX12-NEXT:    s_setpc_b64 s[30:31]
5007;
5008; GFX940-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos:
5009; GFX940:       ; %bb.0:
5010; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5011; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7f8
5012; GFX940-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
5013; GFX940-NEXT:    s_mov_b64 s[0:1], src_private_base
5014; GFX940-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
5015; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], vcc
5016; GFX940-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
5017; GFX940-NEXT:    s_cbranch_execnz .LBB20_3
5018; GFX940-NEXT:  ; %bb.1: ; %Flow3
5019; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
5020; GFX940-NEXT:    s_cbranch_execnz .LBB20_6
5021; GFX940-NEXT:  .LBB20_2: ; %atomicrmw.phi
5022; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
5023; GFX940-NEXT:    s_setpc_b64 s[30:31]
5024; GFX940-NEXT:  .LBB20_3: ; %atomicrmw.global
5025; GFX940-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
5026; GFX940-NEXT:    s_mov_b64 s[2:3], 0
5027; GFX940-NEXT:  .LBB20_4: ; %atomicrmw.start
5028; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
5029; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5030; GFX940-NEXT:    v_add_f64 v[4:5], v[6:7], -v[2:3]
5031; GFX940-NEXT:    buffer_wbl2 sc1
5032; GFX940-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0
5033; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5034; GFX940-NEXT:    buffer_inv sc1
5035; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
5036; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
5037; GFX940-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
5038; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
5039; GFX940-NEXT:    s_cbranch_execnz .LBB20_4
5040; GFX940-NEXT:  ; %bb.5: ; %Flow
5041; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
5042; GFX940-NEXT:    ; implicit-def: $vgpr0_vgpr1
5043; GFX940-NEXT:    ; implicit-def: $vgpr2_vgpr3
5044; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
5045; GFX940-NEXT:    s_cbranch_execz .LBB20_2
5046; GFX940-NEXT:  .LBB20_6: ; %atomicrmw.private
5047; GFX940-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
5048; GFX940-NEXT:    s_nop 1
5049; GFX940-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
5050; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v4, off
5051; GFX940-NEXT:    s_waitcnt vmcnt(0)
5052; GFX940-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
5053; GFX940-NEXT:    scratch_store_dwordx2 v4, v[0:1], off sc0 sc1
5054; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
5055; GFX940-NEXT:    s_waitcnt vmcnt(0)
5056; GFX940-NEXT:    s_setpc_b64 s[30:31]
5057;
5058; GFX11-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos:
5059; GFX11:       ; %bb.0:
5060; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5061; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7f8, v0
5062; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
5063; GFX11-NEXT:    s_mov_b64 s[0:1], src_private_base
5064; GFX11-NEXT:    s_mov_b32 s0, exec_lo
5065; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5066; GFX11-NEXT:    v_cmpx_ne_u32_e64 s1, v1
5067; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
5068; GFX11-NEXT:    s_cbranch_execnz .LBB20_3
5069; GFX11-NEXT:  ; %bb.1: ; %Flow3
5070; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
5071; GFX11-NEXT:    s_cbranch_execnz .LBB20_6
5072; GFX11-NEXT:  .LBB20_2: ; %atomicrmw.phi
5073; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
5074; GFX11-NEXT:    s_setpc_b64 s[30:31]
5075; GFX11-NEXT:  .LBB20_3: ; %atomicrmw.global
5076; GFX11-NEXT:    flat_load_b64 v[6:7], v[0:1]
5077; GFX11-NEXT:    s_mov_b32 s1, 0
5078; GFX11-NEXT:  .LBB20_4: ; %atomicrmw.start
5079; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
5080; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5081; GFX11-NEXT:    v_add_f64 v[4:5], v[6:7], -v[2:3]
5082; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
5083; GFX11-NEXT:    flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc
5084; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5085; GFX11-NEXT:    buffer_gl1_inv
5086; GFX11-NEXT:    buffer_gl0_inv
5087; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
5088; GFX11-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
5089; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
5090; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
5091; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
5092; GFX11-NEXT:    s_cbranch_execnz .LBB20_4
5093; GFX11-NEXT:  ; %bb.5: ; %Flow
5094; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
5095; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1
5096; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3
5097; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
5098; GFX11-NEXT:    s_cbranch_execz .LBB20_2
5099; GFX11-NEXT:  .LBB20_6: ; %atomicrmw.private
5100; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
5101; GFX11-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
5102; GFX11-NEXT:    scratch_load_b64 v[0:1], v4, off
5103; GFX11-NEXT:    s_waitcnt vmcnt(0)
5104; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
5105; GFX11-NEXT:    scratch_store_b64 v4, v[0:1], off
5106; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
5107; GFX11-NEXT:    s_setpc_b64 s[30:31]
5108;
5109; GFX10-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos:
5110; GFX10:       ; %bb.0:
5111; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5112; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7f8, v0
5113; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
5114; GFX10-NEXT:    s_mov_b64 s[4:5], src_private_base
5115; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v1
5116; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
5117; GFX10-NEXT:    s_xor_b32 s4, exec_lo, s4
5118; GFX10-NEXT:    s_cbranch_execnz .LBB20_3
5119; GFX10-NEXT:  ; %bb.1: ; %Flow3
5120; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
5121; GFX10-NEXT:    s_cbranch_execnz .LBB20_6
5122; GFX10-NEXT:  .LBB20_2: ; %atomicrmw.phi
5123; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
5124; GFX10-NEXT:    s_setpc_b64 s[30:31]
5125; GFX10-NEXT:  .LBB20_3: ; %atomicrmw.global
5126; GFX10-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
5127; GFX10-NEXT:    s_mov_b32 s5, 0
5128; GFX10-NEXT:  .LBB20_4: ; %atomicrmw.start
5129; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
5130; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5131; GFX10-NEXT:    v_add_f64 v[4:5], v[6:7], -v[2:3]
5132; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
5133; GFX10-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
5134; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5135; GFX10-NEXT:    buffer_gl1_inv
5136; GFX10-NEXT:    buffer_gl0_inv
5137; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
5138; GFX10-NEXT:    v_mov_b32_e32 v7, v5
5139; GFX10-NEXT:    v_mov_b32_e32 v6, v4
5140; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
5141; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
5142; GFX10-NEXT:    s_cbranch_execnz .LBB20_4
5143; GFX10-NEXT:  ; %bb.5: ; %Flow
5144; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
5145; GFX10-NEXT:    ; implicit-def: $vgpr0_vgpr1
5146; GFX10-NEXT:    ; implicit-def: $vgpr2_vgpr3
5147; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
5148; GFX10-NEXT:    s_cbranch_execz .LBB20_2
5149; GFX10-NEXT:  .LBB20_6: ; %atomicrmw.private
5150; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
5151; GFX10-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
5152; GFX10-NEXT:    s_clause 0x1
5153; GFX10-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
5154; GFX10-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
5155; GFX10-NEXT:    s_waitcnt vmcnt(0)
5156; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
5157; GFX10-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
5158; GFX10-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
5159; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
5160; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
5161; GFX10-NEXT:    s_setpc_b64 s[30:31]
5162;
5163; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos:
5164; GFX90A:       ; %bb.0:
5165; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5166; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7f8, v0
5167; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
5168; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
5169; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
5170; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5171; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
5172; GFX90A-NEXT:    s_cbranch_execnz .LBB20_3
5173; GFX90A-NEXT:  ; %bb.1: ; %Flow3
5174; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
5175; GFX90A-NEXT:    s_cbranch_execnz .LBB20_6
5176; GFX90A-NEXT:  .LBB20_2: ; %atomicrmw.phi
5177; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
5178; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5179; GFX90A-NEXT:  .LBB20_3: ; %atomicrmw.global
5180; GFX90A-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
5181; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
5182; GFX90A-NEXT:  .LBB20_4: ; %atomicrmw.start
5183; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
5184; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5185; GFX90A-NEXT:    v_add_f64 v[4:5], v[6:7], -v[2:3]
5186; GFX90A-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
5187; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5188; GFX90A-NEXT:    buffer_wbinvl1
5189; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
5190; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
5191; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
5192; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
5193; GFX90A-NEXT:    s_cbranch_execnz .LBB20_4
5194; GFX90A-NEXT:  ; %bb.5: ; %Flow
5195; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
5196; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
5197; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
5198; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
5199; GFX90A-NEXT:    s_cbranch_execz .LBB20_2
5200; GFX90A-NEXT:  .LBB20_6: ; %atomicrmw.private
5201; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
5202; GFX90A-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
5203; GFX90A-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
5204; GFX90A-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
5205; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5206; GFX90A-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
5207; GFX90A-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
5208; GFX90A-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
5209; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
5210; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5211; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5212;
5213; GFX908-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos:
5214; GFX908:       ; %bb.0:
5215; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5216; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7f8, v0
5217; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
5218; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
5219; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
5220; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5221; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
5222; GFX908-NEXT:    s_cbranch_execnz .LBB20_3
5223; GFX908-NEXT:  ; %bb.1: ; %Flow3
5224; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
5225; GFX908-NEXT:    s_cbranch_execnz .LBB20_6
5226; GFX908-NEXT:  .LBB20_2: ; %atomicrmw.phi
5227; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
5228; GFX908-NEXT:    s_setpc_b64 s[30:31]
5229; GFX908-NEXT:  .LBB20_3: ; %atomicrmw.global
5230; GFX908-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
5231; GFX908-NEXT:    s_mov_b64 s[6:7], 0
5232; GFX908-NEXT:  .LBB20_4: ; %atomicrmw.start
5233; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
5234; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5235; GFX908-NEXT:    v_add_f64 v[4:5], v[6:7], -v[2:3]
5236; GFX908-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
5237; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5238; GFX908-NEXT:    buffer_wbinvl1
5239; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
5240; GFX908-NEXT:    v_mov_b32_e32 v7, v5
5241; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
5242; GFX908-NEXT:    v_mov_b32_e32 v6, v4
5243; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
5244; GFX908-NEXT:    s_cbranch_execnz .LBB20_4
5245; GFX908-NEXT:  ; %bb.5: ; %Flow
5246; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
5247; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
5248; GFX908-NEXT:    ; implicit-def: $vgpr2_vgpr3
5249; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
5250; GFX908-NEXT:    s_cbranch_execz .LBB20_2
5251; GFX908-NEXT:  .LBB20_6: ; %atomicrmw.private
5252; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
5253; GFX908-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
5254; GFX908-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
5255; GFX908-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
5256; GFX908-NEXT:    s_waitcnt vmcnt(0)
5257; GFX908-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
5258; GFX908-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
5259; GFX908-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
5260; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
5261; GFX908-NEXT:    s_waitcnt vmcnt(0)
5262; GFX908-NEXT:    s_setpc_b64 s[30:31]
5263;
5264; GFX8-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos:
5265; GFX8:       ; %bb.0:
5266; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5267; GFX8-NEXT:    s_mov_b64 s[4:5], 0xc0
5268; GFX8-NEXT:    s_load_dword s4, s[4:5], 0x0
5269; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7f8, v0
5270; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
5271; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5272; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v1
5273; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5274; GFX8-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
5275; GFX8-NEXT:    s_cbranch_execnz .LBB20_3
5276; GFX8-NEXT:  ; %bb.1: ; %Flow3
5277; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
5278; GFX8-NEXT:    s_cbranch_execnz .LBB20_6
5279; GFX8-NEXT:  .LBB20_2: ; %atomicrmw.phi
5280; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
5281; GFX8-NEXT:    s_setpc_b64 s[30:31]
5282; GFX8-NEXT:  .LBB20_3: ; %atomicrmw.global
5283; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
5284; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
5285; GFX8-NEXT:    flat_load_dword v7, v[4:5]
5286; GFX8-NEXT:    flat_load_dword v6, v[0:1]
5287; GFX8-NEXT:    s_mov_b64 s[6:7], 0
5288; GFX8-NEXT:  .LBB20_4: ; %atomicrmw.start
5289; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
5290; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5291; GFX8-NEXT:    v_add_f64 v[4:5], v[6:7], -v[2:3]
5292; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
5293; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5294; GFX8-NEXT:    buffer_wbinvl1
5295; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
5296; GFX8-NEXT:    v_mov_b32_e32 v7, v5
5297; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
5298; GFX8-NEXT:    v_mov_b32_e32 v6, v4
5299; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
5300; GFX8-NEXT:    s_cbranch_execnz .LBB20_4
5301; GFX8-NEXT:  ; %bb.5: ; %Flow
5302; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
5303; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
5304; GFX8-NEXT:    ; implicit-def: $vgpr2_vgpr3
5305; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
5306; GFX8-NEXT:    s_cbranch_execz .LBB20_2
5307; GFX8-NEXT:  .LBB20_6: ; %atomicrmw.private
5308; GFX8-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
5309; GFX8-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
5310; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 4, v4
5311; GFX8-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
5312; GFX8-NEXT:    buffer_load_dword v1, v5, s[0:3], 0 offen
5313; GFX8-NEXT:    s_waitcnt vmcnt(0)
5314; GFX8-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
5315; GFX8-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
5316; GFX8-NEXT:    buffer_store_dword v1, v5, s[0:3], 0 offen
5317; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
5318; GFX8-NEXT:    s_waitcnt vmcnt(0)
5319; GFX8-NEXT:    s_setpc_b64 s[30:31]
5320;
5321; GFX7-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos:
5322; GFX7:       ; %bb.0:
5323; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5324; GFX7-NEXT:    s_mov_b64 s[4:5], 0xc0
5325; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
5326; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7f8, v0
5327; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
5328; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5329; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v1
5330; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5331; GFX7-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
5332; GFX7-NEXT:    s_cbranch_execnz .LBB20_3
5333; GFX7-NEXT:  ; %bb.1: ; %Flow3
5334; GFX7-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
5335; GFX7-NEXT:    s_cbranch_execnz .LBB20_6
5336; GFX7-NEXT:  .LBB20_2: ; %atomicrmw.phi
5337; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
5338; GFX7-NEXT:    s_setpc_b64 s[30:31]
5339; GFX7-NEXT:  .LBB20_3: ; %atomicrmw.global
5340; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 4, v0
5341; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
5342; GFX7-NEXT:    flat_load_dword v7, v[4:5]
5343; GFX7-NEXT:    flat_load_dword v6, v[0:1]
5344; GFX7-NEXT:    s_mov_b64 s[6:7], 0
5345; GFX7-NEXT:  .LBB20_4: ; %atomicrmw.start
5346; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
5347; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5348; GFX7-NEXT:    v_add_f64 v[4:5], v[6:7], -v[2:3]
5349; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
5350; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5351; GFX7-NEXT:    buffer_wbinvl1
5352; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
5353; GFX7-NEXT:    v_mov_b32_e32 v7, v5
5354; GFX7-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
5355; GFX7-NEXT:    v_mov_b32_e32 v6, v4
5356; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
5357; GFX7-NEXT:    s_cbranch_execnz .LBB20_4
5358; GFX7-NEXT:  ; %bb.5: ; %Flow
5359; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
5360; GFX7-NEXT:    ; implicit-def: $vgpr0_vgpr1
5361; GFX7-NEXT:    ; implicit-def: $vgpr2_vgpr3
5362; GFX7-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
5363; GFX7-NEXT:    s_cbranch_execz .LBB20_2
5364; GFX7-NEXT:  .LBB20_6: ; %atomicrmw.private
5365; GFX7-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
5366; GFX7-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
5367; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 4, v4
5368; GFX7-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
5369; GFX7-NEXT:    buffer_load_dword v1, v5, s[0:3], 0 offen
5370; GFX7-NEXT:    s_waitcnt vmcnt(0)
5371; GFX7-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
5372; GFX7-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
5373; GFX7-NEXT:    buffer_store_dword v1, v5, s[0:3], 0 offen
5374; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
5375; GFX7-NEXT:    s_waitcnt vmcnt(0)
5376; GFX7-NEXT:    s_setpc_b64 s[30:31]
; Index 255 on a double-typed GEP is the 0x7f8 byte constant that each
; expected-assembly section above adds to the incoming address.
5377  %gep = getelementptr double, ptr %ptr, i64 255
; The result is deliberately left unused; this is the "noret" form of the test.
5378  %unused = atomicrmw fsub ptr %gep, double %val syncscope("agent") seq_cst
5379  ret void
5380}
5381
; Exercises an agent-scope, seq_cst `atomicrmw fsub` on a double whose result
; is discarded, through a flat pointer displaced by -256 elements. In the
; assertions below every tested target applies the displacement with explicit
; 64-bit address arithmetic before the flat access, then splits into an
; %atomicrmw.global compare-and-swap loop and an %atomicrmw.private
; load / add / store sequence.
5382define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %val) #0 {
5383; GFX12-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg:
5384; GFX12:       ; %bb.0:
5385; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
5386; GFX12-NEXT:    s_wait_expcnt 0x0
5387; GFX12-NEXT:    s_wait_samplecnt 0x0
5388; GFX12-NEXT:    s_wait_bvhcnt 0x0
5389; GFX12-NEXT:    s_wait_kmcnt 0x0
5390; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
5391; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
5392; GFX12-NEXT:    s_mov_b64 s[0:1], src_private_base
5393; GFX12-NEXT:    s_mov_b32 s0, exec_lo
5394; GFX12-NEXT:    s_wait_alu 0xfffe
5395; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5396; GFX12-NEXT:    v_cmpx_ne_u32_e64 s1, v1
5397; GFX12-NEXT:    s_xor_b32 s0, exec_lo, s0
5398; GFX12-NEXT:    s_cbranch_execnz .LBB21_3
5399; GFX12-NEXT:  ; %bb.1: ; %Flow3
5400; GFX12-NEXT:    s_wait_alu 0xfffe
5401; GFX12-NEXT:    s_and_not1_saveexec_b32 s0, s0
5402; GFX12-NEXT:    s_cbranch_execnz .LBB21_6
5403; GFX12-NEXT:  .LBB21_2: ; %atomicrmw.phi
5404; GFX12-NEXT:    s_wait_alu 0xfffe
5405; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
5406; GFX12-NEXT:    s_wait_alu 0xfffe
5407; GFX12-NEXT:    s_setpc_b64 s[30:31]
5408; GFX12-NEXT:  .LBB21_3: ; %atomicrmw.global
5409; GFX12-NEXT:    flat_load_b64 v[6:7], v[0:1]
5410; GFX12-NEXT:    s_mov_b32 s1, 0
5411; GFX12-NEXT:  .LBB21_4: ; %atomicrmw.start
5412; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
5413; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
5414; GFX12-NEXT:    v_add_f64_e64 v[4:5], v[6:7], -v[2:3]
5415; GFX12-NEXT:    s_wait_storecnt 0x0
5416; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
5417; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
5418; GFX12-NEXT:    global_inv scope:SCOPE_DEV
5419; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
5420; GFX12-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
5421; GFX12-NEXT:    s_wait_alu 0xfffe
5422; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
5423; GFX12-NEXT:    s_wait_alu 0xfffe
5424; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
5425; GFX12-NEXT:    s_cbranch_execnz .LBB21_4
5426; GFX12-NEXT:  ; %bb.5: ; %Flow
5427; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
5428; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
5429; GFX12-NEXT:    ; implicit-def: $vgpr2_vgpr3
5430; GFX12-NEXT:    s_and_not1_saveexec_b32 s0, s0
5431; GFX12-NEXT:    s_cbranch_execz .LBB21_2
5432; GFX12-NEXT:  .LBB21_6: ; %atomicrmw.private
5433; GFX12-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
5434; GFX12-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
5435; GFX12-NEXT:    scratch_load_b64 v[0:1], v4, off
5436; GFX12-NEXT:    s_wait_loadcnt 0x0
5437; GFX12-NEXT:    v_add_f64_e64 v[0:1], v[0:1], -v[2:3]
5438; GFX12-NEXT:    scratch_store_b64 v4, v[0:1], off
5439; GFX12-NEXT:    s_wait_alu 0xfffe
5440; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
5441; GFX12-NEXT:    s_wait_alu 0xfffe
5442; GFX12-NEXT:    s_setpc_b64 s[30:31]
5443;
5444; GFX940-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg:
5445; GFX940:       ; %bb.0:
5446; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5447; GFX940-NEXT:    s_movk_i32 s0, 0xf800
5448; GFX940-NEXT:    s_mov_b32 s1, -1
5449; GFX940-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
5450; GFX940-NEXT:    s_mov_b64 s[0:1], src_private_base
5451; GFX940-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
5452; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], vcc
5453; GFX940-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
5454; GFX940-NEXT:    s_cbranch_execnz .LBB21_3
5455; GFX940-NEXT:  ; %bb.1: ; %Flow3
5456; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
5457; GFX940-NEXT:    s_cbranch_execnz .LBB21_6
5458; GFX940-NEXT:  .LBB21_2: ; %atomicrmw.phi
5459; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
5460; GFX940-NEXT:    s_setpc_b64 s[30:31]
5461; GFX940-NEXT:  .LBB21_3: ; %atomicrmw.global
5462; GFX940-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
5463; GFX940-NEXT:    s_mov_b64 s[2:3], 0
5464; GFX940-NEXT:  .LBB21_4: ; %atomicrmw.start
5465; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
5466; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5467; GFX940-NEXT:    v_add_f64 v[4:5], v[6:7], -v[2:3]
5468; GFX940-NEXT:    buffer_wbl2 sc1
5469; GFX940-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0
5470; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5471; GFX940-NEXT:    buffer_inv sc1
5472; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
5473; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
5474; GFX940-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
5475; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
5476; GFX940-NEXT:    s_cbranch_execnz .LBB21_4
5477; GFX940-NEXT:  ; %bb.5: ; %Flow
5478; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
5479; GFX940-NEXT:    ; implicit-def: $vgpr0_vgpr1
5480; GFX940-NEXT:    ; implicit-def: $vgpr2_vgpr3
5481; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
5482; GFX940-NEXT:    s_cbranch_execz .LBB21_2
5483; GFX940-NEXT:  .LBB21_6: ; %atomicrmw.private
5484; GFX940-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
5485; GFX940-NEXT:    s_nop 1
5486; GFX940-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
5487; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v4, off
5488; GFX940-NEXT:    s_waitcnt vmcnt(0)
5489; GFX940-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
5490; GFX940-NEXT:    scratch_store_dwordx2 v4, v[0:1], off sc0 sc1
5491; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
5492; GFX940-NEXT:    s_waitcnt vmcnt(0)
5493; GFX940-NEXT:    s_setpc_b64 s[30:31]
5494;
5495; GFX11-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg:
5496; GFX11:       ; %bb.0:
5497; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5498; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
5499; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
5500; GFX11-NEXT:    s_mov_b64 s[0:1], src_private_base
5501; GFX11-NEXT:    s_mov_b32 s0, exec_lo
5502; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5503; GFX11-NEXT:    v_cmpx_ne_u32_e64 s1, v1
5504; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
5505; GFX11-NEXT:    s_cbranch_execnz .LBB21_3
5506; GFX11-NEXT:  ; %bb.1: ; %Flow3
5507; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
5508; GFX11-NEXT:    s_cbranch_execnz .LBB21_6
5509; GFX11-NEXT:  .LBB21_2: ; %atomicrmw.phi
5510; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
5511; GFX11-NEXT:    s_setpc_b64 s[30:31]
5512; GFX11-NEXT:  .LBB21_3: ; %atomicrmw.global
5513; GFX11-NEXT:    flat_load_b64 v[6:7], v[0:1]
5514; GFX11-NEXT:    s_mov_b32 s1, 0
5515; GFX11-NEXT:  .LBB21_4: ; %atomicrmw.start
5516; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
5517; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5518; GFX11-NEXT:    v_add_f64 v[4:5], v[6:7], -v[2:3]
5519; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
5520; GFX11-NEXT:    flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc
5521; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5522; GFX11-NEXT:    buffer_gl1_inv
5523; GFX11-NEXT:    buffer_gl0_inv
5524; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
5525; GFX11-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
5526; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
5527; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
5528; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
5529; GFX11-NEXT:    s_cbranch_execnz .LBB21_4
5530; GFX11-NEXT:  ; %bb.5: ; %Flow
5531; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
5532; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1
5533; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3
5534; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
5535; GFX11-NEXT:    s_cbranch_execz .LBB21_2
5536; GFX11-NEXT:  .LBB21_6: ; %atomicrmw.private
5537; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
5538; GFX11-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
5539; GFX11-NEXT:    scratch_load_b64 v[0:1], v4, off
5540; GFX11-NEXT:    s_waitcnt vmcnt(0)
5541; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
5542; GFX11-NEXT:    scratch_store_b64 v4, v[0:1], off
5543; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
5544; GFX11-NEXT:    s_setpc_b64 s[30:31]
5545;
5546; GFX10-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg:
5547; GFX10:       ; %bb.0:
5548; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5549; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
5550; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
5551; GFX10-NEXT:    s_mov_b64 s[4:5], src_private_base
5552; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v1
5553; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
5554; GFX10-NEXT:    s_xor_b32 s4, exec_lo, s4
5555; GFX10-NEXT:    s_cbranch_execnz .LBB21_3
5556; GFX10-NEXT:  ; %bb.1: ; %Flow3
5557; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
5558; GFX10-NEXT:    s_cbranch_execnz .LBB21_6
5559; GFX10-NEXT:  .LBB21_2: ; %atomicrmw.phi
5560; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
5561; GFX10-NEXT:    s_setpc_b64 s[30:31]
5562; GFX10-NEXT:  .LBB21_3: ; %atomicrmw.global
5563; GFX10-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
5564; GFX10-NEXT:    s_mov_b32 s5, 0
5565; GFX10-NEXT:  .LBB21_4: ; %atomicrmw.start
5566; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
5567; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5568; GFX10-NEXT:    v_add_f64 v[4:5], v[6:7], -v[2:3]
5569; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
5570; GFX10-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
5571; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5572; GFX10-NEXT:    buffer_gl1_inv
5573; GFX10-NEXT:    buffer_gl0_inv
5574; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
5575; GFX10-NEXT:    v_mov_b32_e32 v7, v5
5576; GFX10-NEXT:    v_mov_b32_e32 v6, v4
5577; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
5578; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
5579; GFX10-NEXT:    s_cbranch_execnz .LBB21_4
5580; GFX10-NEXT:  ; %bb.5: ; %Flow
5581; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
5582; GFX10-NEXT:    ; implicit-def: $vgpr0_vgpr1
5583; GFX10-NEXT:    ; implicit-def: $vgpr2_vgpr3
5584; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
5585; GFX10-NEXT:    s_cbranch_execz .LBB21_2
5586; GFX10-NEXT:  .LBB21_6: ; %atomicrmw.private
5587; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
5588; GFX10-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
5589; GFX10-NEXT:    s_clause 0x1
5590; GFX10-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
5591; GFX10-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
5592; GFX10-NEXT:    s_waitcnt vmcnt(0)
5593; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
5594; GFX10-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
5595; GFX10-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
5596; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
5597; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
5598; GFX10-NEXT:    s_setpc_b64 s[30:31]
5599;
5600; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg:
5601; GFX90A:       ; %bb.0:
5602; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5603; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
5604; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
5605; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
5606; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
5607; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5608; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
5609; GFX90A-NEXT:    s_cbranch_execnz .LBB21_3
5610; GFX90A-NEXT:  ; %bb.1: ; %Flow3
5611; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
5612; GFX90A-NEXT:    s_cbranch_execnz .LBB21_6
5613; GFX90A-NEXT:  .LBB21_2: ; %atomicrmw.phi
5614; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
5615; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5616; GFX90A-NEXT:  .LBB21_3: ; %atomicrmw.global
5617; GFX90A-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
5618; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
5619; GFX90A-NEXT:  .LBB21_4: ; %atomicrmw.start
5620; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
5621; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5622; GFX90A-NEXT:    v_add_f64 v[4:5], v[6:7], -v[2:3]
5623; GFX90A-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
5624; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5625; GFX90A-NEXT:    buffer_wbinvl1
5626; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
5627; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
5628; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
5629; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
5630; GFX90A-NEXT:    s_cbranch_execnz .LBB21_4
5631; GFX90A-NEXT:  ; %bb.5: ; %Flow
5632; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
5633; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
5634; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
5635; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
5636; GFX90A-NEXT:    s_cbranch_execz .LBB21_2
5637; GFX90A-NEXT:  .LBB21_6: ; %atomicrmw.private
5638; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
5639; GFX90A-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
5640; GFX90A-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
5641; GFX90A-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
5642; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5643; GFX90A-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
5644; GFX90A-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
5645; GFX90A-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
5646; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
5647; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5648; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5649;
5650; GFX908-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg:
5651; GFX908:       ; %bb.0:
5652; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5653; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
5654; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
5655; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
5656; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
5657; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5658; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
5659; GFX908-NEXT:    s_cbranch_execnz .LBB21_3
5660; GFX908-NEXT:  ; %bb.1: ; %Flow3
5661; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
5662; GFX908-NEXT:    s_cbranch_execnz .LBB21_6
5663; GFX908-NEXT:  .LBB21_2: ; %atomicrmw.phi
5664; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
5665; GFX908-NEXT:    s_setpc_b64 s[30:31]
5666; GFX908-NEXT:  .LBB21_3: ; %atomicrmw.global
5667; GFX908-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
5668; GFX908-NEXT:    s_mov_b64 s[6:7], 0
5669; GFX908-NEXT:  .LBB21_4: ; %atomicrmw.start
5670; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
5671; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5672; GFX908-NEXT:    v_add_f64 v[4:5], v[6:7], -v[2:3]
5673; GFX908-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
5674; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5675; GFX908-NEXT:    buffer_wbinvl1
5676; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
5677; GFX908-NEXT:    v_mov_b32_e32 v7, v5
5678; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
5679; GFX908-NEXT:    v_mov_b32_e32 v6, v4
5680; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
5681; GFX908-NEXT:    s_cbranch_execnz .LBB21_4
5682; GFX908-NEXT:  ; %bb.5: ; %Flow
5683; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
5684; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
5685; GFX908-NEXT:    ; implicit-def: $vgpr2_vgpr3
5686; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
5687; GFX908-NEXT:    s_cbranch_execz .LBB21_2
5688; GFX908-NEXT:  .LBB21_6: ; %atomicrmw.private
5689; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
5690; GFX908-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
5691; GFX908-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
5692; GFX908-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
5693; GFX908-NEXT:    s_waitcnt vmcnt(0)
5694; GFX908-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
5695; GFX908-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
5696; GFX908-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
5697; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
5698; GFX908-NEXT:    s_waitcnt vmcnt(0)
5699; GFX908-NEXT:    s_setpc_b64 s[30:31]
5700;
5701; GFX8-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg:
5702; GFX8:       ; %bb.0:
5703; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5704; GFX8-NEXT:    s_mov_b64 s[4:5], 0xc0
5705; GFX8-NEXT:    s_load_dword s4, s[4:5], 0x0
5706; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
5707; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
5708; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5709; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v1
5710; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5711; GFX8-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
5712; GFX8-NEXT:    s_cbranch_execnz .LBB21_3
5713; GFX8-NEXT:  ; %bb.1: ; %Flow3
5714; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
5715; GFX8-NEXT:    s_cbranch_execnz .LBB21_6
5716; GFX8-NEXT:  .LBB21_2: ; %atomicrmw.phi
5717; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
5718; GFX8-NEXT:    s_setpc_b64 s[30:31]
5719; GFX8-NEXT:  .LBB21_3: ; %atomicrmw.global
5720; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
5721; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
5722; GFX8-NEXT:    flat_load_dword v7, v[4:5]
5723; GFX8-NEXT:    flat_load_dword v6, v[0:1]
5724; GFX8-NEXT:    s_mov_b64 s[6:7], 0
5725; GFX8-NEXT:  .LBB21_4: ; %atomicrmw.start
5726; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
5727; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5728; GFX8-NEXT:    v_add_f64 v[4:5], v[6:7], -v[2:3]
5729; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
5730; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5731; GFX8-NEXT:    buffer_wbinvl1
5732; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
5733; GFX8-NEXT:    v_mov_b32_e32 v7, v5
5734; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
5735; GFX8-NEXT:    v_mov_b32_e32 v6, v4
5736; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
5737; GFX8-NEXT:    s_cbranch_execnz .LBB21_4
5738; GFX8-NEXT:  ; %bb.5: ; %Flow
5739; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
5740; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
5741; GFX8-NEXT:    ; implicit-def: $vgpr2_vgpr3
5742; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
5743; GFX8-NEXT:    s_cbranch_execz .LBB21_2
5744; GFX8-NEXT:  .LBB21_6: ; %atomicrmw.private
5745; GFX8-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
5746; GFX8-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
5747; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 4, v4
5748; GFX8-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
5749; GFX8-NEXT:    buffer_load_dword v1, v5, s[0:3], 0 offen
5750; GFX8-NEXT:    s_waitcnt vmcnt(0)
5751; GFX8-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
5752; GFX8-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
5753; GFX8-NEXT:    buffer_store_dword v1, v5, s[0:3], 0 offen
5754; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
5755; GFX8-NEXT:    s_waitcnt vmcnt(0)
5756; GFX8-NEXT:    s_setpc_b64 s[30:31]
5757;
5758; GFX7-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg:
5759; GFX7:       ; %bb.0:
5760; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5761; GFX7-NEXT:    s_mov_b64 s[4:5], 0xc0
5762; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
5763; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0xfffff800, v0
5764; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
5765; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5766; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v1
5767; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5768; GFX7-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
5769; GFX7-NEXT:    s_cbranch_execnz .LBB21_3
5770; GFX7-NEXT:  ; %bb.1: ; %Flow3
5771; GFX7-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
5772; GFX7-NEXT:    s_cbranch_execnz .LBB21_6
5773; GFX7-NEXT:  .LBB21_2: ; %atomicrmw.phi
5774; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
5775; GFX7-NEXT:    s_setpc_b64 s[30:31]
5776; GFX7-NEXT:  .LBB21_3: ; %atomicrmw.global
5777; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 4, v0
5778; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
5779; GFX7-NEXT:    flat_load_dword v7, v[4:5]
5780; GFX7-NEXT:    flat_load_dword v6, v[0:1]
5781; GFX7-NEXT:    s_mov_b64 s[6:7], 0
5782; GFX7-NEXT:  .LBB21_4: ; %atomicrmw.start
5783; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
5784; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5785; GFX7-NEXT:    v_add_f64 v[4:5], v[6:7], -v[2:3]
5786; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
5787; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5788; GFX7-NEXT:    buffer_wbinvl1
5789; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
5790; GFX7-NEXT:    v_mov_b32_e32 v7, v5
5791; GFX7-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
5792; GFX7-NEXT:    v_mov_b32_e32 v6, v4
5793; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
5794; GFX7-NEXT:    s_cbranch_execnz .LBB21_4
5795; GFX7-NEXT:  ; %bb.5: ; %Flow
5796; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
5797; GFX7-NEXT:    ; implicit-def: $vgpr0_vgpr1
5798; GFX7-NEXT:    ; implicit-def: $vgpr2_vgpr3
5799; GFX7-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
5800; GFX7-NEXT:    s_cbranch_execz .LBB21_2
5801; GFX7-NEXT:  .LBB21_6: ; %atomicrmw.private
5802; GFX7-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
5803; GFX7-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
5804; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 4, v4
5805; GFX7-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
5806; GFX7-NEXT:    buffer_load_dword v1, v5, s[0:3], 0 offen
5807; GFX7-NEXT:    s_waitcnt vmcnt(0)
5808; GFX7-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
5809; GFX7-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
5810; GFX7-NEXT:    buffer_store_dword v1, v5, s[0:3], 0 offen
5811; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
5812; GFX7-NEXT:    s_waitcnt vmcnt(0)
5813; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; -256 doubles * 8 bytes = -2048 bytes. Most targets above add this as the
  ; constant pair 0xfffff800 (low word) and -1 (high word) with a carry chain.
5814  %gep = getelementptr double, ptr %ptr, i64 -256
  ; The old value is deliberately left unused: this is the no-return variant,
  ; and the function returns void.
5815  %unused = atomicrmw fsub ptr %gep, double %val syncscope("agent") seq_cst
5816  ret void
5817}
5818
5819; --------------------------------------------------------------------
5820; half
5821; --------------------------------------------------------------------
5822
5823define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 {
5824; GFX12-LABEL: flat_agent_atomic_fsub_ret_f16:
5825; GFX12:       ; %bb.0:
5826; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
5827; GFX12-NEXT:    s_wait_expcnt 0x0
5828; GFX12-NEXT:    s_wait_samplecnt 0x0
5829; GFX12-NEXT:    s_wait_bvhcnt 0x0
5830; GFX12-NEXT:    s_wait_kmcnt 0x0
5831; GFX12-NEXT:    v_mov_b32_e32 v3, v0
5832; GFX12-NEXT:    s_mov_b32 s0, 0
5833; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
5834; GFX12-NEXT:    v_and_b32_e32 v0, -4, v3
5835; GFX12-NEXT:    v_and_b32_e32 v3, 3, v3
5836; GFX12-NEXT:    flat_load_b32 v5, v[0:1]
5837; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
5838; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
5839; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5840; GFX12-NEXT:    v_not_b32_e32 v4, v4
5841; GFX12-NEXT:  .LBB22_1: ; %atomicrmw.start
5842; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
5843; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
5844; GFX12-NEXT:    v_mov_b32_e32 v6, v5
5845; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5846; GFX12-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
5847; GFX12-NEXT:    v_sub_f16_e32 v5, v5, v2
5848; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5849; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff, v5
5850; GFX12-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
5851; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5852; GFX12-NEXT:    v_and_or_b32 v5, v6, v4, v5
5853; GFX12-NEXT:    s_wait_storecnt 0x0
5854; GFX12-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
5855; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
5856; GFX12-NEXT:    global_inv scope:SCOPE_DEV
5857; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
5858; GFX12-NEXT:    s_wait_alu 0xfffe
5859; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
5860; GFX12-NEXT:    s_wait_alu 0xfffe
5861; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
5862; GFX12-NEXT:    s_cbranch_execnz .LBB22_1
5863; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
5864; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
5865; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
5866; GFX12-NEXT:    s_wait_alu 0xfffe
5867; GFX12-NEXT:    s_setpc_b64 s[30:31]
5868;
5869; GFX940-LABEL: flat_agent_atomic_fsub_ret_f16:
5870; GFX940:       ; %bb.0:
5871; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5872; GFX940-NEXT:    v_mov_b32_e32 v3, v0
5873; GFX940-NEXT:    v_and_b32_e32 v0, -4, v3
5874; GFX940-NEXT:    flat_load_dword v4, v[0:1]
5875; GFX940-NEXT:    v_and_b32_e32 v3, 3, v3
5876; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
5877; GFX940-NEXT:    s_mov_b32 s0, 0xffff
5878; GFX940-NEXT:    v_lshlrev_b32_e64 v5, v3, s0
5879; GFX940-NEXT:    v_not_b32_e32 v5, v5
5880; GFX940-NEXT:    s_mov_b64 s[0:1], 0
5881; GFX940-NEXT:  .LBB22_1: ; %atomicrmw.start
5882; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
5883; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5884; GFX940-NEXT:    v_mov_b32_e32 v7, v4
5885; GFX940-NEXT:    v_lshrrev_b32_e32 v4, v3, v7
5886; GFX940-NEXT:    v_sub_f16_e32 v4, v4, v2
5887; GFX940-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
5888; GFX940-NEXT:    v_and_or_b32 v6, v7, v5, v4
5889; GFX940-NEXT:    buffer_wbl2 sc1
5890; GFX940-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0
5891; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5892; GFX940-NEXT:    buffer_inv sc1
5893; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
5894; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
5895; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
5896; GFX940-NEXT:    s_cbranch_execnz .LBB22_1
5897; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
5898; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
5899; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v4
5900; GFX940-NEXT:    s_setpc_b64 s[30:31]
5901;
5902; GFX11-LABEL: flat_agent_atomic_fsub_ret_f16:
5903; GFX11:       ; %bb.0:
5904; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5905; GFX11-NEXT:    v_mov_b32_e32 v3, v0
5906; GFX11-NEXT:    s_mov_b32 s0, 0
5907; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
5908; GFX11-NEXT:    v_and_b32_e32 v0, -4, v3
5909; GFX11-NEXT:    v_and_b32_e32 v3, 3, v3
5910; GFX11-NEXT:    flat_load_b32 v5, v[0:1]
5911; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
5912; GFX11-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
5913; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5914; GFX11-NEXT:    v_not_b32_e32 v4, v4
5915; GFX11-NEXT:  .LBB22_1: ; %atomicrmw.start
5916; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
5917; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5918; GFX11-NEXT:    v_mov_b32_e32 v6, v5
5919; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5920; GFX11-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
5921; GFX11-NEXT:    v_sub_f16_e32 v5, v5, v2
5922; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5923; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
5924; GFX11-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
5925; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5926; GFX11-NEXT:    v_and_or_b32 v5, v6, v4, v5
5927; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
5928; GFX11-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
5929; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5930; GFX11-NEXT:    buffer_gl1_inv
5931; GFX11-NEXT:    buffer_gl0_inv
5932; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
5933; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
5934; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
5935; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
5936; GFX11-NEXT:    s_cbranch_execnz .LBB22_1
5937; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
5938; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
5939; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
5940; GFX11-NEXT:    s_setpc_b64 s[30:31]
5941;
5942; GFX10-LABEL: flat_agent_atomic_fsub_ret_f16:
5943; GFX10:       ; %bb.0:
5944; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5945; GFX10-NEXT:    v_mov_b32_e32 v3, v0
5946; GFX10-NEXT:    s_mov_b32 s4, 0
5947; GFX10-NEXT:    v_and_b32_e32 v0, -4, v3
5948; GFX10-NEXT:    v_and_b32_e32 v3, 3, v3
5949; GFX10-NEXT:    flat_load_dword v5, v[0:1]
5950; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
5951; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
5952; GFX10-NEXT:    v_not_b32_e32 v4, v4
5953; GFX10-NEXT:  .LBB22_1: ; %atomicrmw.start
5954; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
5955; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5956; GFX10-NEXT:    v_mov_b32_e32 v6, v5
5957; GFX10-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
5958; GFX10-NEXT:    v_sub_f16_e32 v5, v5, v2
5959; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
5960; GFX10-NEXT:    v_and_or_b32 v5, v6, v4, v5
5961; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
5962; GFX10-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
5963; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5964; GFX10-NEXT:    buffer_gl1_inv
5965; GFX10-NEXT:    buffer_gl0_inv
5966; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
5967; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
5968; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
5969; GFX10-NEXT:    s_cbranch_execnz .LBB22_1
5970; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
5971; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
5972; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
5973; GFX10-NEXT:    s_setpc_b64 s[30:31]
5974;
5975; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f16:
5976; GFX90A:       ; %bb.0:
5977; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5978; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
5979; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v3
5980; GFX90A-NEXT:    flat_load_dword v4, v[0:1]
5981; GFX90A-NEXT:    v_and_b32_e32 v3, 3, v3
5982; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
5983; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
5984; GFX90A-NEXT:    v_lshlrev_b32_e64 v5, v3, s4
5985; GFX90A-NEXT:    v_not_b32_e32 v5, v5
5986; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
5987; GFX90A-NEXT:  .LBB22_1: ; %atomicrmw.start
5988; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
5989; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5990; GFX90A-NEXT:    v_mov_b32_e32 v7, v4
5991; GFX90A-NEXT:    v_lshrrev_b32_e32 v4, v3, v7
5992; GFX90A-NEXT:    v_sub_f16_e32 v4, v4, v2
5993; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
5994; GFX90A-NEXT:    v_and_or_b32 v6, v7, v5, v4
5995; GFX90A-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
5996; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5997; GFX90A-NEXT:    buffer_wbinvl1
5998; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
5999; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6000; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6001; GFX90A-NEXT:    s_cbranch_execnz .LBB22_1
6002; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
6003; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
6004; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v4
6005; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6006;
6007; GFX908-LABEL: flat_agent_atomic_fsub_ret_f16:
6008; GFX908:       ; %bb.0:
6009; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6010; GFX908-NEXT:    v_mov_b32_e32 v3, v0
6011; GFX908-NEXT:    v_and_b32_e32 v0, -4, v3
6012; GFX908-NEXT:    flat_load_dword v4, v[0:1]
6013; GFX908-NEXT:    v_and_b32_e32 v3, 3, v3
6014; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
6015; GFX908-NEXT:    s_mov_b32 s4, 0xffff
6016; GFX908-NEXT:    v_lshlrev_b32_e64 v5, v3, s4
6017; GFX908-NEXT:    v_not_b32_e32 v5, v5
6018; GFX908-NEXT:    s_mov_b64 s[4:5], 0
6019; GFX908-NEXT:  .LBB22_1: ; %atomicrmw.start
6020; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
6021; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6022; GFX908-NEXT:    v_mov_b32_e32 v7, v4
6023; GFX908-NEXT:    v_lshrrev_b32_e32 v4, v3, v7
6024; GFX908-NEXT:    v_sub_f16_e32 v4, v4, v2
6025; GFX908-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
6026; GFX908-NEXT:    v_and_or_b32 v6, v7, v5, v4
6027; GFX908-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
6028; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6029; GFX908-NEXT:    buffer_wbinvl1
6030; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
6031; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6032; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6033; GFX908-NEXT:    s_cbranch_execnz .LBB22_1
6034; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
6035; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
6036; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v4
6037; GFX908-NEXT:    s_setpc_b64 s[30:31]
6038;
6039; GFX8-LABEL: flat_agent_atomic_fsub_ret_f16:
6040; GFX8:       ; %bb.0:
6041; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6042; GFX8-NEXT:    v_mov_b32_e32 v3, v0
6043; GFX8-NEXT:    v_and_b32_e32 v0, -4, v3
6044; GFX8-NEXT:    flat_load_dword v5, v[0:1]
6045; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
6046; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
6047; GFX8-NEXT:    s_mov_b32 s4, 0xffff
6048; GFX8-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
6049; GFX8-NEXT:    v_not_b32_e32 v4, v4
6050; GFX8-NEXT:    s_mov_b64 s[4:5], 0
6051; GFX8-NEXT:  .LBB22_1: ; %atomicrmw.start
6052; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
6053; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6054; GFX8-NEXT:    v_mov_b32_e32 v6, v5
6055; GFX8-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
6056; GFX8-NEXT:    v_sub_f16_e32 v5, v5, v2
6057; GFX8-NEXT:    v_and_b32_e32 v7, v6, v4
6058; GFX8-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
6059; GFX8-NEXT:    v_or_b32_e32 v5, v7, v5
6060; GFX8-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
6061; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6062; GFX8-NEXT:    buffer_wbinvl1
6063; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
6064; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6065; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6066; GFX8-NEXT:    s_cbranch_execnz .LBB22_1
6067; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
6068; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
6069; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
6070; GFX8-NEXT:    s_setpc_b64 s[30:31]
6071;
6072; GFX7-LABEL: flat_agent_atomic_fsub_ret_f16:
6073; GFX7:       ; %bb.0:
6074; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6075; GFX7-NEXT:    v_mov_b32_e32 v3, v0
6076; GFX7-NEXT:    v_and_b32_e32 v0, -4, v3
6077; GFX7-NEXT:    flat_load_dword v5, v[0:1]
6078; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v2
6079; GFX7-NEXT:    v_and_b32_e32 v2, 3, v3
6080; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
6081; GFX7-NEXT:    s_mov_b64 s[4:5], 0
6082; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v4
6083; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
6084; GFX7-NEXT:    v_not_b32_e32 v4, v4
6085; GFX7-NEXT:  .LBB22_1: ; %atomicrmw.start
6086; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
6087; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6088; GFX7-NEXT:    v_mov_b32_e32 v6, v5
6089; GFX7-NEXT:    v_lshrrev_b32_e32 v5, v2, v6
6090; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
6091; GFX7-NEXT:    v_and_b32_e32 v7, v6, v4
6092; GFX7-NEXT:    v_sub_f32_e32 v5, v5, v3
6093; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
6094; GFX7-NEXT:    v_lshlrev_b32_e32 v5, v2, v5
6095; GFX7-NEXT:    v_or_b32_e32 v5, v7, v5
6096; GFX7-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
6097; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6098; GFX7-NEXT:    buffer_wbinvl1
6099; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
6100; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6101; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6102; GFX7-NEXT:    s_cbranch_execnz .LBB22_1
6103; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
6104; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
6105; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v2, v5
6106; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
6107; GFX7-NEXT:    s_setpc_b64 s[30:31]
6108  %result = atomicrmw fsub ptr %ptr, half %val syncscope("agent") seq_cst
6109  ret half %result
6110}
6111
; Value-returning half-precision atomicrmw fsub through a flat pointer, agent
; syncscope, seq_cst ordering, with the address advanced by 1023 half elements
; (2046 bytes, which is the 0x7fe constant added to the pointer in every
; expectation block below).
;
; The expectations below are autogenerated (see the note on the first line of
; this file); regenerate them with the update script rather than editing by hand.
;
; What the expected code shows for every checked subtarget:
;   * the byte offset is added to the 64-bit pointer with explicit add
;     instructions; the flat load and the cmpswap carry no immediate offset,
;   * the address is rounded down with "and -4" and the low two address bits
;     times 8 become the bit shift of the half inside the loaded dword,
;   * ~(0xffff << shift) keeps the neighbouring 16 bits intact while the new
;     half is merged in,
;   * a 32-bit flat cmpswap loop retries until the returned dword equals the
;     one the new value was computed from,
;   * the old half is recovered by shifting the final loaded dword right.
; The hawaii expectations additionally convert to f32, subtract in f32, and
; convert back to f16 around the loop and on the returned value.
;
; NOTE(review): the "offset12b_pos" suffix presumably refers to a positive
; offset small enough for a 12-bit immediate field - confirm against the
; sibling tests; here the offset is never folded into the memory instruction.
6112define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) #0 {
6113; GFX12-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos:
6114; GFX12:       ; %bb.0:
6115; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
6116; GFX12-NEXT:    s_wait_expcnt 0x0
6117; GFX12-NEXT:    s_wait_samplecnt 0x0
6118; GFX12-NEXT:    s_wait_bvhcnt 0x0
6119; GFX12-NEXT:    s_wait_kmcnt 0x0
6120; GFX12-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
6121; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
6122; GFX12-NEXT:    s_mov_b32 s0, 0
6123; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
6124; GFX12-NEXT:    v_and_b32_e32 v0, -4, v3
6125; GFX12-NEXT:    v_and_b32_e32 v3, 3, v3
6126; GFX12-NEXT:    flat_load_b32 v5, v[0:1]
6127; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
6128; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
6129; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6130; GFX12-NEXT:    v_not_b32_e32 v4, v4
6131; GFX12-NEXT:  .LBB23_1: ; %atomicrmw.start
6132; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
6133; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
6134; GFX12-NEXT:    v_mov_b32_e32 v6, v5
6135; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6136; GFX12-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
6137; GFX12-NEXT:    v_sub_f16_e32 v5, v5, v2
6138; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6139; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff, v5
6140; GFX12-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
6141; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6142; GFX12-NEXT:    v_and_or_b32 v5, v6, v4, v5
6143; GFX12-NEXT:    s_wait_storecnt 0x0
6144; GFX12-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
6145; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
6146; GFX12-NEXT:    global_inv scope:SCOPE_DEV
6147; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
6148; GFX12-NEXT:    s_wait_alu 0xfffe
6149; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
6150; GFX12-NEXT:    s_wait_alu 0xfffe
6151; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
6152; GFX12-NEXT:    s_cbranch_execnz .LBB23_1
6153; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
6154; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
6155; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
6156; GFX12-NEXT:    s_wait_alu 0xfffe
6157; GFX12-NEXT:    s_setpc_b64 s[30:31]
6158;
6159; GFX940-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos:
6160; GFX940:       ; %bb.0:
6161; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6162; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
6163; GFX940-NEXT:    v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1]
6164; GFX940-NEXT:    v_and_b32_e32 v0, -4, v6
6165; GFX940-NEXT:    v_mov_b32_e32 v1, v7
6166; GFX940-NEXT:    flat_load_dword v4, v[0:1]
6167; GFX940-NEXT:    v_and_b32_e32 v3, 3, v6
6168; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
6169; GFX940-NEXT:    s_mov_b32 s0, 0xffff
6170; GFX940-NEXT:    v_lshlrev_b32_e64 v5, v3, s0
6171; GFX940-NEXT:    v_not_b32_e32 v5, v5
6172; GFX940-NEXT:    s_mov_b64 s[0:1], 0
6173; GFX940-NEXT:  .LBB23_1: ; %atomicrmw.start
6174; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
6175; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6176; GFX940-NEXT:    v_mov_b32_e32 v7, v4
6177; GFX940-NEXT:    v_lshrrev_b32_e32 v4, v3, v7
6178; GFX940-NEXT:    v_sub_f16_e32 v4, v4, v2
6179; GFX940-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
6180; GFX940-NEXT:    v_and_or_b32 v6, v7, v5, v4
6181; GFX940-NEXT:    buffer_wbl2 sc1
6182; GFX940-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0
6183; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6184; GFX940-NEXT:    buffer_inv sc1
6185; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
6186; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
6187; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
6188; GFX940-NEXT:    s_cbranch_execnz .LBB23_1
6189; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
6190; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
6191; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v4
6192; GFX940-NEXT:    s_setpc_b64 s[30:31]
6193;
6194; GFX11-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos:
6195; GFX11:       ; %bb.0:
6196; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6197; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
6198; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
6199; GFX11-NEXT:    s_mov_b32 s0, 0
6200; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
6201; GFX11-NEXT:    v_and_b32_e32 v0, -4, v3
6202; GFX11-NEXT:    v_and_b32_e32 v3, 3, v3
6203; GFX11-NEXT:    flat_load_b32 v5, v[0:1]
6204; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
6205; GFX11-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
6206; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6207; GFX11-NEXT:    v_not_b32_e32 v4, v4
6208; GFX11-NEXT:  .LBB23_1: ; %atomicrmw.start
6209; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
6210; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6211; GFX11-NEXT:    v_mov_b32_e32 v6, v5
6212; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6213; GFX11-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
6214; GFX11-NEXT:    v_sub_f16_e32 v5, v5, v2
6215; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6216; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
6217; GFX11-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
6218; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6219; GFX11-NEXT:    v_and_or_b32 v5, v6, v4, v5
6220; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
6221; GFX11-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
6222; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6223; GFX11-NEXT:    buffer_gl1_inv
6224; GFX11-NEXT:    buffer_gl0_inv
6225; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
6226; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
6227; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
6228; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
6229; GFX11-NEXT:    s_cbranch_execnz .LBB23_1
6230; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
6231; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
6232; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
6233; GFX11-NEXT:    s_setpc_b64 s[30:31]
6234;
6235; GFX10-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos:
6236; GFX10:       ; %bb.0:
6237; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6238; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
6239; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
6240; GFX10-NEXT:    s_mov_b32 s4, 0
6241; GFX10-NEXT:    v_and_b32_e32 v0, -4, v3
6242; GFX10-NEXT:    v_and_b32_e32 v3, 3, v3
6243; GFX10-NEXT:    flat_load_dword v5, v[0:1]
6244; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
6245; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
6246; GFX10-NEXT:    v_not_b32_e32 v4, v4
6247; GFX10-NEXT:  .LBB23_1: ; %atomicrmw.start
6248; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
6249; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6250; GFX10-NEXT:    v_mov_b32_e32 v6, v5
6251; GFX10-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
6252; GFX10-NEXT:    v_sub_f16_e32 v5, v5, v2
6253; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
6254; GFX10-NEXT:    v_and_or_b32 v5, v6, v4, v5
6255; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
6256; GFX10-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
6257; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6258; GFX10-NEXT:    buffer_gl1_inv
6259; GFX10-NEXT:    buffer_gl0_inv
6260; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
6261; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
6262; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
6263; GFX10-NEXT:    s_cbranch_execnz .LBB23_1
6264; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
6265; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
6266; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
6267; GFX10-NEXT:    s_setpc_b64 s[30:31]
6268;
6269; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos:
6270; GFX90A:       ; %bb.0:
6271; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6272; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
6273; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
6274; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v3
6275; GFX90A-NEXT:    flat_load_dword v4, v[0:1]
6276; GFX90A-NEXT:    v_and_b32_e32 v3, 3, v3
6277; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
6278; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
6279; GFX90A-NEXT:    v_lshlrev_b32_e64 v5, v3, s4
6280; GFX90A-NEXT:    v_not_b32_e32 v5, v5
6281; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
6282; GFX90A-NEXT:  .LBB23_1: ; %atomicrmw.start
6283; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
6284; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6285; GFX90A-NEXT:    v_mov_b32_e32 v7, v4
6286; GFX90A-NEXT:    v_lshrrev_b32_e32 v4, v3, v7
6287; GFX90A-NEXT:    v_sub_f16_e32 v4, v4, v2
6288; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
6289; GFX90A-NEXT:    v_and_or_b32 v6, v7, v5, v4
6290; GFX90A-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
6291; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6292; GFX90A-NEXT:    buffer_wbinvl1
6293; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
6294; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6295; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6296; GFX90A-NEXT:    s_cbranch_execnz .LBB23_1
6297; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
6298; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
6299; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v4
6300; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6301;
6302; GFX908-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos:
6303; GFX908:       ; %bb.0:
6304; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6305; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
6306; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
6307; GFX908-NEXT:    v_and_b32_e32 v0, -4, v3
6308; GFX908-NEXT:    flat_load_dword v4, v[0:1]
6309; GFX908-NEXT:    v_and_b32_e32 v3, 3, v3
6310; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
6311; GFX908-NEXT:    s_mov_b32 s4, 0xffff
6312; GFX908-NEXT:    v_lshlrev_b32_e64 v5, v3, s4
6313; GFX908-NEXT:    v_not_b32_e32 v5, v5
6314; GFX908-NEXT:    s_mov_b64 s[4:5], 0
6315; GFX908-NEXT:  .LBB23_1: ; %atomicrmw.start
6316; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
6317; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6318; GFX908-NEXT:    v_mov_b32_e32 v7, v4
6319; GFX908-NEXT:    v_lshrrev_b32_e32 v4, v3, v7
6320; GFX908-NEXT:    v_sub_f16_e32 v4, v4, v2
6321; GFX908-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
6322; GFX908-NEXT:    v_and_or_b32 v6, v7, v5, v4
6323; GFX908-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
6324; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6325; GFX908-NEXT:    buffer_wbinvl1
6326; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
6327; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6328; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6329; GFX908-NEXT:    s_cbranch_execnz .LBB23_1
6330; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
6331; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
6332; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v4
6333; GFX908-NEXT:    s_setpc_b64 s[30:31]
6334;
6335; GFX8-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos:
6336; GFX8:       ; %bb.0:
6337; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6338; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
6339; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
6340; GFX8-NEXT:    v_and_b32_e32 v0, -4, v3
6341; GFX8-NEXT:    flat_load_dword v5, v[0:1]
6342; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
6343; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
6344; GFX8-NEXT:    s_mov_b32 s4, 0xffff
6345; GFX8-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
6346; GFX8-NEXT:    v_not_b32_e32 v4, v4
6347; GFX8-NEXT:    s_mov_b64 s[4:5], 0
6348; GFX8-NEXT:  .LBB23_1: ; %atomicrmw.start
6349; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
6350; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6351; GFX8-NEXT:    v_mov_b32_e32 v6, v5
6352; GFX8-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
6353; GFX8-NEXT:    v_sub_f16_e32 v5, v5, v2
6354; GFX8-NEXT:    v_and_b32_e32 v7, v6, v4
6355; GFX8-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
6356; GFX8-NEXT:    v_or_b32_e32 v5, v7, v5
6357; GFX8-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
6358; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6359; GFX8-NEXT:    buffer_wbinvl1
6360; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
6361; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6362; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6363; GFX8-NEXT:    s_cbranch_execnz .LBB23_1
6364; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
6365; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
6366; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
6367; GFX8-NEXT:    s_setpc_b64 s[30:31]
6368;
6369; GFX7-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos:
6370; GFX7:       ; %bb.0:
6371; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6372; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 0x7fe, v0
6373; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
6374; GFX7-NEXT:    v_and_b32_e32 v0, -4, v3
6375; GFX7-NEXT:    flat_load_dword v5, v[0:1]
6376; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v2
6377; GFX7-NEXT:    v_and_b32_e32 v2, 3, v3
6378; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
6379; GFX7-NEXT:    s_mov_b64 s[4:5], 0
6380; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v4
6381; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
6382; GFX7-NEXT:    v_not_b32_e32 v4, v4
6383; GFX7-NEXT:  .LBB23_1: ; %atomicrmw.start
6384; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
6385; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6386; GFX7-NEXT:    v_mov_b32_e32 v6, v5
6387; GFX7-NEXT:    v_lshrrev_b32_e32 v5, v2, v6
6388; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
6389; GFX7-NEXT:    v_and_b32_e32 v7, v6, v4
6390; GFX7-NEXT:    v_sub_f32_e32 v5, v5, v3
6391; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
6392; GFX7-NEXT:    v_lshlrev_b32_e32 v5, v2, v5
6393; GFX7-NEXT:    v_or_b32_e32 v5, v7, v5
6394; GFX7-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
6395; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6396; GFX7-NEXT:    buffer_wbinvl1
6397; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
6398; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6399; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6400; GFX7-NEXT:    s_cbranch_execnz .LBB23_1
6401; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
6402; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
6403; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v2, v5
6404; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
6405; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; Step 1023 half elements past %ptr: 1023 * 2 bytes = 2046 = 0x7fe.
6406  %gep = getelementptr half, ptr %ptr, i64 1023
  ; Atomically replace *%gep with (*%gep - %val); %result is the value that was
  ; in memory before the update, which is what the function returns.
6407  %result = atomicrmw fsub ptr %gep, half %val syncscope("agent") seq_cst
6408  ret half %result
6409}
6410
6411define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) #0 {
6412; GFX12-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg:
6413; GFX12:       ; %bb.0:
6414; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
6415; GFX12-NEXT:    s_wait_expcnt 0x0
6416; GFX12-NEXT:    s_wait_samplecnt 0x0
6417; GFX12-NEXT:    s_wait_bvhcnt 0x0
6418; GFX12-NEXT:    s_wait_kmcnt 0x0
6419; GFX12-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
6420; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
6421; GFX12-NEXT:    s_mov_b32 s0, 0
6422; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
6423; GFX12-NEXT:    v_and_b32_e32 v0, -4, v3
6424; GFX12-NEXT:    v_and_b32_e32 v3, 3, v3
6425; GFX12-NEXT:    flat_load_b32 v5, v[0:1]
6426; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
6427; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
6428; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6429; GFX12-NEXT:    v_not_b32_e32 v4, v4
6430; GFX12-NEXT:  .LBB24_1: ; %atomicrmw.start
6431; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
6432; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
6433; GFX12-NEXT:    v_mov_b32_e32 v6, v5
6434; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6435; GFX12-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
6436; GFX12-NEXT:    v_sub_f16_e32 v5, v5, v2
6437; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6438; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff, v5
6439; GFX12-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
6440; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6441; GFX12-NEXT:    v_and_or_b32 v5, v6, v4, v5
6442; GFX12-NEXT:    s_wait_storecnt 0x0
6443; GFX12-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
6444; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
6445; GFX12-NEXT:    global_inv scope:SCOPE_DEV
6446; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
6447; GFX12-NEXT:    s_wait_alu 0xfffe
6448; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
6449; GFX12-NEXT:    s_wait_alu 0xfffe
6450; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
6451; GFX12-NEXT:    s_cbranch_execnz .LBB24_1
6452; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
6453; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
6454; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
6455; GFX12-NEXT:    s_wait_alu 0xfffe
6456; GFX12-NEXT:    s_setpc_b64 s[30:31]
6457;
6458; GFX940-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg:
6459; GFX940:       ; %bb.0:
6460; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6461; GFX940-NEXT:    s_movk_i32 s0, 0xf800
6462; GFX940-NEXT:    s_mov_b32 s1, -1
6463; GFX940-NEXT:    v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1]
6464; GFX940-NEXT:    v_and_b32_e32 v0, -4, v6
6465; GFX940-NEXT:    v_mov_b32_e32 v1, v7
6466; GFX940-NEXT:    flat_load_dword v4, v[0:1]
6467; GFX940-NEXT:    v_and_b32_e32 v3, 3, v6
6468; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
6469; GFX940-NEXT:    s_mov_b32 s0, 0xffff
6470; GFX940-NEXT:    v_lshlrev_b32_e64 v5, v3, s0
6471; GFX940-NEXT:    v_not_b32_e32 v5, v5
6472; GFX940-NEXT:    s_mov_b64 s[0:1], 0
6473; GFX940-NEXT:  .LBB24_1: ; %atomicrmw.start
6474; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
6475; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6476; GFX940-NEXT:    v_mov_b32_e32 v7, v4
6477; GFX940-NEXT:    v_lshrrev_b32_e32 v4, v3, v7
6478; GFX940-NEXT:    v_sub_f16_e32 v4, v4, v2
6479; GFX940-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
6480; GFX940-NEXT:    v_and_or_b32 v6, v7, v5, v4
6481; GFX940-NEXT:    buffer_wbl2 sc1
6482; GFX940-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0
6483; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6484; GFX940-NEXT:    buffer_inv sc1
6485; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
6486; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
6487; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
6488; GFX940-NEXT:    s_cbranch_execnz .LBB24_1
6489; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
6490; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
6491; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v4
6492; GFX940-NEXT:    s_setpc_b64 s[30:31]
6493;
6494; GFX11-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg:
6495; GFX11:       ; %bb.0:
6496; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6497; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
6498; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
6499; GFX11-NEXT:    s_mov_b32 s0, 0
6500; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
6501; GFX11-NEXT:    v_and_b32_e32 v0, -4, v3
6502; GFX11-NEXT:    v_and_b32_e32 v3, 3, v3
6503; GFX11-NEXT:    flat_load_b32 v5, v[0:1]
6504; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
6505; GFX11-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
6506; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6507; GFX11-NEXT:    v_not_b32_e32 v4, v4
6508; GFX11-NEXT:  .LBB24_1: ; %atomicrmw.start
6509; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
6510; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6511; GFX11-NEXT:    v_mov_b32_e32 v6, v5
6512; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6513; GFX11-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
6514; GFX11-NEXT:    v_sub_f16_e32 v5, v5, v2
6515; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6516; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
6517; GFX11-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
6518; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6519; GFX11-NEXT:    v_and_or_b32 v5, v6, v4, v5
6520; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
6521; GFX11-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
6522; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6523; GFX11-NEXT:    buffer_gl1_inv
6524; GFX11-NEXT:    buffer_gl0_inv
6525; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
6526; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
6527; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
6528; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
6529; GFX11-NEXT:    s_cbranch_execnz .LBB24_1
6530; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
6531; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
6532; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
6533; GFX11-NEXT:    s_setpc_b64 s[30:31]
6534;
6535; GFX10-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg:
6536; GFX10:       ; %bb.0:
6537; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6538; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
6539; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
6540; GFX10-NEXT:    s_mov_b32 s4, 0
6541; GFX10-NEXT:    v_and_b32_e32 v0, -4, v3
6542; GFX10-NEXT:    v_and_b32_e32 v3, 3, v3
6543; GFX10-NEXT:    flat_load_dword v5, v[0:1]
6544; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
6545; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
6546; GFX10-NEXT:    v_not_b32_e32 v4, v4
6547; GFX10-NEXT:  .LBB24_1: ; %atomicrmw.start
6548; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
6549; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6550; GFX10-NEXT:    v_mov_b32_e32 v6, v5
6551; GFX10-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
6552; GFX10-NEXT:    v_sub_f16_e32 v5, v5, v2
6553; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
6554; GFX10-NEXT:    v_and_or_b32 v5, v6, v4, v5
6555; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
6556; GFX10-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
6557; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6558; GFX10-NEXT:    buffer_gl1_inv
6559; GFX10-NEXT:    buffer_gl0_inv
6560; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
6561; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
6562; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
6563; GFX10-NEXT:    s_cbranch_execnz .LBB24_1
6564; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
6565; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
6566; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
6567; GFX10-NEXT:    s_setpc_b64 s[30:31]
6568;
6569; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg:
6570; GFX90A:       ; %bb.0:
6571; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6572; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
6573; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
6574; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v3
6575; GFX90A-NEXT:    flat_load_dword v4, v[0:1]
6576; GFX90A-NEXT:    v_and_b32_e32 v3, 3, v3
6577; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
6578; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
6579; GFX90A-NEXT:    v_lshlrev_b32_e64 v5, v3, s4
6580; GFX90A-NEXT:    v_not_b32_e32 v5, v5
6581; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
6582; GFX90A-NEXT:  .LBB24_1: ; %atomicrmw.start
6583; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
6584; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6585; GFX90A-NEXT:    v_mov_b32_e32 v7, v4
6586; GFX90A-NEXT:    v_lshrrev_b32_e32 v4, v3, v7
6587; GFX90A-NEXT:    v_sub_f16_e32 v4, v4, v2
6588; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
6589; GFX90A-NEXT:    v_and_or_b32 v6, v7, v5, v4
6590; GFX90A-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
6591; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6592; GFX90A-NEXT:    buffer_wbinvl1
6593; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
6594; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6595; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6596; GFX90A-NEXT:    s_cbranch_execnz .LBB24_1
6597; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
6598; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
6599; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v4
6600; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6601;
6602; GFX908-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg:
6603; GFX908:       ; %bb.0:
6604; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6605; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
6606; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
6607; GFX908-NEXT:    v_and_b32_e32 v0, -4, v3
6608; GFX908-NEXT:    flat_load_dword v4, v[0:1]
6609; GFX908-NEXT:    v_and_b32_e32 v3, 3, v3
6610; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
6611; GFX908-NEXT:    s_mov_b32 s4, 0xffff
6612; GFX908-NEXT:    v_lshlrev_b32_e64 v5, v3, s4
6613; GFX908-NEXT:    v_not_b32_e32 v5, v5
6614; GFX908-NEXT:    s_mov_b64 s[4:5], 0
6615; GFX908-NEXT:  .LBB24_1: ; %atomicrmw.start
6616; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
6617; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6618; GFX908-NEXT:    v_mov_b32_e32 v7, v4
6619; GFX908-NEXT:    v_lshrrev_b32_e32 v4, v3, v7
6620; GFX908-NEXT:    v_sub_f16_e32 v4, v4, v2
6621; GFX908-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
6622; GFX908-NEXT:    v_and_or_b32 v6, v7, v5, v4
6623; GFX908-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
6624; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6625; GFX908-NEXT:    buffer_wbinvl1
6626; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
6627; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6628; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6629; GFX908-NEXT:    s_cbranch_execnz .LBB24_1
6630; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
6631; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
6632; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v4
6633; GFX908-NEXT:    s_setpc_b64 s[30:31]
6634;
6635; GFX8-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg:
6636; GFX8:       ; %bb.0:
6637; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6638; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
6639; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
6640; GFX8-NEXT:    v_and_b32_e32 v0, -4, v3
6641; GFX8-NEXT:    flat_load_dword v5, v[0:1]
6642; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
6643; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
6644; GFX8-NEXT:    s_mov_b32 s4, 0xffff
6645; GFX8-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
6646; GFX8-NEXT:    v_not_b32_e32 v4, v4
6647; GFX8-NEXT:    s_mov_b64 s[4:5], 0
6648; GFX8-NEXT:  .LBB24_1: ; %atomicrmw.start
6649; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
6650; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6651; GFX8-NEXT:    v_mov_b32_e32 v6, v5
6652; GFX8-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
6653; GFX8-NEXT:    v_sub_f16_e32 v5, v5, v2
6654; GFX8-NEXT:    v_and_b32_e32 v7, v6, v4
6655; GFX8-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
6656; GFX8-NEXT:    v_or_b32_e32 v5, v7, v5
6657; GFX8-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
6658; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6659; GFX8-NEXT:    buffer_wbinvl1
6660; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
6661; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6662; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6663; GFX8-NEXT:    s_cbranch_execnz .LBB24_1
6664; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
6665; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
6666; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
6667; GFX8-NEXT:    s_setpc_b64 s[30:31]
6668;
6669; GFX7-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg:
6670; GFX7:       ; %bb.0:
6671; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6672; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 0xfffff800, v0
6673; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
6674; GFX7-NEXT:    v_and_b32_e32 v0, -4, v3
6675; GFX7-NEXT:    flat_load_dword v5, v[0:1]
6676; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v2
6677; GFX7-NEXT:    v_and_b32_e32 v2, 3, v3
6678; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
6679; GFX7-NEXT:    s_mov_b64 s[4:5], 0
6680; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v4
6681; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
6682; GFX7-NEXT:    v_not_b32_e32 v4, v4
6683; GFX7-NEXT:  .LBB24_1: ; %atomicrmw.start
6684; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
6685; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6686; GFX7-NEXT:    v_mov_b32_e32 v6, v5
6687; GFX7-NEXT:    v_lshrrev_b32_e32 v5, v2, v6
6688; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
6689; GFX7-NEXT:    v_and_b32_e32 v7, v6, v4
6690; GFX7-NEXT:    v_sub_f32_e32 v5, v5, v3
6691; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
6692; GFX7-NEXT:    v_lshlrev_b32_e32 v5, v2, v5
6693; GFX7-NEXT:    v_or_b32_e32 v5, v7, v5
6694; GFX7-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
6695; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6696; GFX7-NEXT:    buffer_wbinvl1
6697; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
6698; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6699; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6700; GFX7-NEXT:    s_cbranch_execnz .LBB24_1
6701; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
6702; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
6703; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v2, v5
6704; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
6705; GFX7-NEXT:    s_setpc_b64 s[30:31]
6706  %gep = getelementptr half, ptr %ptr, i64 -1024
6707  %result = atomicrmw fsub ptr %gep, half %val syncscope("agent") seq_cst
6708  ret half %result
6709 }
6710
; Agent-scope, seq_cst `atomicrmw fsub` of a half through a flat pointer with
; no address offset; the fetched value is never used and the function returns
; void.
;
; In every target's expected output below, the operation is expanded into a
; compare-and-swap loop (%atomicrmw.start) on the 4-byte-aligned dword that
; contains the half:
;   * the low address word is masked with -4 to form the dword address,
;   * the low two address bits, shifted left by 3, give the bit shift,
;   * 0xffff shifted by that amount and inverted keeps the neighbouring half,
;   * the loop repeats until the value returned by the cmpswap equals the
;     value that was expected.
; The hawaii output additionally converts the operands through f32 and
; subtracts with v_sub_f32 before converting back to f16.
;
; The assertions are generated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
6711define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 {
6712; GFX12-LABEL: flat_agent_atomic_fsub_noret_f16:
6713; GFX12:       ; %bb.0:
6714; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
6715; GFX12-NEXT:    s_wait_expcnt 0x0
6716; GFX12-NEXT:    s_wait_samplecnt 0x0
6717; GFX12-NEXT:    s_wait_bvhcnt 0x0
6718; GFX12-NEXT:    s_wait_kmcnt 0x0
6719; GFX12-NEXT:    v_mov_b32_e32 v3, v0
6720; GFX12-NEXT:    s_mov_b32 s0, 0
6721; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
6722; GFX12-NEXT:    v_and_b32_e32 v0, -4, v3
6723; GFX12-NEXT:    v_and_b32_e32 v3, 3, v3
6724; GFX12-NEXT:    flat_load_b32 v4, v[0:1]
6725; GFX12-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
6726; GFX12-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
6727; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6728; GFX12-NEXT:    v_not_b32_e32 v6, v3
6729; GFX12-NEXT:  .LBB25_1: ; %atomicrmw.start
6730; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
6731; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
6732; GFX12-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
6733; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6734; GFX12-NEXT:    v_sub_f16_e32 v3, v3, v2
6735; GFX12-NEXT:    v_and_b32_e32 v3, 0xffff, v3
6736; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6737; GFX12-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
6738; GFX12-NEXT:    v_and_or_b32 v3, v4, v6, v3
6739; GFX12-NEXT:    s_wait_storecnt 0x0
6740; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
6741; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
6742; GFX12-NEXT:    global_inv scope:SCOPE_DEV
6743; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
6744; GFX12-NEXT:    v_mov_b32_e32 v4, v3
6745; GFX12-NEXT:    s_wait_alu 0xfffe
6746; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
6747; GFX12-NEXT:    s_wait_alu 0xfffe
6748; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
6749; GFX12-NEXT:    s_cbranch_execnz .LBB25_1
6750; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
6751; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
6752; GFX12-NEXT:    s_wait_alu 0xfffe
6753; GFX12-NEXT:    s_setpc_b64 s[30:31]
6754;
6755; GFX940-LABEL: flat_agent_atomic_fsub_noret_f16:
6756; GFX940:       ; %bb.0:
6757; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6758; GFX940-NEXT:    v_mov_b32_e32 v3, v0
6759; GFX940-NEXT:    v_and_b32_e32 v0, -4, v3
6760; GFX940-NEXT:    flat_load_dword v5, v[0:1]
6761; GFX940-NEXT:    v_and_b32_e32 v3, 3, v3
6762; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
6763; GFX940-NEXT:    s_mov_b32 s0, 0xffff
6764; GFX940-NEXT:    v_lshlrev_b32_e64 v4, v3, s0
6765; GFX940-NEXT:    v_not_b32_e32 v6, v4
6766; GFX940-NEXT:    s_mov_b64 s[0:1], 0
6767; GFX940-NEXT:  .LBB25_1: ; %atomicrmw.start
6768; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
6769; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6770; GFX940-NEXT:    v_lshrrev_b32_e32 v4, v3, v5
6771; GFX940-NEXT:    v_sub_f16_e32 v4, v4, v2
6772; GFX940-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
6773; GFX940-NEXT:    v_and_or_b32 v4, v5, v6, v4
6774; GFX940-NEXT:    buffer_wbl2 sc1
6775; GFX940-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0
6776; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6777; GFX940-NEXT:    buffer_inv sc1
6778; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
6779; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
6780; GFX940-NEXT:    v_mov_b32_e32 v5, v4
6781; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
6782; GFX940-NEXT:    s_cbranch_execnz .LBB25_1
6783; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
6784; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
6785; GFX940-NEXT:    s_setpc_b64 s[30:31]
6786;
6787; GFX11-LABEL: flat_agent_atomic_fsub_noret_f16:
6788; GFX11:       ; %bb.0:
6789; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6790; GFX11-NEXT:    v_mov_b32_e32 v3, v0
6791; GFX11-NEXT:    s_mov_b32 s0, 0
6792; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
6793; GFX11-NEXT:    v_and_b32_e32 v0, -4, v3
6794; GFX11-NEXT:    v_and_b32_e32 v3, 3, v3
6795; GFX11-NEXT:    flat_load_b32 v4, v[0:1]
6796; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
6797; GFX11-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
6798; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6799; GFX11-NEXT:    v_not_b32_e32 v6, v3
6800; GFX11-NEXT:  .LBB25_1: ; %atomicrmw.start
6801; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
6802; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6803; GFX11-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
6804; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6805; GFX11-NEXT:    v_sub_f16_e32 v3, v3, v2
6806; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
6807; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6808; GFX11-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
6809; GFX11-NEXT:    v_and_or_b32 v3, v4, v6, v3
6810; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
6811; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
6812; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6813; GFX11-NEXT:    buffer_gl1_inv
6814; GFX11-NEXT:    buffer_gl0_inv
6815; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
6816; GFX11-NEXT:    v_mov_b32_e32 v4, v3
6817; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
6818; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
6819; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
6820; GFX11-NEXT:    s_cbranch_execnz .LBB25_1
6821; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
6822; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
6823; GFX11-NEXT:    s_setpc_b64 s[30:31]
6824;
6825; GFX10-LABEL: flat_agent_atomic_fsub_noret_f16:
6826; GFX10:       ; %bb.0:
6827; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6828; GFX10-NEXT:    v_mov_b32_e32 v3, v0
6829; GFX10-NEXT:    s_mov_b32 s4, 0
6830; GFX10-NEXT:    v_and_b32_e32 v0, -4, v3
6831; GFX10-NEXT:    v_and_b32_e32 v3, 3, v3
6832; GFX10-NEXT:    flat_load_dword v4, v[0:1]
6833; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
6834; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
6835; GFX10-NEXT:    v_not_b32_e32 v6, v3
6836; GFX10-NEXT:  .LBB25_1: ; %atomicrmw.start
6837; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
6838; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6839; GFX10-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
6840; GFX10-NEXT:    v_sub_f16_e32 v3, v3, v2
6841; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
6842; GFX10-NEXT:    v_and_or_b32 v3, v4, v6, v3
6843; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
6844; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
6845; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6846; GFX10-NEXT:    buffer_gl1_inv
6847; GFX10-NEXT:    buffer_gl0_inv
6848; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
6849; GFX10-NEXT:    v_mov_b32_e32 v4, v3
6850; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
6851; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
6852; GFX10-NEXT:    s_cbranch_execnz .LBB25_1
6853; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
6854; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
6855; GFX10-NEXT:    s_setpc_b64 s[30:31]
6856;
6857; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f16:
6858; GFX90A:       ; %bb.0:
6859; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6860; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
6861; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v3
6862; GFX90A-NEXT:    flat_load_dword v5, v[0:1]
6863; GFX90A-NEXT:    v_and_b32_e32 v3, 3, v3
6864; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
6865; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
6866; GFX90A-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
6867; GFX90A-NEXT:    v_not_b32_e32 v6, v4
6868; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
6869; GFX90A-NEXT:  .LBB25_1: ; %atomicrmw.start
6870; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
6871; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6872; GFX90A-NEXT:    v_lshrrev_b32_e32 v4, v3, v5
6873; GFX90A-NEXT:    v_sub_f16_e32 v4, v4, v2
6874; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
6875; GFX90A-NEXT:    v_and_or_b32 v4, v5, v6, v4
6876; GFX90A-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
6877; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6878; GFX90A-NEXT:    buffer_wbinvl1
6879; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
6880; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6881; GFX90A-NEXT:    v_mov_b32_e32 v5, v4
6882; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6883; GFX90A-NEXT:    s_cbranch_execnz .LBB25_1
6884; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
6885; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
6886; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6887;
6888; GFX908-LABEL: flat_agent_atomic_fsub_noret_f16:
6889; GFX908:       ; %bb.0:
6890; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6891; GFX908-NEXT:    v_mov_b32_e32 v3, v0
6892; GFX908-NEXT:    v_and_b32_e32 v0, -4, v3
6893; GFX908-NEXT:    flat_load_dword v4, v[0:1]
6894; GFX908-NEXT:    v_and_b32_e32 v3, 3, v3
6895; GFX908-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
6896; GFX908-NEXT:    s_mov_b32 s4, 0xffff
6897; GFX908-NEXT:    v_lshlrev_b32_e64 v3, v5, s4
6898; GFX908-NEXT:    v_not_b32_e32 v6, v3
6899; GFX908-NEXT:    s_mov_b64 s[4:5], 0
6900; GFX908-NEXT:  .LBB25_1: ; %atomicrmw.start
6901; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
6902; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6903; GFX908-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
6904; GFX908-NEXT:    v_sub_f16_e32 v3, v3, v2
6905; GFX908-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
6906; GFX908-NEXT:    v_and_or_b32 v3, v4, v6, v3
6907; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
6908; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6909; GFX908-NEXT:    buffer_wbinvl1
6910; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
6911; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6912; GFX908-NEXT:    v_mov_b32_e32 v4, v3
6913; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6914; GFX908-NEXT:    s_cbranch_execnz .LBB25_1
6915; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
6916; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
6917; GFX908-NEXT:    s_setpc_b64 s[30:31]
6918;
6919; GFX8-LABEL: flat_agent_atomic_fsub_noret_f16:
6920; GFX8:       ; %bb.0:
6921; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6922; GFX8-NEXT:    v_mov_b32_e32 v3, v0
6923; GFX8-NEXT:    v_and_b32_e32 v0, -4, v3
6924; GFX8-NEXT:    flat_load_dword v4, v[0:1]
6925; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
6926; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
6927; GFX8-NEXT:    s_mov_b32 s4, 0xffff
6928; GFX8-NEXT:    v_lshlrev_b32_e64 v3, v5, s4
6929; GFX8-NEXT:    v_not_b32_e32 v6, v3
6930; GFX8-NEXT:    s_mov_b64 s[4:5], 0
6931; GFX8-NEXT:  .LBB25_1: ; %atomicrmw.start
6932; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
6933; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6934; GFX8-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
6935; GFX8-NEXT:    v_sub_f16_e32 v3, v3, v2
6936; GFX8-NEXT:    v_and_b32_e32 v7, v4, v6
6937; GFX8-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
6938; GFX8-NEXT:    v_or_b32_e32 v3, v7, v3
6939; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
6940; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6941; GFX8-NEXT:    buffer_wbinvl1
6942; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
6943; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6944; GFX8-NEXT:    v_mov_b32_e32 v4, v3
6945; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6946; GFX8-NEXT:    s_cbranch_execnz .LBB25_1
6947; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
6948; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
6949; GFX8-NEXT:    s_setpc_b64 s[30:31]
6950;
6951; GFX7-LABEL: flat_agent_atomic_fsub_noret_f16:
6952; GFX7:       ; %bb.0:
6953; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6954; GFX7-NEXT:    v_mov_b32_e32 v3, v0
6955; GFX7-NEXT:    v_and_b32_e32 v0, -4, v3
6956; GFX7-NEXT:    flat_load_dword v4, v[0:1]
6957; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v2
6958; GFX7-NEXT:    v_and_b32_e32 v2, 3, v3
6959; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
6960; GFX7-NEXT:    v_lshl_b32_e32 v3, 0xffff, v2
6961; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
6962; GFX7-NEXT:    v_not_b32_e32 v6, v3
6963; GFX7-NEXT:    s_mov_b64 s[4:5], 0
6964; GFX7-NEXT:  .LBB25_1: ; %atomicrmw.start
6965; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
6966; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6967; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v2, v4
6968; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
6969; GFX7-NEXT:    v_and_b32_e32 v7, v4, v6
6970; GFX7-NEXT:    v_sub_f32_e32 v3, v3, v5
6971; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
6972; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
6973; GFX7-NEXT:    v_or_b32_e32 v3, v7, v3
6974; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
6975; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6976; GFX7-NEXT:    buffer_wbinvl1
6977; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
6978; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6979; GFX7-NEXT:    v_mov_b32_e32 v4, v3
6980; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6981; GFX7-NEXT:    s_cbranch_execnz .LBB25_1
6982; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
6983; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
6984; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; %unused is never read: only the memory update matters to this function.
6985  %unused = atomicrmw fsub ptr %ptr, half %val syncscope("agent") seq_cst
6986  ret void
6987}
6988
; Agent-scope, seq_cst `atomicrmw fsub` of a half at %ptr plus 1023 halves
; (byte offset 0x7fe, the constant added in the expected output); the fetched
; value is never used and the function returns void.
;
; In every target's expected output below, the offset is first added to the
; 64-bit address with explicit add instructions, and only then is the address
; aligned down with -4. The update itself is the same dword compare-and-swap
; loop (%atomicrmw.start): shift out the addressed half, subtract, shift it
; back, merge it with the untouched half using the inverted 0xffff mask, and
; retry until the cmpswap returns the expected value.
; The hawaii output additionally converts the operands through f32 and
; subtracts with v_sub_f32 before converting back to f16.
;
; The assertions are generated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
6989define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val) #0 {
6990; GFX12-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos:
6991; GFX12:       ; %bb.0:
6992; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
6993; GFX12-NEXT:    s_wait_expcnt 0x0
6994; GFX12-NEXT:    s_wait_samplecnt 0x0
6995; GFX12-NEXT:    s_wait_bvhcnt 0x0
6996; GFX12-NEXT:    s_wait_kmcnt 0x0
6997; GFX12-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
6998; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
6999; GFX12-NEXT:    s_mov_b32 s0, 0
7000; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
7001; GFX12-NEXT:    v_and_b32_e32 v0, -4, v3
7002; GFX12-NEXT:    v_and_b32_e32 v3, 3, v3
7003; GFX12-NEXT:    flat_load_b32 v4, v[0:1]
7004; GFX12-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
7005; GFX12-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
7006; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
7007; GFX12-NEXT:    v_not_b32_e32 v6, v3
7008; GFX12-NEXT:  .LBB26_1: ; %atomicrmw.start
7009; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
7010; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
7011; GFX12-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
7012; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7013; GFX12-NEXT:    v_sub_f16_e32 v3, v3, v2
7014; GFX12-NEXT:    v_and_b32_e32 v3, 0xffff, v3
7015; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7016; GFX12-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
7017; GFX12-NEXT:    v_and_or_b32 v3, v4, v6, v3
7018; GFX12-NEXT:    s_wait_storecnt 0x0
7019; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
7020; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
7021; GFX12-NEXT:    global_inv scope:SCOPE_DEV
7022; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
7023; GFX12-NEXT:    v_mov_b32_e32 v4, v3
7024; GFX12-NEXT:    s_wait_alu 0xfffe
7025; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
7026; GFX12-NEXT:    s_wait_alu 0xfffe
7027; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
7028; GFX12-NEXT:    s_cbranch_execnz .LBB26_1
7029; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
7030; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
7031; GFX12-NEXT:    s_wait_alu 0xfffe
7032; GFX12-NEXT:    s_setpc_b64 s[30:31]
7033;
7034; GFX940-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos:
7035; GFX940:       ; %bb.0:
7036; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7037; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
7038; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
7039; GFX940-NEXT:    v_and_b32_e32 v0, -4, v4
7040; GFX940-NEXT:    v_mov_b32_e32 v1, v5
7041; GFX940-NEXT:    flat_load_dword v5, v[0:1]
7042; GFX940-NEXT:    v_and_b32_e32 v3, 3, v4
7043; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
7044; GFX940-NEXT:    s_mov_b32 s0, 0xffff
7045; GFX940-NEXT:    v_lshlrev_b32_e64 v4, v3, s0
7046; GFX940-NEXT:    v_not_b32_e32 v6, v4
7047; GFX940-NEXT:    s_mov_b64 s[0:1], 0
7048; GFX940-NEXT:  .LBB26_1: ; %atomicrmw.start
7049; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
7050; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7051; GFX940-NEXT:    v_lshrrev_b32_e32 v4, v3, v5
7052; GFX940-NEXT:    v_sub_f16_e32 v4, v4, v2
7053; GFX940-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
7054; GFX940-NEXT:    v_and_or_b32 v4, v5, v6, v4
7055; GFX940-NEXT:    buffer_wbl2 sc1
7056; GFX940-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0
7057; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7058; GFX940-NEXT:    buffer_inv sc1
7059; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
7060; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
7061; GFX940-NEXT:    v_mov_b32_e32 v5, v4
7062; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
7063; GFX940-NEXT:    s_cbranch_execnz .LBB26_1
7064; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
7065; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
7066; GFX940-NEXT:    s_setpc_b64 s[30:31]
7067;
7068; GFX11-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos:
7069; GFX11:       ; %bb.0:
7070; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7071; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
7072; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
7073; GFX11-NEXT:    s_mov_b32 s0, 0
7074; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
7075; GFX11-NEXT:    v_and_b32_e32 v0, -4, v3
7076; GFX11-NEXT:    v_and_b32_e32 v3, 3, v3
7077; GFX11-NEXT:    flat_load_b32 v4, v[0:1]
7078; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
7079; GFX11-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
7080; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
7081; GFX11-NEXT:    v_not_b32_e32 v6, v3
7082; GFX11-NEXT:  .LBB26_1: ; %atomicrmw.start
7083; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
7084; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7085; GFX11-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
7086; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7087; GFX11-NEXT:    v_sub_f16_e32 v3, v3, v2
7088; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
7089; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7090; GFX11-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
7091; GFX11-NEXT:    v_and_or_b32 v3, v4, v6, v3
7092; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
7093; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
7094; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7095; GFX11-NEXT:    buffer_gl1_inv
7096; GFX11-NEXT:    buffer_gl0_inv
7097; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
7098; GFX11-NEXT:    v_mov_b32_e32 v4, v3
7099; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
7100; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
7101; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
7102; GFX11-NEXT:    s_cbranch_execnz .LBB26_1
7103; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
7104; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
7105; GFX11-NEXT:    s_setpc_b64 s[30:31]
7106;
7107; GFX10-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos:
7108; GFX10:       ; %bb.0:
7109; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7110; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
7111; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
7112; GFX10-NEXT:    s_mov_b32 s4, 0
7113; GFX10-NEXT:    v_and_b32_e32 v0, -4, v3
7114; GFX10-NEXT:    v_and_b32_e32 v3, 3, v3
7115; GFX10-NEXT:    flat_load_dword v4, v[0:1]
7116; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
7117; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
7118; GFX10-NEXT:    v_not_b32_e32 v6, v3
7119; GFX10-NEXT:  .LBB26_1: ; %atomicrmw.start
7120; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
7121; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7122; GFX10-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
7123; GFX10-NEXT:    v_sub_f16_e32 v3, v3, v2
7124; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
7125; GFX10-NEXT:    v_and_or_b32 v3, v4, v6, v3
7126; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
7127; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
7128; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7129; GFX10-NEXT:    buffer_gl1_inv
7130; GFX10-NEXT:    buffer_gl0_inv
7131; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
7132; GFX10-NEXT:    v_mov_b32_e32 v4, v3
7133; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
7134; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
7135; GFX10-NEXT:    s_cbranch_execnz .LBB26_1
7136; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
7137; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
7138; GFX10-NEXT:    s_setpc_b64 s[30:31]
7139;
7140; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos:
7141; GFX90A:       ; %bb.0:
7142; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7143; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
7144; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
7145; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v3
7146; GFX90A-NEXT:    flat_load_dword v5, v[0:1]
7147; GFX90A-NEXT:    v_and_b32_e32 v3, 3, v3
7148; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
7149; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
7150; GFX90A-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
7151; GFX90A-NEXT:    v_not_b32_e32 v6, v4
7152; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
7153; GFX90A-NEXT:  .LBB26_1: ; %atomicrmw.start
7154; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
7155; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7156; GFX90A-NEXT:    v_lshrrev_b32_e32 v4, v3, v5
7157; GFX90A-NEXT:    v_sub_f16_e32 v4, v4, v2
7158; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
7159; GFX90A-NEXT:    v_and_or_b32 v4, v5, v6, v4
7160; GFX90A-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
7161; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7162; GFX90A-NEXT:    buffer_wbinvl1
7163; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
7164; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7165; GFX90A-NEXT:    v_mov_b32_e32 v5, v4
7166; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7167; GFX90A-NEXT:    s_cbranch_execnz .LBB26_1
7168; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
7169; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
7170; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7171;
7172; GFX908-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos:
7173; GFX908:       ; %bb.0:
7174; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7175; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
7176; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
7177; GFX908-NEXT:    v_and_b32_e32 v0, -4, v3
7178; GFX908-NEXT:    flat_load_dword v4, v[0:1]
7179; GFX908-NEXT:    v_and_b32_e32 v3, 3, v3
7180; GFX908-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
7181; GFX908-NEXT:    s_mov_b32 s4, 0xffff
7182; GFX908-NEXT:    v_lshlrev_b32_e64 v3, v5, s4
7183; GFX908-NEXT:    v_not_b32_e32 v6, v3
7184; GFX908-NEXT:    s_mov_b64 s[4:5], 0
7185; GFX908-NEXT:  .LBB26_1: ; %atomicrmw.start
7186; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
7187; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7188; GFX908-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
7189; GFX908-NEXT:    v_sub_f16_e32 v3, v3, v2
7190; GFX908-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
7191; GFX908-NEXT:    v_and_or_b32 v3, v4, v6, v3
7192; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
7193; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7194; GFX908-NEXT:    buffer_wbinvl1
7195; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
7196; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7197; GFX908-NEXT:    v_mov_b32_e32 v4, v3
7198; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7199; GFX908-NEXT:    s_cbranch_execnz .LBB26_1
7200; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
7201; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
7202; GFX908-NEXT:    s_setpc_b64 s[30:31]
7203;
7204; GFX8-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos:
7205; GFX8:       ; %bb.0:
7206; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7207; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
7208; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
7209; GFX8-NEXT:    v_and_b32_e32 v0, -4, v3
7210; GFX8-NEXT:    flat_load_dword v4, v[0:1]
7211; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
7212; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
7213; GFX8-NEXT:    s_mov_b32 s4, 0xffff
7214; GFX8-NEXT:    v_lshlrev_b32_e64 v3, v5, s4
7215; GFX8-NEXT:    v_not_b32_e32 v6, v3
7216; GFX8-NEXT:    s_mov_b64 s[4:5], 0
7217; GFX8-NEXT:  .LBB26_1: ; %atomicrmw.start
7218; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
7219; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7220; GFX8-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
7221; GFX8-NEXT:    v_sub_f16_e32 v3, v3, v2
7222; GFX8-NEXT:    v_and_b32_e32 v7, v4, v6
7223; GFX8-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
7224; GFX8-NEXT:    v_or_b32_e32 v3, v7, v3
7225; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
7226; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7227; GFX8-NEXT:    buffer_wbinvl1
7228; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
7229; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7230; GFX8-NEXT:    v_mov_b32_e32 v4, v3
7231; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7232; GFX8-NEXT:    s_cbranch_execnz .LBB26_1
7233; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
7234; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
7235; GFX8-NEXT:    s_setpc_b64 s[30:31]
7236;
7237; GFX7-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos:
7238; GFX7:       ; %bb.0:
7239; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7240; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
7241; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
7242; GFX7-NEXT:    v_and_b32_e32 v0, -4, v4
7243; GFX7-NEXT:    flat_load_dword v3, v[0:1]
7244; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
7245; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
7246; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
7247; GFX7-NEXT:    s_mov_b64 s[4:5], 0
7248; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v2
7249; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
7250; GFX7-NEXT:    v_not_b32_e32 v6, v2
7251; GFX7-NEXT:  .LBB26_1: ; %atomicrmw.start
7252; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
7253; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7254; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
7255; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
7256; GFX7-NEXT:    v_and_b32_e32 v7, v3, v6
7257; GFX7-NEXT:    v_sub_f32_e32 v2, v2, v5
7258; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
7259; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
7260; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
7261; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7262; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7263; GFX7-NEXT:    buffer_wbinvl1
7264; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
7265; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7266; GFX7-NEXT:    v_mov_b32_e32 v3, v2
7267; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7268; GFX7-NEXT:    s_cbranch_execnz .LBB26_1
7269; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
7270; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
7271; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; 1023 halves * 2 bytes = 2046 = 0x7fe, the constant added in the checks above.
7272  %gep = getelementptr half, ptr %ptr, i64 1023
  ; %unused is never read: only the memory update matters to this function.
7273  %unused = atomicrmw fsub ptr %gep, half %val syncscope("agent") seq_cst
7274  ret void
7275}
7276
7277define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val) #0 {
7278; GFX12-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg:
7279; GFX12:       ; %bb.0:
7280; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
7281; GFX12-NEXT:    s_wait_expcnt 0x0
7282; GFX12-NEXT:    s_wait_samplecnt 0x0
7283; GFX12-NEXT:    s_wait_bvhcnt 0x0
7284; GFX12-NEXT:    s_wait_kmcnt 0x0
7285; GFX12-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
7286; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
7287; GFX12-NEXT:    s_mov_b32 s0, 0
7288; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
7289; GFX12-NEXT:    v_and_b32_e32 v0, -4, v3
7290; GFX12-NEXT:    v_and_b32_e32 v3, 3, v3
7291; GFX12-NEXT:    flat_load_b32 v4, v[0:1]
7292; GFX12-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
7293; GFX12-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
7294; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
7295; GFX12-NEXT:    v_not_b32_e32 v6, v3
7296; GFX12-NEXT:  .LBB27_1: ; %atomicrmw.start
7297; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
7298; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
7299; GFX12-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
7300; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7301; GFX12-NEXT:    v_sub_f16_e32 v3, v3, v2
7302; GFX12-NEXT:    v_and_b32_e32 v3, 0xffff, v3
7303; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7304; GFX12-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
7305; GFX12-NEXT:    v_and_or_b32 v3, v4, v6, v3
7306; GFX12-NEXT:    s_wait_storecnt 0x0
7307; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
7308; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
7309; GFX12-NEXT:    global_inv scope:SCOPE_DEV
7310; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
7311; GFX12-NEXT:    v_mov_b32_e32 v4, v3
7312; GFX12-NEXT:    s_wait_alu 0xfffe
7313; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
7314; GFX12-NEXT:    s_wait_alu 0xfffe
7315; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
7316; GFX12-NEXT:    s_cbranch_execnz .LBB27_1
7317; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
7318; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
7319; GFX12-NEXT:    s_wait_alu 0xfffe
7320; GFX12-NEXT:    s_setpc_b64 s[30:31]
7321;
7322; GFX940-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg:
7323; GFX940:       ; %bb.0:
7324; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7325; GFX940-NEXT:    s_movk_i32 s0, 0xf800
7326; GFX940-NEXT:    s_mov_b32 s1, -1
7327; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
7328; GFX940-NEXT:    v_and_b32_e32 v0, -4, v4
7329; GFX940-NEXT:    v_mov_b32_e32 v1, v5
7330; GFX940-NEXT:    flat_load_dword v5, v[0:1]
7331; GFX940-NEXT:    v_and_b32_e32 v3, 3, v4
7332; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
7333; GFX940-NEXT:    s_mov_b32 s0, 0xffff
7334; GFX940-NEXT:    v_lshlrev_b32_e64 v4, v3, s0
7335; GFX940-NEXT:    v_not_b32_e32 v6, v4
7336; GFX940-NEXT:    s_mov_b64 s[0:1], 0
7337; GFX940-NEXT:  .LBB27_1: ; %atomicrmw.start
7338; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
7339; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7340; GFX940-NEXT:    v_lshrrev_b32_e32 v4, v3, v5
7341; GFX940-NEXT:    v_sub_f16_e32 v4, v4, v2
7342; GFX940-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
7343; GFX940-NEXT:    v_and_or_b32 v4, v5, v6, v4
7344; GFX940-NEXT:    buffer_wbl2 sc1
7345; GFX940-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0
7346; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7347; GFX940-NEXT:    buffer_inv sc1
7348; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
7349; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
7350; GFX940-NEXT:    v_mov_b32_e32 v5, v4
7351; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
7352; GFX940-NEXT:    s_cbranch_execnz .LBB27_1
7353; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
7354; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
7355; GFX940-NEXT:    s_setpc_b64 s[30:31]
7356;
7357; GFX11-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg:
7358; GFX11:       ; %bb.0:
7359; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7360; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
7361; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
7362; GFX11-NEXT:    s_mov_b32 s0, 0
7363; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
7364; GFX11-NEXT:    v_and_b32_e32 v0, -4, v3
7365; GFX11-NEXT:    v_and_b32_e32 v3, 3, v3
7366; GFX11-NEXT:    flat_load_b32 v4, v[0:1]
7367; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
7368; GFX11-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
7369; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
7370; GFX11-NEXT:    v_not_b32_e32 v6, v3
7371; GFX11-NEXT:  .LBB27_1: ; %atomicrmw.start
7372; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
7373; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7374; GFX11-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
7375; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7376; GFX11-NEXT:    v_sub_f16_e32 v3, v3, v2
7377; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
7378; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7379; GFX11-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
7380; GFX11-NEXT:    v_and_or_b32 v3, v4, v6, v3
7381; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
7382; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
7383; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7384; GFX11-NEXT:    buffer_gl1_inv
7385; GFX11-NEXT:    buffer_gl0_inv
7386; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
7387; GFX11-NEXT:    v_mov_b32_e32 v4, v3
7388; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
7389; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
7390; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
7391; GFX11-NEXT:    s_cbranch_execnz .LBB27_1
7392; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
7393; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
7394; GFX11-NEXT:    s_setpc_b64 s[30:31]
7395;
7396; GFX10-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg:
7397; GFX10:       ; %bb.0:
7398; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7399; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
7400; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
7401; GFX10-NEXT:    s_mov_b32 s4, 0
7402; GFX10-NEXT:    v_and_b32_e32 v0, -4, v3
7403; GFX10-NEXT:    v_and_b32_e32 v3, 3, v3
7404; GFX10-NEXT:    flat_load_dword v4, v[0:1]
7405; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
7406; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
7407; GFX10-NEXT:    v_not_b32_e32 v6, v3
7408; GFX10-NEXT:  .LBB27_1: ; %atomicrmw.start
7409; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
7410; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7411; GFX10-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
7412; GFX10-NEXT:    v_sub_f16_e32 v3, v3, v2
7413; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
7414; GFX10-NEXT:    v_and_or_b32 v3, v4, v6, v3
7415; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
7416; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
7417; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7418; GFX10-NEXT:    buffer_gl1_inv
7419; GFX10-NEXT:    buffer_gl0_inv
7420; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
7421; GFX10-NEXT:    v_mov_b32_e32 v4, v3
7422; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
7423; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
7424; GFX10-NEXT:    s_cbranch_execnz .LBB27_1
7425; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
7426; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
7427; GFX10-NEXT:    s_setpc_b64 s[30:31]
7428;
7429; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg:
7430; GFX90A:       ; %bb.0:
7431; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7432; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
7433; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
7434; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v3
7435; GFX90A-NEXT:    flat_load_dword v5, v[0:1]
7436; GFX90A-NEXT:    v_and_b32_e32 v3, 3, v3
7437; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
7438; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
7439; GFX90A-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
7440; GFX90A-NEXT:    v_not_b32_e32 v6, v4
7441; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
7442; GFX90A-NEXT:  .LBB27_1: ; %atomicrmw.start
7443; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
7444; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7445; GFX90A-NEXT:    v_lshrrev_b32_e32 v4, v3, v5
7446; GFX90A-NEXT:    v_sub_f16_e32 v4, v4, v2
7447; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
7448; GFX90A-NEXT:    v_and_or_b32 v4, v5, v6, v4
7449; GFX90A-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
7450; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7451; GFX90A-NEXT:    buffer_wbinvl1
7452; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
7453; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7454; GFX90A-NEXT:    v_mov_b32_e32 v5, v4
7455; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7456; GFX90A-NEXT:    s_cbranch_execnz .LBB27_1
7457; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
7458; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
7459; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7460;
7461; GFX908-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg:
7462; GFX908:       ; %bb.0:
7463; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7464; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
7465; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
7466; GFX908-NEXT:    v_and_b32_e32 v0, -4, v3
7467; GFX908-NEXT:    flat_load_dword v4, v[0:1]
7468; GFX908-NEXT:    v_and_b32_e32 v3, 3, v3
7469; GFX908-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
7470; GFX908-NEXT:    s_mov_b32 s4, 0xffff
7471; GFX908-NEXT:    v_lshlrev_b32_e64 v3, v5, s4
7472; GFX908-NEXT:    v_not_b32_e32 v6, v3
7473; GFX908-NEXT:    s_mov_b64 s[4:5], 0
7474; GFX908-NEXT:  .LBB27_1: ; %atomicrmw.start
7475; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
7476; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7477; GFX908-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
7478; GFX908-NEXT:    v_sub_f16_e32 v3, v3, v2
7479; GFX908-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
7480; GFX908-NEXT:    v_and_or_b32 v3, v4, v6, v3
7481; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
7482; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7483; GFX908-NEXT:    buffer_wbinvl1
7484; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
7485; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7486; GFX908-NEXT:    v_mov_b32_e32 v4, v3
7487; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7488; GFX908-NEXT:    s_cbranch_execnz .LBB27_1
7489; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
7490; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
7491; GFX908-NEXT:    s_setpc_b64 s[30:31]
7492;
7493; GFX8-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg:
7494; GFX8:       ; %bb.0:
7495; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7496; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
7497; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
7498; GFX8-NEXT:    v_and_b32_e32 v0, -4, v3
7499; GFX8-NEXT:    flat_load_dword v4, v[0:1]
7500; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
7501; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
7502; GFX8-NEXT:    s_mov_b32 s4, 0xffff
7503; GFX8-NEXT:    v_lshlrev_b32_e64 v3, v5, s4
7504; GFX8-NEXT:    v_not_b32_e32 v6, v3
7505; GFX8-NEXT:    s_mov_b64 s[4:5], 0
7506; GFX8-NEXT:  .LBB27_1: ; %atomicrmw.start
7507; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
7508; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7509; GFX8-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
7510; GFX8-NEXT:    v_sub_f16_e32 v3, v3, v2
7511; GFX8-NEXT:    v_and_b32_e32 v7, v4, v6
7512; GFX8-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
7513; GFX8-NEXT:    v_or_b32_e32 v3, v7, v3
7514; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
7515; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7516; GFX8-NEXT:    buffer_wbinvl1
7517; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
7518; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7519; GFX8-NEXT:    v_mov_b32_e32 v4, v3
7520; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7521; GFX8-NEXT:    s_cbranch_execnz .LBB27_1
7522; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
7523; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
7524; GFX8-NEXT:    s_setpc_b64 s[30:31]
7525;
7526; GFX7-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg:
7527; GFX7:       ; %bb.0:
7528; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7529; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
7530; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
7531; GFX7-NEXT:    v_and_b32_e32 v0, -4, v4
7532; GFX7-NEXT:    flat_load_dword v3, v[0:1]
7533; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
7534; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
7535; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
7536; GFX7-NEXT:    s_mov_b64 s[4:5], 0
7537; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v2
7538; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
7539; GFX7-NEXT:    v_not_b32_e32 v6, v2
7540; GFX7-NEXT:  .LBB27_1: ; %atomicrmw.start
7541; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
7542; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7543; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
7544; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
7545; GFX7-NEXT:    v_and_b32_e32 v7, v3, v6
7546; GFX7-NEXT:    v_sub_f32_e32 v2, v2, v5
7547; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
7548; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
7549; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
7550; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7551; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7552; GFX7-NEXT:    buffer_wbinvl1
7553; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
7554; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7555; GFX7-NEXT:    v_mov_b32_e32 v3, v2
7556; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7557; GFX7-NEXT:    s_cbranch_execnz .LBB27_1
7558; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
7559; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
7560; GFX7-NEXT:    s_setpc_b64 s[30:31]
7561  %gep = getelementptr half, ptr %ptr, i64 -1024
7562  %unused = atomicrmw fsub ptr %gep, half %val syncscope("agent") seq_cst
7563  ret void
7564}
7565
; Returning form: `atomicrmw fsub` of a half through a flat pointer with
; syncscope("agent") and seq_cst ordering, addressed 1023 half elements past
; %ptr, with `align 4` on the atomic. The atomicrmw result is returned.
;
; Every check prefix below expects a compare-and-swap loop on a 32-bit word
; (%atomicrmw.start .. %atomicrmw.end): load the dword once, subtract, merge the
; new low 16 bits with the old upper 16 bits (0xffff0000 mask), issue the
; cmpswap, and repeat until the value it returns equals the expected value.
; None of the checks contain address-masking (and with -4) or variable-shift
; instructions.
;
; Offset handling differs by target: the gfx1200, gfx940, gfx1100, gfx90a and
; gfx908 checks put 2046 in the instruction's offset field, while the gfx1010,
; tonga and hawaii checks first add 0x7fe to the pointer with a 64-bit add.
; The hawaii checks perform the subtraction with v_sub_f32 between f16<->f32
; conversions and convert the result back to f32 before returning.
7566define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, half %val) #0 {
7567; GFX12-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4:
7568; GFX12:       ; %bb.0:
7569; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
7570; GFX12-NEXT:    s_wait_expcnt 0x0
7571; GFX12-NEXT:    s_wait_samplecnt 0x0
7572; GFX12-NEXT:    s_wait_bvhcnt 0x0
7573; GFX12-NEXT:    s_wait_kmcnt 0x0
7574; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2046
7575; GFX12-NEXT:    s_mov_b32 s0, 0
7576; GFX12-NEXT:  .LBB28_1: ; %atomicrmw.start
7577; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
7578; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
7579; GFX12-NEXT:    v_mov_b32_e32 v4, v3
7580; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7581; GFX12-NEXT:    v_sub_f16_e32 v3, v4, v2
7582; GFX12-NEXT:    v_and_b32_e32 v3, 0xffff, v3
7583; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
7584; GFX12-NEXT:    v_and_or_b32 v3, 0xffff0000, v4, v3
7585; GFX12-NEXT:    s_wait_storecnt 0x0
7586; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
7587; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
7588; GFX12-NEXT:    global_inv scope:SCOPE_DEV
7589; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
7590; GFX12-NEXT:    s_wait_alu 0xfffe
7591; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
7592; GFX12-NEXT:    s_wait_alu 0xfffe
7593; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
7594; GFX12-NEXT:    s_cbranch_execnz .LBB28_1
7595; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
7596; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
7597; GFX12-NEXT:    v_mov_b32_e32 v0, v3
7598; GFX12-NEXT:    s_wait_alu 0xfffe
7599; GFX12-NEXT:    s_setpc_b64 s[30:31]
7600;
7601; GFX940-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4:
7602; GFX940:       ; %bb.0:
7603; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7604; GFX940-NEXT:    flat_load_dword v3, v[0:1] offset:2046
7605; GFX940-NEXT:    s_mov_b64 s[0:1], 0
7606; GFX940-NEXT:    s_mov_b32 s2, 0xffff0000
7607; GFX940-NEXT:  .LBB28_1: ; %atomicrmw.start
7608; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
7609; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7610; GFX940-NEXT:    v_mov_b32_e32 v5, v3
7611; GFX940-NEXT:    v_sub_f16_e32 v3, v5, v2
7612; GFX940-NEXT:    v_and_or_b32 v4, v5, s2, v3
7613; GFX940-NEXT:    buffer_wbl2 sc1
7614; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0
7615; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7616; GFX940-NEXT:    buffer_inv sc1
7617; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
7618; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
7619; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
7620; GFX940-NEXT:    s_cbranch_execnz .LBB28_1
7621; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
7622; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
7623; GFX940-NEXT:    v_mov_b32_e32 v0, v3
7624; GFX940-NEXT:    s_setpc_b64 s[30:31]
7625;
7626; GFX11-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4:
7627; GFX11:       ; %bb.0:
7628; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7629; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2046
7630; GFX11-NEXT:    s_mov_b32 s0, 0
7631; GFX11-NEXT:  .LBB28_1: ; %atomicrmw.start
7632; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
7633; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7634; GFX11-NEXT:    v_mov_b32_e32 v4, v3
7635; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7636; GFX11-NEXT:    v_sub_f16_e32 v3, v4, v2
7637; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
7638; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
7639; GFX11-NEXT:    v_and_or_b32 v3, 0xffff0000, v4, v3
7640; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
7641; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
7642; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7643; GFX11-NEXT:    buffer_gl1_inv
7644; GFX11-NEXT:    buffer_gl0_inv
7645; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
7646; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
7647; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
7648; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
7649; GFX11-NEXT:    s_cbranch_execnz .LBB28_1
7650; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
7651; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
7652; GFX11-NEXT:    v_mov_b32_e32 v0, v3
7653; GFX11-NEXT:    s_setpc_b64 s[30:31]
7654;
7655; GFX10-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4:
7656; GFX10:       ; %bb.0:
7657; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7658; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
7659; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
7660; GFX10-NEXT:    s_mov_b32 s4, 0
7661; GFX10-NEXT:    flat_load_dword v0, v[3:4]
7662; GFX10-NEXT:  .LBB28_1: ; %atomicrmw.start
7663; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
7664; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7665; GFX10-NEXT:    v_mov_b32_e32 v1, v0
7666; GFX10-NEXT:    v_sub_f16_e32 v0, v1, v2
7667; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
7668; GFX10-NEXT:    v_and_or_b32 v0, 0xffff0000, v1, v0
7669; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
7670; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
7671; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7672; GFX10-NEXT:    buffer_gl1_inv
7673; GFX10-NEXT:    buffer_gl0_inv
7674; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
7675; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
7676; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
7677; GFX10-NEXT:    s_cbranch_execnz .LBB28_1
7678; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
7679; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
7680; GFX10-NEXT:    s_setpc_b64 s[30:31]
7681;
7682; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4:
7683; GFX90A:       ; %bb.0:
7684; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7685; GFX90A-NEXT:    flat_load_dword v3, v[0:1] offset:2046
7686; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
7687; GFX90A-NEXT:    s_mov_b32 s6, 0xffff0000
7688; GFX90A-NEXT:  .LBB28_1: ; %atomicrmw.start
7689; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
7690; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7691; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
7692; GFX90A-NEXT:    v_sub_f16_e32 v3, v5, v2
7693; GFX90A-NEXT:    v_and_or_b32 v4, v5, s6, v3
7694; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc
7695; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7696; GFX90A-NEXT:    buffer_wbinvl1
7697; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
7698; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7699; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7700; GFX90A-NEXT:    s_cbranch_execnz .LBB28_1
7701; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
7702; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
7703; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
7704; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7705;
7706; GFX908-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4:
7707; GFX908:       ; %bb.0:
7708; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7709; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2046
7710; GFX908-NEXT:    s_mov_b64 s[4:5], 0
7711; GFX908-NEXT:    s_mov_b32 s6, 0xffff0000
7712; GFX908-NEXT:  .LBB28_1: ; %atomicrmw.start
7713; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
7714; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7715; GFX908-NEXT:    v_mov_b32_e32 v4, v3
7716; GFX908-NEXT:    v_sub_f16_e32 v3, v4, v2
7717; GFX908-NEXT:    v_and_or_b32 v3, v4, s6, v3
7718; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc
7719; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7720; GFX908-NEXT:    buffer_wbinvl1
7721; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
7722; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7723; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7724; GFX908-NEXT:    s_cbranch_execnz .LBB28_1
7725; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
7726; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
7727; GFX908-NEXT:    v_mov_b32_e32 v0, v3
7728; GFX908-NEXT:    s_setpc_b64 s[30:31]
7729;
7730; GFX8-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4:
7731; GFX8:       ; %bb.0:
7732; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7733; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
7734; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
7735; GFX8-NEXT:    flat_load_dword v0, v[3:4]
7736; GFX8-NEXT:    s_mov_b64 s[4:5], 0
7737; GFX8-NEXT:  .LBB28_1: ; %atomicrmw.start
7738; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
7739; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7740; GFX8-NEXT:    v_mov_b32_e32 v1, v0
7741; GFX8-NEXT:    v_sub_f16_e32 v0, v1, v2
7742; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
7743; GFX8-NEXT:    v_or_b32_e32 v0, v5, v0
7744; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
7745; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7746; GFX8-NEXT:    buffer_wbinvl1
7747; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
7748; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7749; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7750; GFX8-NEXT:    s_cbranch_execnz .LBB28_1
7751; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
7752; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
7753; GFX8-NEXT:    s_setpc_b64 s[30:31]
7754;
7755; GFX7-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4:
7756; GFX7:       ; %bb.0:
7757; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7758; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fe, v0
7759; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
7760; GFX7-NEXT:    flat_load_dword v3, v[0:1]
7761; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
7762; GFX7-NEXT:    s_mov_b64 s[4:5], 0
7763; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
7764; GFX7-NEXT:  .LBB28_1: ; %atomicrmw.start
7765; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
7766; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7767; GFX7-NEXT:    v_mov_b32_e32 v4, v3
7768; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v4
7769; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
7770; GFX7-NEXT:    v_sub_f32_e32 v3, v3, v2
7771; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
7772; GFX7-NEXT:    v_or_b32_e32 v3, v5, v3
7773; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
7774; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7775; GFX7-NEXT:    buffer_wbinvl1
7776; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
7777; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7778; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7779; GFX7-NEXT:    s_cbranch_execnz .LBB28_1
7780; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
7781; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
7782; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v3
7783; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; 1023 half elements = 2046 bytes (0x7fe), the offset seen in the checks above.
7784  %gep = getelementptr half, ptr %ptr, i64 1023
  ; `align 4` is stated on the 2-byte atomic itself; the result is the function's return value.
7785  %result = atomicrmw fsub ptr %gep, half %val syncscope("agent") seq_cst, align 4
7786  ret half %result
7787}
7788
; No-return form: `atomicrmw fsub` of a half through a flat pointer with
; syncscope("agent") and seq_cst ordering, addressed 1023 half elements past
; %ptr, with `align 4` on the atomic. The atomicrmw result is unused and the
; function returns void.
;
; Every check prefix below expects a compare-and-swap loop on a 32-bit word
; (%atomicrmw.start .. %atomicrmw.end): load the dword once, subtract, merge the
; new low 16 bits with the old upper 16 bits (0xffff0000 mask), issue the
; cmpswap, and repeat until the value it returns equals the expected value.
; Although the IR result is unused, each cmpswap still writes a destination
; register, because that value feeds the v_cmp_eq that decides loop exit and is
; copied back as the next iteration's expected value.
;
; Offset handling differs by target: the gfx1200, gfx940, gfx1100, gfx90a and
; gfx908 checks put 2046 in the instruction's offset field, while the gfx1010,
; tonga and hawaii checks first add 0x7fe to the pointer with a 64-bit add.
; The hawaii checks perform the subtraction with v_sub_f32 between f16<->f32
; conversions.
7789define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, half %val) #0 {
7790; GFX12-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos:
7791; GFX12:       ; %bb.0:
7792; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
7793; GFX12-NEXT:    s_wait_expcnt 0x0
7794; GFX12-NEXT:    s_wait_samplecnt 0x0
7795; GFX12-NEXT:    s_wait_bvhcnt 0x0
7796; GFX12-NEXT:    s_wait_kmcnt 0x0
7797; GFX12-NEXT:    flat_load_b32 v4, v[0:1] offset:2046
7798; GFX12-NEXT:    s_mov_b32 s0, 0
7799; GFX12-NEXT:  .LBB29_1: ; %atomicrmw.start
7800; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
7801; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
7802; GFX12-NEXT:    v_sub_f16_e32 v3, v4, v2
7803; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7804; GFX12-NEXT:    v_and_b32_e32 v3, 0xffff, v3
7805; GFX12-NEXT:    v_and_or_b32 v3, 0xffff0000, v4, v3
7806; GFX12-NEXT:    s_wait_storecnt 0x0
7807; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
7808; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
7809; GFX12-NEXT:    global_inv scope:SCOPE_DEV
7810; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
7811; GFX12-NEXT:    v_mov_b32_e32 v4, v3
7812; GFX12-NEXT:    s_wait_alu 0xfffe
7813; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
7814; GFX12-NEXT:    s_wait_alu 0xfffe
7815; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
7816; GFX12-NEXT:    s_cbranch_execnz .LBB29_1
7817; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
7818; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
7819; GFX12-NEXT:    s_wait_alu 0xfffe
7820; GFX12-NEXT:    s_setpc_b64 s[30:31]
7821;
7822; GFX940-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos:
7823; GFX940:       ; %bb.0:
7824; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7825; GFX940-NEXT:    flat_load_dword v5, v[0:1] offset:2046
7826; GFX940-NEXT:    s_mov_b64 s[0:1], 0
7827; GFX940-NEXT:    s_mov_b32 s2, 0xffff0000
7828; GFX940-NEXT:  .LBB29_1: ; %atomicrmw.start
7829; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
7830; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7831; GFX940-NEXT:    v_sub_f16_e32 v3, v5, v2
7832; GFX940-NEXT:    v_and_or_b32 v4, v5, s2, v3
7833; GFX940-NEXT:    buffer_wbl2 sc1
7834; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0
7835; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7836; GFX940-NEXT:    buffer_inv sc1
7837; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
7838; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
7839; GFX940-NEXT:    v_mov_b32_e32 v5, v3
7840; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
7841; GFX940-NEXT:    s_cbranch_execnz .LBB29_1
7842; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
7843; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
7844; GFX940-NEXT:    s_setpc_b64 s[30:31]
7845;
7846; GFX11-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos:
7847; GFX11:       ; %bb.0:
7848; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7849; GFX11-NEXT:    flat_load_b32 v4, v[0:1] offset:2046
7850; GFX11-NEXT:    s_mov_b32 s0, 0
7851; GFX11-NEXT:  .LBB29_1: ; %atomicrmw.start
7852; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
7853; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7854; GFX11-NEXT:    v_sub_f16_e32 v3, v4, v2
7855; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7856; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
7857; GFX11-NEXT:    v_and_or_b32 v3, 0xffff0000, v4, v3
7858; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
7859; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
7860; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7861; GFX11-NEXT:    buffer_gl1_inv
7862; GFX11-NEXT:    buffer_gl0_inv
7863; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
7864; GFX11-NEXT:    v_mov_b32_e32 v4, v3
7865; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
7866; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
7867; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
7868; GFX11-NEXT:    s_cbranch_execnz .LBB29_1
7869; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
7870; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
7871; GFX11-NEXT:    s_setpc_b64 s[30:31]
7872;
7873; GFX10-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos:
7874; GFX10:       ; %bb.0:
7875; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7876; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fe, v0
7877; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
7878; GFX10-NEXT:    s_mov_b32 s4, 0
7879; GFX10-NEXT:    flat_load_dword v4, v[0:1]
7880; GFX10-NEXT:  .LBB29_1: ; %atomicrmw.start
7881; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
7882; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7883; GFX10-NEXT:    v_sub_f16_e32 v3, v4, v2
7884; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff, v3
7885; GFX10-NEXT:    v_and_or_b32 v3, 0xffff0000, v4, v3
7886; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
7887; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
7888; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7889; GFX10-NEXT:    buffer_gl1_inv
7890; GFX10-NEXT:    buffer_gl0_inv
7891; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
7892; GFX10-NEXT:    v_mov_b32_e32 v4, v3
7893; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
7894; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
7895; GFX10-NEXT:    s_cbranch_execnz .LBB29_1
7896; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
7897; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
7898; GFX10-NEXT:    s_setpc_b64 s[30:31]
7899;
7900; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos:
7901; GFX90A:       ; %bb.0:
7902; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7903; GFX90A-NEXT:    flat_load_dword v5, v[0:1] offset:2046
7904; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
7905; GFX90A-NEXT:    s_mov_b32 s6, 0xffff0000
7906; GFX90A-NEXT:  .LBB29_1: ; %atomicrmw.start
7907; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
7908; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7909; GFX90A-NEXT:    v_sub_f16_e32 v3, v5, v2
7910; GFX90A-NEXT:    v_and_or_b32 v4, v5, s6, v3
7911; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc
7912; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7913; GFX90A-NEXT:    buffer_wbinvl1
7914; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
7915; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7916; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
7917; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7918; GFX90A-NEXT:    s_cbranch_execnz .LBB29_1
7919; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
7920; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
7921; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7922;
7923; GFX908-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos:
7924; GFX908:       ; %bb.0:
7925; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7926; GFX908-NEXT:    flat_load_dword v4, v[0:1] offset:2046
7927; GFX908-NEXT:    s_mov_b64 s[4:5], 0
7928; GFX908-NEXT:    s_mov_b32 s6, 0xffff0000
7929; GFX908-NEXT:  .LBB29_1: ; %atomicrmw.start
7930; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
7931; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7932; GFX908-NEXT:    v_sub_f16_e32 v3, v4, v2
7933; GFX908-NEXT:    v_and_or_b32 v3, v4, s6, v3
7934; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc
7935; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7936; GFX908-NEXT:    buffer_wbinvl1
7937; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
7938; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7939; GFX908-NEXT:    v_mov_b32_e32 v4, v3
7940; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7941; GFX908-NEXT:    s_cbranch_execnz .LBB29_1
7942; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
7943; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
7944; GFX908-NEXT:    s_setpc_b64 s[30:31]
7945;
7946; GFX8-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos:
7947; GFX8:       ; %bb.0:
7948; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7949; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fe, v0
7950; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
7951; GFX8-NEXT:    flat_load_dword v4, v[0:1]
7952; GFX8-NEXT:    s_mov_b64 s[4:5], 0
7953; GFX8-NEXT:  .LBB29_1: ; %atomicrmw.start
7954; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
7955; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7956; GFX8-NEXT:    v_sub_f16_e32 v3, v4, v2
7957; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
7958; GFX8-NEXT:    v_or_b32_e32 v3, v5, v3
7959; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
7960; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7961; GFX8-NEXT:    buffer_wbinvl1
7962; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
7963; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7964; GFX8-NEXT:    v_mov_b32_e32 v4, v3
7965; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7966; GFX8-NEXT:    s_cbranch_execnz .LBB29_1
7967; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
7968; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
7969; GFX8-NEXT:    s_setpc_b64 s[30:31]
7970;
7971; GFX7-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos:
7972; GFX7:       ; %bb.0:
7973; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7974; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fe, v0
7975; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
7976; GFX7-NEXT:    flat_load_dword v3, v[0:1]
7977; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
7978; GFX7-NEXT:    s_mov_b64 s[4:5], 0
7979; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v2
7980; GFX7-NEXT:  .LBB29_1: ; %atomicrmw.start
7981; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
7982; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7983; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v3
7984; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
7985; GFX7-NEXT:    v_sub_f32_e32 v2, v2, v4
7986; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
7987; GFX7-NEXT:    v_or_b32_e32 v2, v5, v2
7988; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7989; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7990; GFX7-NEXT:    buffer_wbinvl1
7991; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
7992; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7993; GFX7-NEXT:    v_mov_b32_e32 v3, v2
7994; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7995; GFX7-NEXT:    s_cbranch_execnz .LBB29_1
7996; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
7997; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
7998; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; 1023 half elements = 2046 bytes (0x7fe), the offset seen in the checks above.
7999  %gep = getelementptr half, ptr %ptr, i64 1023
  ; `align 4` is stated on the 2-byte atomic itself; the result is intentionally unused.
8000  %unused = atomicrmw fsub ptr %gep, half %val syncscope("agent") seq_cst, align 4
8001  ret void
8002}
8003
; Value-returning `atomicrmw fsub` on a half with seq_cst ordering. The
; atomicrmw at the bottom carries no syncscope, which is the system-scope case
; the function name refers to. The pointer is advanced by 1023 half elements,
; i.e. 2046 = 0x7fe bytes, which is the constant each target's address
; computation adds in the checks below.
;
; Every checked target expands the operation into a compare-and-swap loop on
; the containing 32-bit word: the address is masked with -4, the low two
; address bits are scaled by 8 into a bit shift, and a shifted-then-inverted
; 0xffff mask preserves the other half of the word. After the loop the value
; returned by the last cmpswap is shifted back down into v0. The hawaii run
; (GFX7 checks) uses v_cvt_f32_f16 / v_sub_f32 / v_cvt_f16_f32 where the other
; targets use v_sub_f16.
;
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
8004define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) #0 {
8005; GFX12-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos:
8006; GFX12:       ; %bb.0:
8007; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
8008; GFX12-NEXT:    s_wait_expcnt 0x0
8009; GFX12-NEXT:    s_wait_samplecnt 0x0
8010; GFX12-NEXT:    s_wait_bvhcnt 0x0
8011; GFX12-NEXT:    s_wait_kmcnt 0x0
8012; GFX12-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
8013; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
8014; GFX12-NEXT:    s_mov_b32 s0, 0
8015; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
8016; GFX12-NEXT:    v_and_b32_e32 v0, -4, v3
8017; GFX12-NEXT:    v_and_b32_e32 v3, 3, v3
8018; GFX12-NEXT:    flat_load_b32 v5, v[0:1]
8019; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8020; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
8021; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
8022; GFX12-NEXT:    v_not_b32_e32 v4, v4
8023; GFX12-NEXT:  .LBB30_1: ; %atomicrmw.start
8024; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
8025; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
8026; GFX12-NEXT:    v_mov_b32_e32 v6, v5
8027; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8028; GFX12-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
8029; GFX12-NEXT:    v_sub_f16_e32 v5, v5, v2
8030; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8031; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff, v5
8032; GFX12-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
8033; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
8034; GFX12-NEXT:    v_and_or_b32 v5, v6, v4, v5
8035; GFX12-NEXT:    global_wb scope:SCOPE_SYS
8036; GFX12-NEXT:    s_wait_storecnt 0x0
8037; GFX12-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
8038; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
8039; GFX12-NEXT:    global_inv scope:SCOPE_SYS
8040; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
8041; GFX12-NEXT:    s_wait_alu 0xfffe
8042; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
8043; GFX12-NEXT:    s_wait_alu 0xfffe
8044; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
8045; GFX12-NEXT:    s_cbranch_execnz .LBB30_1
8046; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
8047; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
8048; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
8049; GFX12-NEXT:    s_wait_alu 0xfffe
8050; GFX12-NEXT:    s_setpc_b64 s[30:31]
8051;
8052; GFX940-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos:
8053; GFX940:       ; %bb.0:
8054; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8055; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
8056; GFX940-NEXT:    v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1]
8057; GFX940-NEXT:    v_and_b32_e32 v0, -4, v6
8058; GFX940-NEXT:    v_mov_b32_e32 v1, v7
8059; GFX940-NEXT:    flat_load_dword v4, v[0:1]
8060; GFX940-NEXT:    v_and_b32_e32 v3, 3, v6
8061; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8062; GFX940-NEXT:    s_mov_b32 s0, 0xffff
8063; GFX940-NEXT:    v_lshlrev_b32_e64 v5, v3, s0
8064; GFX940-NEXT:    v_not_b32_e32 v5, v5
8065; GFX940-NEXT:    s_mov_b64 s[0:1], 0
8066; GFX940-NEXT:  .LBB30_1: ; %atomicrmw.start
8067; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
8068; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8069; GFX940-NEXT:    v_mov_b32_e32 v7, v4
8070; GFX940-NEXT:    v_lshrrev_b32_e32 v4, v3, v7
8071; GFX940-NEXT:    v_sub_f16_e32 v4, v4, v2
8072; GFX940-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
8073; GFX940-NEXT:    v_and_or_b32 v6, v7, v5, v4
8074; GFX940-NEXT:    buffer_wbl2 sc0 sc1
8075; GFX940-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 sc1
8076; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8077; GFX940-NEXT:    buffer_inv sc0 sc1
8078; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
8079; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
8080; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
8081; GFX940-NEXT:    s_cbranch_execnz .LBB30_1
8082; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
8083; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
8084; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v4
8085; GFX940-NEXT:    s_setpc_b64 s[30:31]
8086;
8087; GFX11-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos:
8088; GFX11:       ; %bb.0:
8089; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8090; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
8091; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
8092; GFX11-NEXT:    s_mov_b32 s0, 0
8093; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
8094; GFX11-NEXT:    v_and_b32_e32 v0, -4, v3
8095; GFX11-NEXT:    v_and_b32_e32 v3, 3, v3
8096; GFX11-NEXT:    flat_load_b32 v5, v[0:1]
8097; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8098; GFX11-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
8099; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
8100; GFX11-NEXT:    v_not_b32_e32 v4, v4
8101; GFX11-NEXT:  .LBB30_1: ; %atomicrmw.start
8102; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
8103; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8104; GFX11-NEXT:    v_mov_b32_e32 v6, v5
8105; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8106; GFX11-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
8107; GFX11-NEXT:    v_sub_f16_e32 v5, v5, v2
8108; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8109; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
8110; GFX11-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
8111; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
8112; GFX11-NEXT:    v_and_or_b32 v5, v6, v4, v5
8113; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
8114; GFX11-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
8115; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8116; GFX11-NEXT:    buffer_gl1_inv
8117; GFX11-NEXT:    buffer_gl0_inv
8118; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
8119; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
8120; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
8121; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
8122; GFX11-NEXT:    s_cbranch_execnz .LBB30_1
8123; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
8124; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
8125; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
8126; GFX11-NEXT:    s_setpc_b64 s[30:31]
8127;
8128; GFX10-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos:
8129; GFX10:       ; %bb.0:
8130; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8131; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
8132; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
8133; GFX10-NEXT:    s_mov_b32 s4, 0
8134; GFX10-NEXT:    v_and_b32_e32 v0, -4, v3
8135; GFX10-NEXT:    v_and_b32_e32 v3, 3, v3
8136; GFX10-NEXT:    flat_load_dword v5, v[0:1]
8137; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8138; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
8139; GFX10-NEXT:    v_not_b32_e32 v4, v4
8140; GFX10-NEXT:  .LBB30_1: ; %atomicrmw.start
8141; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
8142; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8143; GFX10-NEXT:    v_mov_b32_e32 v6, v5
8144; GFX10-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
8145; GFX10-NEXT:    v_sub_f16_e32 v5, v5, v2
8146; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
8147; GFX10-NEXT:    v_and_or_b32 v5, v6, v4, v5
8148; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
8149; GFX10-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
8150; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8151; GFX10-NEXT:    buffer_gl1_inv
8152; GFX10-NEXT:    buffer_gl0_inv
8153; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
8154; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
8155; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
8156; GFX10-NEXT:    s_cbranch_execnz .LBB30_1
8157; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
8158; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
8159; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
8160; GFX10-NEXT:    s_setpc_b64 s[30:31]
8161;
8162; GFX90A-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos:
8163; GFX90A:       ; %bb.0:
8164; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8165; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
8166; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
8167; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v3
8168; GFX90A-NEXT:    flat_load_dword v4, v[0:1]
8169; GFX90A-NEXT:    v_and_b32_e32 v3, 3, v3
8170; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8171; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
8172; GFX90A-NEXT:    v_lshlrev_b32_e64 v5, v3, s4
8173; GFX90A-NEXT:    v_not_b32_e32 v5, v5
8174; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
8175; GFX90A-NEXT:  .LBB30_1: ; %atomicrmw.start
8176; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
8177; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8178; GFX90A-NEXT:    v_mov_b32_e32 v7, v4
8179; GFX90A-NEXT:    v_lshrrev_b32_e32 v4, v3, v7
8180; GFX90A-NEXT:    v_sub_f16_e32 v4, v4, v2
8181; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
8182; GFX90A-NEXT:    v_and_or_b32 v6, v7, v5, v4
8183; GFX90A-NEXT:    buffer_wbl2
8184; GFX90A-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
8185; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8186; GFX90A-NEXT:    buffer_invl2
8187; GFX90A-NEXT:    buffer_wbinvl1
8188; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
8189; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8190; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8191; GFX90A-NEXT:    s_cbranch_execnz .LBB30_1
8192; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
8193; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
8194; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v4
8195; GFX90A-NEXT:    s_setpc_b64 s[30:31]
8196;
8197; GFX908-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos:
8198; GFX908:       ; %bb.0:
8199; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8200; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
8201; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
8202; GFX908-NEXT:    v_and_b32_e32 v0, -4, v3
8203; GFX908-NEXT:    flat_load_dword v4, v[0:1]
8204; GFX908-NEXT:    v_and_b32_e32 v3, 3, v3
8205; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8206; GFX908-NEXT:    s_mov_b32 s4, 0xffff
8207; GFX908-NEXT:    v_lshlrev_b32_e64 v5, v3, s4
8208; GFX908-NEXT:    v_not_b32_e32 v5, v5
8209; GFX908-NEXT:    s_mov_b64 s[4:5], 0
8210; GFX908-NEXT:  .LBB30_1: ; %atomicrmw.start
8211; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
8212; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8213; GFX908-NEXT:    v_mov_b32_e32 v7, v4
8214; GFX908-NEXT:    v_lshrrev_b32_e32 v4, v3, v7
8215; GFX908-NEXT:    v_sub_f16_e32 v4, v4, v2
8216; GFX908-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
8217; GFX908-NEXT:    v_and_or_b32 v6, v7, v5, v4
8218; GFX908-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
8219; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8220; GFX908-NEXT:    buffer_wbinvl1
8221; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
8222; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8223; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8224; GFX908-NEXT:    s_cbranch_execnz .LBB30_1
8225; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
8226; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
8227; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v4
8228; GFX908-NEXT:    s_setpc_b64 s[30:31]
8229;
8230; GFX8-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos:
8231; GFX8:       ; %bb.0:
8232; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8233; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
8234; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
8235; GFX8-NEXT:    v_and_b32_e32 v0, -4, v3
8236; GFX8-NEXT:    flat_load_dword v5, v[0:1]
8237; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
8238; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8239; GFX8-NEXT:    s_mov_b32 s4, 0xffff
8240; GFX8-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
8241; GFX8-NEXT:    v_not_b32_e32 v4, v4
8242; GFX8-NEXT:    s_mov_b64 s[4:5], 0
8243; GFX8-NEXT:  .LBB30_1: ; %atomicrmw.start
8244; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
8245; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8246; GFX8-NEXT:    v_mov_b32_e32 v6, v5
8247; GFX8-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
8248; GFX8-NEXT:    v_sub_f16_e32 v5, v5, v2
8249; GFX8-NEXT:    v_and_b32_e32 v7, v6, v4
8250; GFX8-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
8251; GFX8-NEXT:    v_or_b32_e32 v5, v7, v5
8252; GFX8-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
8253; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8254; GFX8-NEXT:    buffer_wbinvl1
8255; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
8256; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8257; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8258; GFX8-NEXT:    s_cbranch_execnz .LBB30_1
8259; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
8260; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
8261; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
8262; GFX8-NEXT:    s_setpc_b64 s[30:31]
8263;
8264; GFX7-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos:
8265; GFX7:       ; %bb.0:
8266; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8267; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 0x7fe, v0
8268; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
8269; GFX7-NEXT:    v_and_b32_e32 v0, -4, v3
8270; GFX7-NEXT:    flat_load_dword v5, v[0:1]
8271; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v2
8272; GFX7-NEXT:    v_and_b32_e32 v2, 3, v3
8273; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
8274; GFX7-NEXT:    s_mov_b64 s[4:5], 0
8275; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v4
8276; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
8277; GFX7-NEXT:    v_not_b32_e32 v4, v4
8278; GFX7-NEXT:  .LBB30_1: ; %atomicrmw.start
8279; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
8280; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8281; GFX7-NEXT:    v_mov_b32_e32 v6, v5
8282; GFX7-NEXT:    v_lshrrev_b32_e32 v5, v2, v6
8283; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
8284; GFX7-NEXT:    v_and_b32_e32 v7, v6, v4
8285; GFX7-NEXT:    v_sub_f32_e32 v5, v5, v3
8286; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
8287; GFX7-NEXT:    v_lshlrev_b32_e32 v5, v2, v5
8288; GFX7-NEXT:    v_or_b32_e32 v5, v7, v5
8289; GFX7-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
8290; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8291; GFX7-NEXT:    buffer_wbinvl1
8292; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
8293; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8294; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8295; GFX7-NEXT:    s_cbranch_execnz .LBB30_1
8296; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
8297; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
8298; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v2, v5
8299; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
8300; GFX7-NEXT:    s_setpc_b64 s[30:31]
; Element index 1023 of a half array is byte offset 1023 * 2 = 2046 (0x7fe).
8301  %gep = getelementptr half, ptr %ptr, i64 1023
; No syncscope and no explicit align on this atomicrmw; its result (the value
; previously in memory) is what the function returns.
8302  %result = atomicrmw fsub ptr %gep, half %val seq_cst
8303  ret half %result
8304}
8305
; No-return counterpart of the half fsub test: same seq_cst `atomicrmw fsub`
; with no syncscope at byte offset 2046 = 0x7fe (1023 half elements), but the
; IR result is named %unused and the function returns void.
;
; The checked code is still a compare-and-swap loop on the containing 32-bit
; word (address masked with -4, low two address bits scaled by 8 into a shift,
; inverted shifted 0xffff mask keeping the other half). The cmpswap keeps its
; returning form on every target because the loop compares the returned word
; against the expected one (v_cmp_eq_u32) and feeds it back as the next
; expected value; there is no trailing shift into v0 after the loop. The
; hawaii run (GFX7 checks) uses v_cvt_f32_f16 / v_sub_f32 / v_cvt_f16_f32 where
; the other targets use v_sub_f16.
;
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
8306define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val) #0 {
8307; GFX12-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos:
8308; GFX12:       ; %bb.0:
8309; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
8310; GFX12-NEXT:    s_wait_expcnt 0x0
8311; GFX12-NEXT:    s_wait_samplecnt 0x0
8312; GFX12-NEXT:    s_wait_bvhcnt 0x0
8313; GFX12-NEXT:    s_wait_kmcnt 0x0
8314; GFX12-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
8315; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
8316; GFX12-NEXT:    s_mov_b32 s0, 0
8317; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
8318; GFX12-NEXT:    v_and_b32_e32 v0, -4, v3
8319; GFX12-NEXT:    v_and_b32_e32 v3, 3, v3
8320; GFX12-NEXT:    flat_load_b32 v4, v[0:1]
8321; GFX12-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
8322; GFX12-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
8323; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
8324; GFX12-NEXT:    v_not_b32_e32 v6, v3
8325; GFX12-NEXT:  .LBB31_1: ; %atomicrmw.start
8326; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
8327; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
8328; GFX12-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
8329; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8330; GFX12-NEXT:    v_sub_f16_e32 v3, v3, v2
8331; GFX12-NEXT:    v_and_b32_e32 v3, 0xffff, v3
8332; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8333; GFX12-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
8334; GFX12-NEXT:    v_and_or_b32 v3, v4, v6, v3
8335; GFX12-NEXT:    global_wb scope:SCOPE_SYS
8336; GFX12-NEXT:    s_wait_storecnt 0x0
8337; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
8338; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
8339; GFX12-NEXT:    global_inv scope:SCOPE_SYS
8340; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
8341; GFX12-NEXT:    v_mov_b32_e32 v4, v3
8342; GFX12-NEXT:    s_wait_alu 0xfffe
8343; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
8344; GFX12-NEXT:    s_wait_alu 0xfffe
8345; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
8346; GFX12-NEXT:    s_cbranch_execnz .LBB31_1
8347; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
8348; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
8349; GFX12-NEXT:    s_wait_alu 0xfffe
8350; GFX12-NEXT:    s_setpc_b64 s[30:31]
8351;
8352; GFX940-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos:
8353; GFX940:       ; %bb.0:
8354; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8355; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
8356; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
8357; GFX940-NEXT:    v_and_b32_e32 v0, -4, v4
8358; GFX940-NEXT:    v_mov_b32_e32 v1, v5
8359; GFX940-NEXT:    flat_load_dword v5, v[0:1]
8360; GFX940-NEXT:    v_and_b32_e32 v3, 3, v4
8361; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8362; GFX940-NEXT:    s_mov_b32 s0, 0xffff
8363; GFX940-NEXT:    v_lshlrev_b32_e64 v4, v3, s0
8364; GFX940-NEXT:    v_not_b32_e32 v6, v4
8365; GFX940-NEXT:    s_mov_b64 s[0:1], 0
8366; GFX940-NEXT:  .LBB31_1: ; %atomicrmw.start
8367; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
8368; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8369; GFX940-NEXT:    v_lshrrev_b32_e32 v4, v3, v5
8370; GFX940-NEXT:    v_sub_f16_e32 v4, v4, v2
8371; GFX940-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
8372; GFX940-NEXT:    v_and_or_b32 v4, v5, v6, v4
8373; GFX940-NEXT:    buffer_wbl2 sc0 sc1
8374; GFX940-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 sc1
8375; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8376; GFX940-NEXT:    buffer_inv sc0 sc1
8377; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
8378; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
8379; GFX940-NEXT:    v_mov_b32_e32 v5, v4
8380; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
8381; GFX940-NEXT:    s_cbranch_execnz .LBB31_1
8382; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
8383; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
8384; GFX940-NEXT:    s_setpc_b64 s[30:31]
8385;
8386; GFX11-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos:
8387; GFX11:       ; %bb.0:
8388; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8389; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
8390; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
8391; GFX11-NEXT:    s_mov_b32 s0, 0
8392; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
8393; GFX11-NEXT:    v_and_b32_e32 v0, -4, v3
8394; GFX11-NEXT:    v_and_b32_e32 v3, 3, v3
8395; GFX11-NEXT:    flat_load_b32 v4, v[0:1]
8396; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
8397; GFX11-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
8398; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
8399; GFX11-NEXT:    v_not_b32_e32 v6, v3
8400; GFX11-NEXT:  .LBB31_1: ; %atomicrmw.start
8401; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
8402; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8403; GFX11-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
8404; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8405; GFX11-NEXT:    v_sub_f16_e32 v3, v3, v2
8406; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
8407; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8408; GFX11-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
8409; GFX11-NEXT:    v_and_or_b32 v3, v4, v6, v3
8410; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
8411; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
8412; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8413; GFX11-NEXT:    buffer_gl1_inv
8414; GFX11-NEXT:    buffer_gl0_inv
8415; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
8416; GFX11-NEXT:    v_mov_b32_e32 v4, v3
8417; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
8418; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
8419; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
8420; GFX11-NEXT:    s_cbranch_execnz .LBB31_1
8421; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
8422; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
8423; GFX11-NEXT:    s_setpc_b64 s[30:31]
8424;
8425; GFX10-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos:
8426; GFX10:       ; %bb.0:
8427; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8428; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
8429; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
8430; GFX10-NEXT:    s_mov_b32 s4, 0
8431; GFX10-NEXT:    v_and_b32_e32 v0, -4, v3
8432; GFX10-NEXT:    v_and_b32_e32 v3, 3, v3
8433; GFX10-NEXT:    flat_load_dword v4, v[0:1]
8434; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
8435; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
8436; GFX10-NEXT:    v_not_b32_e32 v6, v3
8437; GFX10-NEXT:  .LBB31_1: ; %atomicrmw.start
8438; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
8439; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8440; GFX10-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
8441; GFX10-NEXT:    v_sub_f16_e32 v3, v3, v2
8442; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
8443; GFX10-NEXT:    v_and_or_b32 v3, v4, v6, v3
8444; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
8445; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
8446; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8447; GFX10-NEXT:    buffer_gl1_inv
8448; GFX10-NEXT:    buffer_gl0_inv
8449; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
8450; GFX10-NEXT:    v_mov_b32_e32 v4, v3
8451; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
8452; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
8453; GFX10-NEXT:    s_cbranch_execnz .LBB31_1
8454; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
8455; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
8456; GFX10-NEXT:    s_setpc_b64 s[30:31]
8457;
8458; GFX90A-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos:
8459; GFX90A:       ; %bb.0:
8460; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8461; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
8462; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
8463; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v3
8464; GFX90A-NEXT:    flat_load_dword v5, v[0:1]
8465; GFX90A-NEXT:    v_and_b32_e32 v3, 3, v3
8466; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8467; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
8468; GFX90A-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
8469; GFX90A-NEXT:    v_not_b32_e32 v6, v4
8470; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
8471; GFX90A-NEXT:  .LBB31_1: ; %atomicrmw.start
8472; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
8473; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8474; GFX90A-NEXT:    v_lshrrev_b32_e32 v4, v3, v5
8475; GFX90A-NEXT:    v_sub_f16_e32 v4, v4, v2
8476; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
8477; GFX90A-NEXT:    v_and_or_b32 v4, v5, v6, v4
8478; GFX90A-NEXT:    buffer_wbl2
8479; GFX90A-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
8480; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8481; GFX90A-NEXT:    buffer_invl2
8482; GFX90A-NEXT:    buffer_wbinvl1
8483; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
8484; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8485; GFX90A-NEXT:    v_mov_b32_e32 v5, v4
8486; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8487; GFX90A-NEXT:    s_cbranch_execnz .LBB31_1
8488; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
8489; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
8490; GFX90A-NEXT:    s_setpc_b64 s[30:31]
8491;
8492; GFX908-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos:
8493; GFX908:       ; %bb.0:
8494; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8495; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
8496; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
8497; GFX908-NEXT:    v_and_b32_e32 v0, -4, v3
8498; GFX908-NEXT:    flat_load_dword v4, v[0:1]
8499; GFX908-NEXT:    v_and_b32_e32 v3, 3, v3
8500; GFX908-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
8501; GFX908-NEXT:    s_mov_b32 s4, 0xffff
8502; GFX908-NEXT:    v_lshlrev_b32_e64 v3, v5, s4
8503; GFX908-NEXT:    v_not_b32_e32 v6, v3
8504; GFX908-NEXT:    s_mov_b64 s[4:5], 0
8505; GFX908-NEXT:  .LBB31_1: ; %atomicrmw.start
8506; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
8507; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8508; GFX908-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
8509; GFX908-NEXT:    v_sub_f16_e32 v3, v3, v2
8510; GFX908-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
8511; GFX908-NEXT:    v_and_or_b32 v3, v4, v6, v3
8512; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
8513; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8514; GFX908-NEXT:    buffer_wbinvl1
8515; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
8516; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8517; GFX908-NEXT:    v_mov_b32_e32 v4, v3
8518; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8519; GFX908-NEXT:    s_cbranch_execnz .LBB31_1
8520; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
8521; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
8522; GFX908-NEXT:    s_setpc_b64 s[30:31]
8523;
8524; GFX8-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos:
8525; GFX8:       ; %bb.0:
8526; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8527; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
8528; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
8529; GFX8-NEXT:    v_and_b32_e32 v0, -4, v3
8530; GFX8-NEXT:    flat_load_dword v4, v[0:1]
8531; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
8532; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
8533; GFX8-NEXT:    s_mov_b32 s4, 0xffff
8534; GFX8-NEXT:    v_lshlrev_b32_e64 v3, v5, s4
8535; GFX8-NEXT:    v_not_b32_e32 v6, v3
8536; GFX8-NEXT:    s_mov_b64 s[4:5], 0
8537; GFX8-NEXT:  .LBB31_1: ; %atomicrmw.start
8538; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
8539; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8540; GFX8-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
8541; GFX8-NEXT:    v_sub_f16_e32 v3, v3, v2
8542; GFX8-NEXT:    v_and_b32_e32 v7, v4, v6
8543; GFX8-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
8544; GFX8-NEXT:    v_or_b32_e32 v3, v7, v3
8545; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
8546; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8547; GFX8-NEXT:    buffer_wbinvl1
8548; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
8549; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8550; GFX8-NEXT:    v_mov_b32_e32 v4, v3
8551; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8552; GFX8-NEXT:    s_cbranch_execnz .LBB31_1
8553; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
8554; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
8555; GFX8-NEXT:    s_setpc_b64 s[30:31]
8556;
8557; GFX7-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos:
8558; GFX7:       ; %bb.0:
8559; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8560; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
8561; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
8562; GFX7-NEXT:    v_and_b32_e32 v0, -4, v4
8563; GFX7-NEXT:    flat_load_dword v3, v[0:1]
8564; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
8565; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
8566; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
8567; GFX7-NEXT:    s_mov_b64 s[4:5], 0
8568; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v2
8569; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
8570; GFX7-NEXT:    v_not_b32_e32 v6, v2
8571; GFX7-NEXT:  .LBB31_1: ; %atomicrmw.start
8572; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
8573; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8574; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
8575; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
8576; GFX7-NEXT:    v_and_b32_e32 v7, v3, v6
8577; GFX7-NEXT:    v_sub_f32_e32 v2, v2, v5
8578; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
8579; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
8580; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
8581; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8582; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8583; GFX7-NEXT:    buffer_wbinvl1
8584; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
8585; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8586; GFX7-NEXT:    v_mov_b32_e32 v3, v2
8587; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8588; GFX7-NEXT:    s_cbranch_execnz .LBB31_1
8589; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
8590; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
8591; GFX7-NEXT:    s_setpc_b64 s[30:31]
; Element index 1023 of a half array is byte offset 1023 * 2 = 2046 (0x7fe).
8592  %gep = getelementptr half, ptr %ptr, i64 1023
; The result is deliberately left unused: this is the no-return variant.
8593  %unused = atomicrmw fsub ptr %gep, half %val seq_cst
8594  ret void
8595}
8596
8597; --------------------------------------------------------------------
8598; bfloat
8599; --------------------------------------------------------------------
8600
; Value-returning `atomicrmw fsub` on a bfloat through a flat pointer, at
; "agent" syncscope with seq_cst ordering and no address offset. The function
; returns the value produced by the atomicrmw (%result).
; In every expected-output block below the address is masked with -4, the
; containing dword is loaded, the updated 16-bit value is shifted back into
; place, and a flat cmpswap is repeated until the value it returns equals the
; value that was expected.
; The check lines are autogenerated (see the first line of this file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
8601define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 {
8602; GFX12-LABEL: flat_agent_atomic_fsub_ret_bf16:
8603; GFX12:       ; %bb.0:
8604; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
8605; GFX12-NEXT:    s_wait_expcnt 0x0
8606; GFX12-NEXT:    s_wait_samplecnt 0x0
8607; GFX12-NEXT:    s_wait_bvhcnt 0x0
8608; GFX12-NEXT:    s_wait_kmcnt 0x0
8609; GFX12-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
8610; GFX12-NEXT:    s_mov_b32 s0, 0
8611; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
8612; GFX12-NEXT:    v_and_b32_e32 v0, -4, v3
8613; GFX12-NEXT:    v_and_b32_e32 v3, 3, v3
8614; GFX12-NEXT:    flat_load_b32 v5, v[0:1]
8615; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8616; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
8617; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
8618; GFX12-NEXT:    v_not_b32_e32 v4, v4
8619; GFX12-NEXT:  .LBB32_1: ; %atomicrmw.start
8620; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
8621; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
8622; GFX12-NEXT:    v_mov_b32_e32 v6, v5
8623; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8624; GFX12-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
8625; GFX12-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
8626; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8627; GFX12-NEXT:    v_sub_f32_e32 v5, v5, v2
8628; GFX12-NEXT:    v_bfe_u32 v7, v5, 16, 1
8629; GFX12-NEXT:    v_or_b32_e32 v8, 0x400000, v5
8630; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
8631; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
8632; GFX12-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
8633; GFX12-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc_lo
8634; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8635; GFX12-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
8636; GFX12-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
8637; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
8638; GFX12-NEXT:    v_and_or_b32 v5, v6, v4, v5
8639; GFX12-NEXT:    s_wait_storecnt 0x0
8640; GFX12-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
8641; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
8642; GFX12-NEXT:    global_inv scope:SCOPE_DEV
8643; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
8644; GFX12-NEXT:    s_wait_alu 0xfffe
8645; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
8646; GFX12-NEXT:    s_wait_alu 0xfffe
8647; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
8648; GFX12-NEXT:    s_cbranch_execnz .LBB32_1
8649; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
8650; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
8651; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
8652; GFX12-NEXT:    s_wait_alu 0xfffe
8653; GFX12-NEXT:    s_setpc_b64 s[30:31]
8654;
8655; GFX940-LABEL: flat_agent_atomic_fsub_ret_bf16:
8656; GFX940:       ; %bb.0:
8657; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8658; GFX940-NEXT:    v_mov_b32_e32 v3, v0
8659; GFX940-NEXT:    v_and_b32_e32 v0, -4, v3
8660; GFX940-NEXT:    flat_load_dword v5, v[0:1]
8661; GFX940-NEXT:    v_and_b32_e32 v3, 3, v3
8662; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8663; GFX940-NEXT:    s_mov_b32 s0, 0xffff
8664; GFX940-NEXT:    v_lshlrev_b32_e64 v4, v3, s0
8665; GFX940-NEXT:    v_not_b32_e32 v4, v4
8666; GFX940-NEXT:    s_mov_b64 s[0:1], 0
8667; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
8668; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
8669; GFX940-NEXT:  .LBB32_1: ; %atomicrmw.start
8670; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
8671; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8672; GFX940-NEXT:    v_mov_b32_e32 v7, v5
8673; GFX940-NEXT:    v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
8674; GFX940-NEXT:    s_nop 0
8675; GFX940-NEXT:    v_sub_f32_e32 v5, v5, v2
8676; GFX940-NEXT:    v_bfe_u32 v6, v5, 16, 1
8677; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v5
8678; GFX940-NEXT:    v_add3_u32 v6, v6, v5, s2
8679; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
8680; GFX940-NEXT:    s_nop 1
8681; GFX940-NEXT:    v_cndmask_b32_e32 v5, v6, v8, vcc
8682; GFX940-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
8683; GFX940-NEXT:    v_and_or_b32 v6, v7, v4, v5
8684; GFX940-NEXT:    buffer_wbl2 sc1
8685; GFX940-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0
8686; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8687; GFX940-NEXT:    buffer_inv sc1
8688; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
8689; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
8690; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
8691; GFX940-NEXT:    s_cbranch_execnz .LBB32_1
8692; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
8693; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
8694; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
8695; GFX940-NEXT:    s_setpc_b64 s[30:31]
8696;
8697; GFX11-LABEL: flat_agent_atomic_fsub_ret_bf16:
8698; GFX11:       ; %bb.0:
8699; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8700; GFX11-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
8701; GFX11-NEXT:    s_mov_b32 s0, 0
8702; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
8703; GFX11-NEXT:    v_and_b32_e32 v0, -4, v3
8704; GFX11-NEXT:    v_and_b32_e32 v3, 3, v3
8705; GFX11-NEXT:    flat_load_b32 v5, v[0:1]
8706; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8707; GFX11-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
8708; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
8709; GFX11-NEXT:    v_not_b32_e32 v4, v4
8710; GFX11-NEXT:    .p2align 6
8711; GFX11-NEXT:  .LBB32_1: ; %atomicrmw.start
8712; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
8713; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8714; GFX11-NEXT:    v_mov_b32_e32 v6, v5
8715; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8716; GFX11-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
8717; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
8718; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8719; GFX11-NEXT:    v_sub_f32_e32 v5, v5, v2
8720; GFX11-NEXT:    v_bfe_u32 v7, v5, 16, 1
8721; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v5
8722; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
8723; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
8724; GFX11-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
8725; GFX11-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc_lo
8726; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8727; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
8728; GFX11-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
8729; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
8730; GFX11-NEXT:    v_and_or_b32 v5, v6, v4, v5
8731; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
8732; GFX11-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
8733; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8734; GFX11-NEXT:    buffer_gl1_inv
8735; GFX11-NEXT:    buffer_gl0_inv
8736; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
8737; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
8738; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
8739; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
8740; GFX11-NEXT:    s_cbranch_execnz .LBB32_1
8741; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
8742; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
8743; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
8744; GFX11-NEXT:    s_setpc_b64 s[30:31]
8745;
8746; GFX10-LABEL: flat_agent_atomic_fsub_ret_bf16:
8747; GFX10:       ; %bb.0:
8748; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8749; GFX10-NEXT:    v_mov_b32_e32 v3, v0
8750; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
8751; GFX10-NEXT:    s_mov_b32 s4, 0
8752; GFX10-NEXT:    v_and_b32_e32 v0, -4, v3
8753; GFX10-NEXT:    v_and_b32_e32 v3, 3, v3
8754; GFX10-NEXT:    flat_load_dword v5, v[0:1]
8755; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8756; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
8757; GFX10-NEXT:    v_not_b32_e32 v4, v4
8758; GFX10-NEXT:  .LBB32_1: ; %atomicrmw.start
8759; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
8760; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8761; GFX10-NEXT:    v_mov_b32_e32 v6, v5
8762; GFX10-NEXT:    v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
8763; GFX10-NEXT:    v_sub_f32_e32 v5, v5, v2
8764; GFX10-NEXT:    v_bfe_u32 v7, v5, 16, 1
8765; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v5
8766; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
8767; GFX10-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
8768; GFX10-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc_lo
8769; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
8770; GFX10-NEXT:    v_and_or_b32 v5, v6, v4, v5
8771; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
8772; GFX10-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
8773; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8774; GFX10-NEXT:    buffer_gl1_inv
8775; GFX10-NEXT:    buffer_gl0_inv
8776; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
8777; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
8778; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
8779; GFX10-NEXT:    s_cbranch_execnz .LBB32_1
8780; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
8781; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
8782; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
8783; GFX10-NEXT:    s_setpc_b64 s[30:31]
8784;
8785; GFX90A-LABEL: flat_agent_atomic_fsub_ret_bf16:
8786; GFX90A:       ; %bb.0:
8787; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8788; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
8789; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v3
8790; GFX90A-NEXT:    flat_load_dword v5, v[0:1]
8791; GFX90A-NEXT:    v_and_b32_e32 v3, 3, v3
8792; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8793; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
8794; GFX90A-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
8795; GFX90A-NEXT:    v_not_b32_e32 v4, v4
8796; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
8797; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
8798; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
8799; GFX90A-NEXT:  .LBB32_1: ; %atomicrmw.start
8800; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
8801; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8802; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
8803; GFX90A-NEXT:    v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
8804; GFX90A-NEXT:    v_sub_f32_e32 v5, v5, v2
8805; GFX90A-NEXT:    v_bfe_u32 v6, v5, 16, 1
8806; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v5
8807; GFX90A-NEXT:    v_add3_u32 v6, v6, v5, s6
8808; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
8809; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v6, v8, vcc
8810; GFX90A-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
8811; GFX90A-NEXT:    v_and_or_b32 v6, v7, v4, v5
8812; GFX90A-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
8813; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8814; GFX90A-NEXT:    buffer_wbinvl1
8815; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
8816; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8817; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8818; GFX90A-NEXT:    s_cbranch_execnz .LBB32_1
8819; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
8820; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
8821; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
8822; GFX90A-NEXT:    s_setpc_b64 s[30:31]
8823;
8824; GFX908-LABEL: flat_agent_atomic_fsub_ret_bf16:
8825; GFX908:       ; %bb.0:
8826; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8827; GFX908-NEXT:    v_mov_b32_e32 v3, v0
8828; GFX908-NEXT:    v_and_b32_e32 v0, -4, v3
8829; GFX908-NEXT:    flat_load_dword v5, v[0:1]
8830; GFX908-NEXT:    v_and_b32_e32 v3, 3, v3
8831; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8832; GFX908-NEXT:    s_mov_b32 s4, 0xffff
8833; GFX908-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
8834; GFX908-NEXT:    v_not_b32_e32 v4, v4
8835; GFX908-NEXT:    s_mov_b64 s[4:5], 0
8836; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
8837; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
8838; GFX908-NEXT:  .LBB32_1: ; %atomicrmw.start
8839; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
8840; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8841; GFX908-NEXT:    v_mov_b32_e32 v6, v5
8842; GFX908-NEXT:    v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
8843; GFX908-NEXT:    v_sub_f32_e32 v5, v5, v2
8844; GFX908-NEXT:    v_bfe_u32 v7, v5, 16, 1
8845; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v5
8846; GFX908-NEXT:    v_add3_u32 v7, v7, v5, s6
8847; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
8848; GFX908-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc
8849; GFX908-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
8850; GFX908-NEXT:    v_and_or_b32 v5, v6, v4, v5
8851; GFX908-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
8852; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8853; GFX908-NEXT:    buffer_wbinvl1
8854; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
8855; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8856; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8857; GFX908-NEXT:    s_cbranch_execnz .LBB32_1
8858; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
8859; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
8860; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
8861; GFX908-NEXT:    s_setpc_b64 s[30:31]
8862;
8863; GFX8-LABEL: flat_agent_atomic_fsub_ret_bf16:
8864; GFX8:       ; %bb.0:
8865; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8866; GFX8-NEXT:    v_mov_b32_e32 v3, v0
8867; GFX8-NEXT:    v_and_b32_e32 v0, -4, v3
8868; GFX8-NEXT:    flat_load_dword v5, v[0:1]
8869; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
8870; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8871; GFX8-NEXT:    s_mov_b32 s4, 0xffff
8872; GFX8-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
8873; GFX8-NEXT:    v_not_b32_e32 v4, v4
8874; GFX8-NEXT:    s_mov_b64 s[4:5], 0
8875; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
8876; GFX8-NEXT:  .LBB32_1: ; %atomicrmw.start
8877; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
8878; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8879; GFX8-NEXT:    v_mov_b32_e32 v6, v5
8880; GFX8-NEXT:    v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
8881; GFX8-NEXT:    v_sub_f32_e32 v5, v5, v2
8882; GFX8-NEXT:    v_bfe_u32 v8, v5, 16, 1
8883; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v5
8884; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x7fff, v8
8885; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v5
8886; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
8887; GFX8-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
8888; GFX8-NEXT:    v_and_b32_e32 v7, v6, v4
8889; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
8890; GFX8-NEXT:    v_or_b32_e32 v5, v7, v5
8891; GFX8-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
8892; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8893; GFX8-NEXT:    buffer_wbinvl1
8894; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
8895; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8896; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8897; GFX8-NEXT:    s_cbranch_execnz .LBB32_1
8898; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
8899; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
8900; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
8901; GFX8-NEXT:    s_setpc_b64 s[30:31]
8902;
8903; GFX7-LABEL: flat_agent_atomic_fsub_ret_bf16:
8904; GFX7:       ; %bb.0:
8905; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8906; GFX7-NEXT:    v_mov_b32_e32 v3, v0
8907; GFX7-NEXT:    v_and_b32_e32 v0, -4, v3
8908; GFX7-NEXT:    flat_load_dword v5, v[0:1]
8909; GFX7-NEXT:    v_and_b32_e32 v3, 3, v3
8910; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8911; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v3
8912; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
8913; GFX7-NEXT:    v_not_b32_e32 v4, v4
8914; GFX7-NEXT:    s_mov_b64 s[4:5], 0
8915; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
8916; GFX7-NEXT:  .LBB32_1: ; %atomicrmw.start
8917; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
8918; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8919; GFX7-NEXT:    v_mov_b32_e32 v6, v5
8920; GFX7-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
8921; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
8922; GFX7-NEXT:    v_sub_f32_e32 v5, v5, v2
8923; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
8924; GFX7-NEXT:    v_and_b32_e32 v7, v6, v4
8925; GFX7-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
8926; GFX7-NEXT:    v_or_b32_e32 v5, v7, v5
8927; GFX7-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
8928; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8929; GFX7-NEXT:    buffer_wbinvl1
8930; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
8931; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8932; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8933; GFX7-NEXT:    s_cbranch_execnz .LBB32_1
8934; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
8935; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
8936; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
8937; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
8938; GFX7-NEXT:    s_setpc_b64 s[30:31]
8939  %result = atomicrmw fsub ptr %ptr, bfloat %val syncscope("agent") seq_cst
8940  ret bfloat %result
8941}
8942
; Value-returning `atomicrmw fsub` on a bfloat through a flat pointer, at
; "agent" syncscope with seq_cst ordering, where the address is %ptr advanced
; by a positive constant element index (1023). The function returns the value
; produced by the atomicrmw (%result).
; In every expected-output block below the constant 0x7fe is added to the
; incoming pointer first; the sum is then masked with -4, the containing dword
; is loaded, and a flat cmpswap is repeated until the value it returns equals
; the value that was expected.
; The check lines are autogenerated (see the first line of this file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
8943define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat %val) #0 {
8944; GFX12-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos:
8945; GFX12:       ; %bb.0:
8946; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
8947; GFX12-NEXT:    s_wait_expcnt 0x0
8948; GFX12-NEXT:    s_wait_samplecnt 0x0
8949; GFX12-NEXT:    s_wait_bvhcnt 0x0
8950; GFX12-NEXT:    s_wait_kmcnt 0x0
8951; GFX12-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
8952; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
8953; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
8954; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
8955; GFX12-NEXT:    v_and_b32_e32 v0, -4, v3
8956; GFX12-NEXT:    v_and_b32_e32 v3, 3, v3
8957; GFX12-NEXT:    s_mov_b32 s0, 0
8958; GFX12-NEXT:    flat_load_b32 v5, v[0:1]
8959; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8960; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
8961; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
8962; GFX12-NEXT:    v_not_b32_e32 v4, v4
8963; GFX12-NEXT:  .LBB33_1: ; %atomicrmw.start
8964; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
8965; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
8966; GFX12-NEXT:    v_mov_b32_e32 v6, v5
8967; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8968; GFX12-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
8969; GFX12-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
8970; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8971; GFX12-NEXT:    v_sub_f32_e32 v5, v5, v2
8972; GFX12-NEXT:    v_bfe_u32 v7, v5, 16, 1
8973; GFX12-NEXT:    v_or_b32_e32 v8, 0x400000, v5
8974; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
8975; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
8976; GFX12-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
8977; GFX12-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc_lo
8978; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8979; GFX12-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
8980; GFX12-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
8981; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
8982; GFX12-NEXT:    v_and_or_b32 v5, v6, v4, v5
8983; GFX12-NEXT:    s_wait_storecnt 0x0
8984; GFX12-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
8985; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
8986; GFX12-NEXT:    global_inv scope:SCOPE_DEV
8987; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
8988; GFX12-NEXT:    s_wait_alu 0xfffe
8989; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
8990; GFX12-NEXT:    s_wait_alu 0xfffe
8991; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
8992; GFX12-NEXT:    s_cbranch_execnz .LBB33_1
8993; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
8994; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
8995; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
8996; GFX12-NEXT:    s_wait_alu 0xfffe
8997; GFX12-NEXT:    s_setpc_b64 s[30:31]
8998;
8999; GFX940-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos:
9000; GFX940:       ; %bb.0:
9001; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9002; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
9003; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
9004; GFX940-NEXT:    v_and_b32_e32 v0, -4, v4
9005; GFX940-NEXT:    v_mov_b32_e32 v1, v5
9006; GFX940-NEXT:    flat_load_dword v5, v[0:1]
9007; GFX940-NEXT:    v_and_b32_e32 v3, 3, v4
9008; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9009; GFX940-NEXT:    s_mov_b32 s0, 0xffff
9010; GFX940-NEXT:    v_lshlrev_b32_e64 v4, v3, s0
9011; GFX940-NEXT:    v_not_b32_e32 v4, v4
9012; GFX940-NEXT:    s_mov_b64 s[0:1], 0
9013; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
9014; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
9015; GFX940-NEXT:  .LBB33_1: ; %atomicrmw.start
9016; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
9017; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9018; GFX940-NEXT:    v_mov_b32_e32 v7, v5
9019; GFX940-NEXT:    v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
9020; GFX940-NEXT:    s_nop 0
9021; GFX940-NEXT:    v_sub_f32_e32 v5, v5, v2
9022; GFX940-NEXT:    v_bfe_u32 v6, v5, 16, 1
9023; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v5
9024; GFX940-NEXT:    v_add3_u32 v6, v6, v5, s2
9025; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
9026; GFX940-NEXT:    s_nop 1
9027; GFX940-NEXT:    v_cndmask_b32_e32 v5, v6, v8, vcc
9028; GFX940-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
9029; GFX940-NEXT:    v_and_or_b32 v6, v7, v4, v5
9030; GFX940-NEXT:    buffer_wbl2 sc1
9031; GFX940-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0
9032; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9033; GFX940-NEXT:    buffer_inv sc1
9034; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
9035; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
9036; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
9037; GFX940-NEXT:    s_cbranch_execnz .LBB33_1
9038; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
9039; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
9040; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
9041; GFX940-NEXT:    s_setpc_b64 s[30:31]
9042;
9043; GFX11-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos:
9044; GFX11:       ; %bb.0:
9045; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9046; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
9047; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
9048; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
9049; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
9050; GFX11-NEXT:    v_and_b32_e32 v0, -4, v3
9051; GFX11-NEXT:    v_and_b32_e32 v3, 3, v3
9052; GFX11-NEXT:    s_mov_b32 s0, 0
9053; GFX11-NEXT:    flat_load_b32 v5, v[0:1]
9054; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9055; GFX11-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
9056; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
9057; GFX11-NEXT:    v_not_b32_e32 v4, v4
9058; GFX11-NEXT:    .p2align 6
9059; GFX11-NEXT:  .LBB33_1: ; %atomicrmw.start
9060; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
9061; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9062; GFX11-NEXT:    v_mov_b32_e32 v6, v5
9063; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9064; GFX11-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
9065; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
9066; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9067; GFX11-NEXT:    v_sub_f32_e32 v5, v5, v2
9068; GFX11-NEXT:    v_bfe_u32 v7, v5, 16, 1
9069; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v5
9070; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
9071; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
9072; GFX11-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
9073; GFX11-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc_lo
9074; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9075; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
9076; GFX11-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
9077; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
9078; GFX11-NEXT:    v_and_or_b32 v5, v6, v4, v5
9079; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
9080; GFX11-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
9081; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9082; GFX11-NEXT:    buffer_gl1_inv
9083; GFX11-NEXT:    buffer_gl0_inv
9084; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
9085; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
9086; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
9087; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
9088; GFX11-NEXT:    s_cbranch_execnz .LBB33_1
9089; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
9090; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
9091; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
9092; GFX11-NEXT:    s_setpc_b64 s[30:31]
9093;
9094; GFX10-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos:
9095; GFX10:       ; %bb.0:
9096; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9097; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
9098; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
9099; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
9100; GFX10-NEXT:    v_and_b32_e32 v0, -4, v3
9101; GFX10-NEXT:    v_and_b32_e32 v3, 3, v3
9102; GFX10-NEXT:    s_mov_b32 s4, 0
9103; GFX10-NEXT:    flat_load_dword v5, v[0:1]
9104; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9105; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
9106; GFX10-NEXT:    v_not_b32_e32 v4, v4
9107; GFX10-NEXT:  .LBB33_1: ; %atomicrmw.start
9108; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
9109; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9110; GFX10-NEXT:    v_mov_b32_e32 v6, v5
9111; GFX10-NEXT:    v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
9112; GFX10-NEXT:    v_sub_f32_e32 v5, v5, v2
9113; GFX10-NEXT:    v_bfe_u32 v7, v5, 16, 1
9114; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v5
9115; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
9116; GFX10-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
9117; GFX10-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc_lo
9118; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
9119; GFX10-NEXT:    v_and_or_b32 v5, v6, v4, v5
9120; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
9121; GFX10-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
9122; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9123; GFX10-NEXT:    buffer_gl1_inv
9124; GFX10-NEXT:    buffer_gl0_inv
9125; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
9126; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
9127; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
9128; GFX10-NEXT:    s_cbranch_execnz .LBB33_1
9129; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
9130; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
9131; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
9132; GFX10-NEXT:    s_setpc_b64 s[30:31]
9133;
9134; GFX90A-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos:
9135; GFX90A:       ; %bb.0:
9136; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9137; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
9138; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
9139; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v3
9140; GFX90A-NEXT:    flat_load_dword v5, v[0:1]
9141; GFX90A-NEXT:    v_and_b32_e32 v3, 3, v3
9142; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9143; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
9144; GFX90A-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
9145; GFX90A-NEXT:    v_not_b32_e32 v4, v4
9146; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
9147; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
9148; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
9149; GFX90A-NEXT:  .LBB33_1: ; %atomicrmw.start
9150; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
9151; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9152; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
9153; GFX90A-NEXT:    v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
9154; GFX90A-NEXT:    v_sub_f32_e32 v5, v5, v2
9155; GFX90A-NEXT:    v_bfe_u32 v6, v5, 16, 1
9156; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v5
9157; GFX90A-NEXT:    v_add3_u32 v6, v6, v5, s6
9158; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
9159; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v6, v8, vcc
9160; GFX90A-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
9161; GFX90A-NEXT:    v_and_or_b32 v6, v7, v4, v5
9162; GFX90A-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
9163; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9164; GFX90A-NEXT:    buffer_wbinvl1
9165; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
9166; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
9167; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
9168; GFX90A-NEXT:    s_cbranch_execnz .LBB33_1
9169; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
9170; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
9171; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
9172; GFX90A-NEXT:    s_setpc_b64 s[30:31]
9173;
9174; GFX908-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos:
9175; GFX908:       ; %bb.0:
9176; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9177; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
9178; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
9179; GFX908-NEXT:    v_and_b32_e32 v0, -4, v3
9180; GFX908-NEXT:    flat_load_dword v5, v[0:1]
9181; GFX908-NEXT:    v_and_b32_e32 v3, 3, v3
9182; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9183; GFX908-NEXT:    s_mov_b32 s4, 0xffff
9184; GFX908-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
9185; GFX908-NEXT:    v_not_b32_e32 v4, v4
9186; GFX908-NEXT:    s_mov_b64 s[4:5], 0
9187; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
9188; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
9189; GFX908-NEXT:  .LBB33_1: ; %atomicrmw.start
9190; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
9191; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9192; GFX908-NEXT:    v_mov_b32_e32 v6, v5
9193; GFX908-NEXT:    v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
9194; GFX908-NEXT:    v_sub_f32_e32 v5, v5, v2
9195; GFX908-NEXT:    v_bfe_u32 v7, v5, 16, 1
9196; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v5
9197; GFX908-NEXT:    v_add3_u32 v7, v7, v5, s6
9198; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
9199; GFX908-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc
9200; GFX908-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
9201; GFX908-NEXT:    v_and_or_b32 v5, v6, v4, v5
9202; GFX908-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
9203; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9204; GFX908-NEXT:    buffer_wbinvl1
9205; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
9206; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
9207; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
9208; GFX908-NEXT:    s_cbranch_execnz .LBB33_1
9209; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
9210; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
9211; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
9212; GFX908-NEXT:    s_setpc_b64 s[30:31]
9213;
9214; GFX8-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos:
9215; GFX8:       ; %bb.0:
9216; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9217; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
9218; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
9219; GFX8-NEXT:    v_and_b32_e32 v0, -4, v3
9220; GFX8-NEXT:    flat_load_dword v5, v[0:1]
9221; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
9222; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9223; GFX8-NEXT:    s_mov_b32 s4, 0xffff
9224; GFX8-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
9225; GFX8-NEXT:    v_not_b32_e32 v4, v4
9226; GFX8-NEXT:    s_mov_b64 s[4:5], 0
9227; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
9228; GFX8-NEXT:  .LBB33_1: ; %atomicrmw.start
9229; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
9230; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9231; GFX8-NEXT:    v_mov_b32_e32 v6, v5
9232; GFX8-NEXT:    v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
9233; GFX8-NEXT:    v_sub_f32_e32 v5, v5, v2
9234; GFX8-NEXT:    v_bfe_u32 v8, v5, 16, 1
9235; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v5
9236; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x7fff, v8
9237; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v5
9238; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
9239; GFX8-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
9240; GFX8-NEXT:    v_and_b32_e32 v7, v6, v4
9241; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
9242; GFX8-NEXT:    v_or_b32_e32 v5, v7, v5
9243; GFX8-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
9244; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9245; GFX8-NEXT:    buffer_wbinvl1
9246; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
9247; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
9248; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
9249; GFX8-NEXT:    s_cbranch_execnz .LBB33_1
9250; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
9251; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
9252; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
9253; GFX8-NEXT:    s_setpc_b64 s[30:31]
9254;
9255; GFX7-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos:
9256; GFX7:       ; %bb.0:
9257; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9258; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 0x7fe, v0
9259; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
9260; GFX7-NEXT:    v_and_b32_e32 v0, -4, v3
9261; GFX7-NEXT:    flat_load_dword v5, v[0:1]
9262; GFX7-NEXT:    v_and_b32_e32 v3, 3, v3
9263; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9264; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v3
9265; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
9266; GFX7-NEXT:    v_not_b32_e32 v4, v4
9267; GFX7-NEXT:    s_mov_b64 s[4:5], 0
9268; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
9269; GFX7-NEXT:  .LBB33_1: ; %atomicrmw.start
9270; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
9271; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9272; GFX7-NEXT:    v_mov_b32_e32 v6, v5
9273; GFX7-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
9274; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
9275; GFX7-NEXT:    v_sub_f32_e32 v5, v5, v2
9276; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
9277; GFX7-NEXT:    v_and_b32_e32 v7, v6, v4
9278; GFX7-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
9279; GFX7-NEXT:    v_or_b32_e32 v5, v7, v5
9280; GFX7-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
9281; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9282; GFX7-NEXT:    buffer_wbinvl1
9283; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
9284; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
9285; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
9286; GFX7-NEXT:    s_cbranch_execnz .LBB33_1
9287; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
9288; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
9289; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
9290; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
9291; GFX7-NEXT:    s_setpc_b64 s[30:31]
; Element index 1023 with 2-byte bfloat elements is 2046 = 0x7fe bytes, which
; is the constant every expected-output block above adds to the pointer.
9292  %gep = getelementptr bfloat, ptr %ptr, i64 1023
9293  %result = atomicrmw fsub ptr %gep, bfloat %val syncscope("agent") seq_cst
9294  ret bfloat %result
9295}
9296
; Value-returning bfloat `atomicrmw fsub` through a flat pointer with
; syncscope("agent") seq_cst ordering, addressed 1024 bfloat elements below
; %ptr. The recorded output for every RUN target is a compare-and-swap retry
; loop: the address is rounded down with `and -4`, the low two address bits
; multiplied by 8 give the bit shift of the 16-bit lane inside the loaded
; dword, and flat_atomic_cmpswap is retried until the dword it returns equals
; the expected one. The function result in v0 is that returned dword shifted
; right by the lane shift (the hawaii run additionally shifts it left by 16).
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
9297define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat %val) #0 {
9298; GFX12-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg:
9299; GFX12:       ; %bb.0:
9300; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
9301; GFX12-NEXT:    s_wait_expcnt 0x0
9302; GFX12-NEXT:    s_wait_samplecnt 0x0
9303; GFX12-NEXT:    s_wait_bvhcnt 0x0
9304; GFX12-NEXT:    s_wait_kmcnt 0x0
9305; GFX12-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
9306; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
9307; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
9308; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
9309; GFX12-NEXT:    v_and_b32_e32 v0, -4, v3
9310; GFX12-NEXT:    v_and_b32_e32 v3, 3, v3
9311; GFX12-NEXT:    s_mov_b32 s0, 0
9312; GFX12-NEXT:    flat_load_b32 v5, v[0:1]
9313; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9314; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
9315; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
9316; GFX12-NEXT:    v_not_b32_e32 v4, v4
9317; GFX12-NEXT:  .LBB34_1: ; %atomicrmw.start
9318; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
9319; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
9320; GFX12-NEXT:    v_mov_b32_e32 v6, v5
9321; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9322; GFX12-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
9323; GFX12-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
9324; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9325; GFX12-NEXT:    v_sub_f32_e32 v5, v5, v2
9326; GFX12-NEXT:    v_bfe_u32 v7, v5, 16, 1
9327; GFX12-NEXT:    v_or_b32_e32 v8, 0x400000, v5
9328; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
9329; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
9330; GFX12-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
9331; GFX12-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc_lo
9332; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9333; GFX12-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
9334; GFX12-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
9335; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
9336; GFX12-NEXT:    v_and_or_b32 v5, v6, v4, v5
9337; GFX12-NEXT:    s_wait_storecnt 0x0
9338; GFX12-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
9339; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
9340; GFX12-NEXT:    global_inv scope:SCOPE_DEV
9341; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
9342; GFX12-NEXT:    s_wait_alu 0xfffe
9343; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
9344; GFX12-NEXT:    s_wait_alu 0xfffe
9345; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
9346; GFX12-NEXT:    s_cbranch_execnz .LBB34_1
9347; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
9348; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
9349; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
9350; GFX12-NEXT:    s_wait_alu 0xfffe
9351; GFX12-NEXT:    s_setpc_b64 s[30:31]
9352;
9353; GFX940-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg:
9354; GFX940:       ; %bb.0:
9355; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9356; GFX940-NEXT:    s_movk_i32 s0, 0xf800
9357; GFX940-NEXT:    s_mov_b32 s1, -1
9358; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
9359; GFX940-NEXT:    v_and_b32_e32 v0, -4, v4
9360; GFX940-NEXT:    v_mov_b32_e32 v1, v5
9361; GFX940-NEXT:    flat_load_dword v5, v[0:1]
9362; GFX940-NEXT:    v_and_b32_e32 v3, 3, v4
9363; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9364; GFX940-NEXT:    s_mov_b32 s0, 0xffff
9365; GFX940-NEXT:    v_lshlrev_b32_e64 v4, v3, s0
9366; GFX940-NEXT:    v_not_b32_e32 v4, v4
9367; GFX940-NEXT:    s_mov_b64 s[0:1], 0
9368; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
9369; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
9370; GFX940-NEXT:  .LBB34_1: ; %atomicrmw.start
9371; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
9372; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9373; GFX940-NEXT:    v_mov_b32_e32 v7, v5
9374; GFX940-NEXT:    v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
9375; GFX940-NEXT:    s_nop 0
9376; GFX940-NEXT:    v_sub_f32_e32 v5, v5, v2
9377; GFX940-NEXT:    v_bfe_u32 v6, v5, 16, 1
9378; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v5
9379; GFX940-NEXT:    v_add3_u32 v6, v6, v5, s2
9380; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
9381; GFX940-NEXT:    s_nop 1
9382; GFX940-NEXT:    v_cndmask_b32_e32 v5, v6, v8, vcc
9383; GFX940-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
9384; GFX940-NEXT:    v_and_or_b32 v6, v7, v4, v5
9385; GFX940-NEXT:    buffer_wbl2 sc1
9386; GFX940-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0
9387; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9388; GFX940-NEXT:    buffer_inv sc1
9389; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
9390; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
9391; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
9392; GFX940-NEXT:    s_cbranch_execnz .LBB34_1
9393; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
9394; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
9395; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
9396; GFX940-NEXT:    s_setpc_b64 s[30:31]
9397;
9398; GFX11-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg:
9399; GFX11:       ; %bb.0:
9400; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9401; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
9402; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
9403; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
9404; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
9405; GFX11-NEXT:    v_and_b32_e32 v0, -4, v3
9406; GFX11-NEXT:    v_and_b32_e32 v3, 3, v3
9407; GFX11-NEXT:    s_mov_b32 s0, 0
9408; GFX11-NEXT:    flat_load_b32 v5, v[0:1]
9409; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9410; GFX11-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
9411; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
9412; GFX11-NEXT:    v_not_b32_e32 v4, v4
9413; GFX11-NEXT:    .p2align 6
9414; GFX11-NEXT:  .LBB34_1: ; %atomicrmw.start
9415; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
9416; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9417; GFX11-NEXT:    v_mov_b32_e32 v6, v5
9418; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9419; GFX11-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
9420; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
9421; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9422; GFX11-NEXT:    v_sub_f32_e32 v5, v5, v2
9423; GFX11-NEXT:    v_bfe_u32 v7, v5, 16, 1
9424; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v5
9425; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
9426; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
9427; GFX11-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
9428; GFX11-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc_lo
9429; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9430; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
9431; GFX11-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
9432; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
9433; GFX11-NEXT:    v_and_or_b32 v5, v6, v4, v5
9434; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
9435; GFX11-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
9436; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9437; GFX11-NEXT:    buffer_gl1_inv
9438; GFX11-NEXT:    buffer_gl0_inv
9439; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
9440; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
9441; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
9442; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
9443; GFX11-NEXT:    s_cbranch_execnz .LBB34_1
9444; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
9445; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
9446; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
9447; GFX11-NEXT:    s_setpc_b64 s[30:31]
9448;
9449; GFX10-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg:
9450; GFX10:       ; %bb.0:
9451; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9452; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
9453; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
9454; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
9455; GFX10-NEXT:    v_and_b32_e32 v0, -4, v3
9456; GFX10-NEXT:    v_and_b32_e32 v3, 3, v3
9457; GFX10-NEXT:    s_mov_b32 s4, 0
9458; GFX10-NEXT:    flat_load_dword v5, v[0:1]
9459; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9460; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
9461; GFX10-NEXT:    v_not_b32_e32 v4, v4
9462; GFX10-NEXT:  .LBB34_1: ; %atomicrmw.start
9463; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
9464; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9465; GFX10-NEXT:    v_mov_b32_e32 v6, v5
9466; GFX10-NEXT:    v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
9467; GFX10-NEXT:    v_sub_f32_e32 v5, v5, v2
9468; GFX10-NEXT:    v_bfe_u32 v7, v5, 16, 1
9469; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v5
9470; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
9471; GFX10-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
9472; GFX10-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc_lo
9473; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
9474; GFX10-NEXT:    v_and_or_b32 v5, v6, v4, v5
9475; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
9476; GFX10-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
9477; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9478; GFX10-NEXT:    buffer_gl1_inv
9479; GFX10-NEXT:    buffer_gl0_inv
9480; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
9481; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
9482; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
9483; GFX10-NEXT:    s_cbranch_execnz .LBB34_1
9484; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
9485; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
9486; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
9487; GFX10-NEXT:    s_setpc_b64 s[30:31]
9488;
9489; GFX90A-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg:
9490; GFX90A:       ; %bb.0:
9491; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9492; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
9493; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
9494; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v3
9495; GFX90A-NEXT:    flat_load_dword v5, v[0:1]
9496; GFX90A-NEXT:    v_and_b32_e32 v3, 3, v3
9497; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9498; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
9499; GFX90A-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
9500; GFX90A-NEXT:    v_not_b32_e32 v4, v4
9501; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
9502; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
9503; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
9504; GFX90A-NEXT:  .LBB34_1: ; %atomicrmw.start
9505; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
9506; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9507; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
9508; GFX90A-NEXT:    v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
9509; GFX90A-NEXT:    v_sub_f32_e32 v5, v5, v2
9510; GFX90A-NEXT:    v_bfe_u32 v6, v5, 16, 1
9511; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v5
9512; GFX90A-NEXT:    v_add3_u32 v6, v6, v5, s6
9513; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
9514; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v6, v8, vcc
9515; GFX90A-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
9516; GFX90A-NEXT:    v_and_or_b32 v6, v7, v4, v5
9517; GFX90A-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
9518; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9519; GFX90A-NEXT:    buffer_wbinvl1
9520; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
9521; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
9522; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
9523; GFX90A-NEXT:    s_cbranch_execnz .LBB34_1
9524; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
9525; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
9526; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
9527; GFX90A-NEXT:    s_setpc_b64 s[30:31]
9528;
9529; GFX908-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg:
9530; GFX908:       ; %bb.0:
9531; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9532; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
9533; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
9534; GFX908-NEXT:    v_and_b32_e32 v0, -4, v3
9535; GFX908-NEXT:    flat_load_dword v5, v[0:1]
9536; GFX908-NEXT:    v_and_b32_e32 v3, 3, v3
9537; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9538; GFX908-NEXT:    s_mov_b32 s4, 0xffff
9539; GFX908-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
9540; GFX908-NEXT:    v_not_b32_e32 v4, v4
9541; GFX908-NEXT:    s_mov_b64 s[4:5], 0
9542; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
9543; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
9544; GFX908-NEXT:  .LBB34_1: ; %atomicrmw.start
9545; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
9546; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9547; GFX908-NEXT:    v_mov_b32_e32 v6, v5
9548; GFX908-NEXT:    v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
9549; GFX908-NEXT:    v_sub_f32_e32 v5, v5, v2
9550; GFX908-NEXT:    v_bfe_u32 v7, v5, 16, 1
9551; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v5
9552; GFX908-NEXT:    v_add3_u32 v7, v7, v5, s6
9553; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
9554; GFX908-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc
9555; GFX908-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
9556; GFX908-NEXT:    v_and_or_b32 v5, v6, v4, v5
9557; GFX908-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
9558; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9559; GFX908-NEXT:    buffer_wbinvl1
9560; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
9561; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
9562; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
9563; GFX908-NEXT:    s_cbranch_execnz .LBB34_1
9564; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
9565; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
9566; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
9567; GFX908-NEXT:    s_setpc_b64 s[30:31]
9568;
9569; GFX8-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg:
9570; GFX8:       ; %bb.0:
9571; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9572; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
9573; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
9574; GFX8-NEXT:    v_and_b32_e32 v0, -4, v3
9575; GFX8-NEXT:    flat_load_dword v5, v[0:1]
9576; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
9577; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9578; GFX8-NEXT:    s_mov_b32 s4, 0xffff
9579; GFX8-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
9580; GFX8-NEXT:    v_not_b32_e32 v4, v4
9581; GFX8-NEXT:    s_mov_b64 s[4:5], 0
9582; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
9583; GFX8-NEXT:  .LBB34_1: ; %atomicrmw.start
9584; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
9585; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9586; GFX8-NEXT:    v_mov_b32_e32 v6, v5
9587; GFX8-NEXT:    v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
9588; GFX8-NEXT:    v_sub_f32_e32 v5, v5, v2
9589; GFX8-NEXT:    v_bfe_u32 v8, v5, 16, 1
9590; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v5
9591; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x7fff, v8
9592; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v5
9593; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
9594; GFX8-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
9595; GFX8-NEXT:    v_and_b32_e32 v7, v6, v4
9596; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
9597; GFX8-NEXT:    v_or_b32_e32 v5, v7, v5
9598; GFX8-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
9599; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9600; GFX8-NEXT:    buffer_wbinvl1
9601; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
9602; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
9603; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
9604; GFX8-NEXT:    s_cbranch_execnz .LBB34_1
9605; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
9606; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
9607; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
9608; GFX8-NEXT:    s_setpc_b64 s[30:31]
9609;
9610; GFX7-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg:
9611; GFX7:       ; %bb.0:
9612; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9613; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 0xfffff800, v0
9614; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
9615; GFX7-NEXT:    v_and_b32_e32 v0, -4, v3
9616; GFX7-NEXT:    flat_load_dword v5, v[0:1]
9617; GFX7-NEXT:    v_and_b32_e32 v3, 3, v3
9618; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9619; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v3
9620; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
9621; GFX7-NEXT:    v_not_b32_e32 v4, v4
9622; GFX7-NEXT:    s_mov_b64 s[4:5], 0
9623; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
9624; GFX7-NEXT:  .LBB34_1: ; %atomicrmw.start
9625; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
9626; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9627; GFX7-NEXT:    v_mov_b32_e32 v6, v5
9628; GFX7-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
9629; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
9630; GFX7-NEXT:    v_sub_f32_e32 v5, v5, v2
9631; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
9632; GFX7-NEXT:    v_and_b32_e32 v7, v6, v4
9633; GFX7-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
9634; GFX7-NEXT:    v_or_b32_e32 v5, v7, v5
9635; GFX7-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
9636; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9637; GFX7-NEXT:    buffer_wbinvl1
9638; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
9639; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
9640; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
9641; GFX7-NEXT:    s_cbranch_execnz .LBB34_1
9642; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
9643; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
9644; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
9645; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
9646; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; An index of -1024 bfloat elements is the negative byte displacement that
  ; appears in the checks above as the 0xfffff800 low-half addend paired with
  ; a -1 high-half addend (0xf800 / -1 in the gfx940 scalar pair).
9647  %gep = getelementptr bfloat, ptr %ptr, i64 -1024
  ; atomicrmw yields the value that was in memory before the subtraction;
  ; that previous value is what the function returns.
9648  %result = atomicrmw fsub ptr %gep, bfloat %val syncscope("agent") seq_cst
9649  ret bfloat %result
9650 }
9651
9652define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 {
9653; GFX12-LABEL: flat_agent_atomic_fsub_noret_bf16:
9654; GFX12:       ; %bb.0:
9655; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
9656; GFX12-NEXT:    s_wait_expcnt 0x0
9657; GFX12-NEXT:    s_wait_samplecnt 0x0
9658; GFX12-NEXT:    s_wait_bvhcnt 0x0
9659; GFX12-NEXT:    s_wait_kmcnt 0x0
9660; GFX12-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
9661; GFX12-NEXT:    s_mov_b32 s0, 0
9662; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
9663; GFX12-NEXT:    v_and_b32_e32 v0, -4, v3
9664; GFX12-NEXT:    v_and_b32_e32 v3, 3, v3
9665; GFX12-NEXT:    flat_load_b32 v4, v[0:1]
9666; GFX12-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
9667; GFX12-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
9668; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
9669; GFX12-NEXT:    v_not_b32_e32 v6, v3
9670; GFX12-NEXT:  .LBB35_1: ; %atomicrmw.start
9671; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
9672; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
9673; GFX12-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
9674; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9675; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
9676; GFX12-NEXT:    v_sub_f32_e32 v3, v3, v2
9677; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
9678; GFX12-NEXT:    v_bfe_u32 v7, v3, 16, 1
9679; GFX12-NEXT:    v_or_b32_e32 v8, 0x400000, v3
9680; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
9681; GFX12-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
9682; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9683; GFX12-NEXT:    v_cndmask_b32_e32 v3, v7, v8, vcc_lo
9684; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
9685; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9686; GFX12-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
9687; GFX12-NEXT:    v_and_or_b32 v3, v4, v6, v3
9688; GFX12-NEXT:    s_wait_storecnt 0x0
9689; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
9690; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
9691; GFX12-NEXT:    global_inv scope:SCOPE_DEV
9692; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
9693; GFX12-NEXT:    v_mov_b32_e32 v4, v3
9694; GFX12-NEXT:    s_wait_alu 0xfffe
9695; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
9696; GFX12-NEXT:    s_wait_alu 0xfffe
9697; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
9698; GFX12-NEXT:    s_cbranch_execnz .LBB35_1
9699; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
9700; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
9701; GFX12-NEXT:    s_wait_alu 0xfffe
9702; GFX12-NEXT:    s_setpc_b64 s[30:31]
9703;
9704; GFX940-LABEL: flat_agent_atomic_fsub_noret_bf16:
9705; GFX940:       ; %bb.0:
9706; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9707; GFX940-NEXT:    v_mov_b32_e32 v3, v0
9708; GFX940-NEXT:    v_and_b32_e32 v0, -4, v3
9709; GFX940-NEXT:    flat_load_dword v5, v[0:1]
9710; GFX940-NEXT:    v_and_b32_e32 v3, 3, v3
9711; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9712; GFX940-NEXT:    s_mov_b32 s0, 0xffff
9713; GFX940-NEXT:    v_lshlrev_b32_e64 v4, v3, s0
9714; GFX940-NEXT:    v_not_b32_e32 v6, v4
9715; GFX940-NEXT:    s_mov_b64 s[0:1], 0
9716; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
9717; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
9718; GFX940-NEXT:  .LBB35_1: ; %atomicrmw.start
9719; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
9720; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9721; GFX940-NEXT:    v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
9722; GFX940-NEXT:    s_nop 0
9723; GFX940-NEXT:    v_sub_f32_e32 v4, v4, v2
9724; GFX940-NEXT:    v_bfe_u32 v7, v4, 16, 1
9725; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v4
9726; GFX940-NEXT:    v_add3_u32 v7, v7, v4, s2
9727; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
9728; GFX940-NEXT:    s_nop 1
9729; GFX940-NEXT:    v_cndmask_b32_e32 v4, v7, v8, vcc
9730; GFX940-NEXT:    v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
9731; GFX940-NEXT:    v_and_or_b32 v4, v5, v6, v4
9732; GFX940-NEXT:    buffer_wbl2 sc1
9733; GFX940-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0
9734; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9735; GFX940-NEXT:    buffer_inv sc1
9736; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
9737; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
9738; GFX940-NEXT:    v_mov_b32_e32 v5, v4
9739; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
9740; GFX940-NEXT:    s_cbranch_execnz .LBB35_1
9741; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
9742; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
9743; GFX940-NEXT:    s_setpc_b64 s[30:31]
9744;
9745; GFX11-LABEL: flat_agent_atomic_fsub_noret_bf16:
9746; GFX11:       ; %bb.0:
9747; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9748; GFX11-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
9749; GFX11-NEXT:    s_mov_b32 s0, 0
9750; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
9751; GFX11-NEXT:    v_and_b32_e32 v0, -4, v3
9752; GFX11-NEXT:    v_and_b32_e32 v3, 3, v3
9753; GFX11-NEXT:    flat_load_b32 v4, v[0:1]
9754; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
9755; GFX11-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
9756; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
9757; GFX11-NEXT:    v_not_b32_e32 v6, v3
9758; GFX11-NEXT:    .p2align 6
9759; GFX11-NEXT:  .LBB35_1: ; %atomicrmw.start
9760; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
9761; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9762; GFX11-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
9763; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9764; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
9765; GFX11-NEXT:    v_sub_f32_e32 v3, v3, v2
9766; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
9767; GFX11-NEXT:    v_bfe_u32 v7, v3, 16, 1
9768; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v3
9769; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
9770; GFX11-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
9771; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9772; GFX11-NEXT:    v_cndmask_b32_e32 v3, v7, v8, vcc_lo
9773; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
9774; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9775; GFX11-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
9776; GFX11-NEXT:    v_and_or_b32 v3, v4, v6, v3
9777; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
9778; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
9779; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9780; GFX11-NEXT:    buffer_gl1_inv
9781; GFX11-NEXT:    buffer_gl0_inv
9782; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
9783; GFX11-NEXT:    v_mov_b32_e32 v4, v3
9784; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
9785; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
9786; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
9787; GFX11-NEXT:    s_cbranch_execnz .LBB35_1
9788; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
9789; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
9790; GFX11-NEXT:    s_setpc_b64 s[30:31]
9791;
9792; GFX10-LABEL: flat_agent_atomic_fsub_noret_bf16:
9793; GFX10:       ; %bb.0:
9794; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9795; GFX10-NEXT:    v_mov_b32_e32 v3, v0
9796; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
9797; GFX10-NEXT:    s_mov_b32 s4, 0
9798; GFX10-NEXT:    v_and_b32_e32 v0, -4, v3
9799; GFX10-NEXT:    v_and_b32_e32 v3, 3, v3
9800; GFX10-NEXT:    flat_load_dword v4, v[0:1]
9801; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
9802; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
9803; GFX10-NEXT:    v_not_b32_e32 v6, v3
9804; GFX10-NEXT:  .LBB35_1: ; %atomicrmw.start
9805; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
9806; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9807; GFX10-NEXT:    v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
9808; GFX10-NEXT:    v_sub_f32_e32 v3, v3, v2
9809; GFX10-NEXT:    v_bfe_u32 v7, v3, 16, 1
9810; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v3
9811; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
9812; GFX10-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
9813; GFX10-NEXT:    v_cndmask_b32_e32 v3, v7, v8, vcc_lo
9814; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
9815; GFX10-NEXT:    v_and_or_b32 v3, v4, v6, v3
9816; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
9817; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
9818; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9819; GFX10-NEXT:    buffer_gl1_inv
9820; GFX10-NEXT:    buffer_gl0_inv
9821; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
9822; GFX10-NEXT:    v_mov_b32_e32 v4, v3
9823; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
9824; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
9825; GFX10-NEXT:    s_cbranch_execnz .LBB35_1
9826; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
9827; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
9828; GFX10-NEXT:    s_setpc_b64 s[30:31]
9829;
9830; GFX90A-LABEL: flat_agent_atomic_fsub_noret_bf16:
9831; GFX90A:       ; %bb.0:
9832; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9833; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
9834; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v3
9835; GFX90A-NEXT:    flat_load_dword v5, v[0:1]
9836; GFX90A-NEXT:    v_and_b32_e32 v3, 3, v3
9837; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9838; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
9839; GFX90A-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
9840; GFX90A-NEXT:    v_not_b32_e32 v6, v4
9841; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
9842; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
9843; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
9844; GFX90A-NEXT:  .LBB35_1: ; %atomicrmw.start
9845; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
9846; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9847; GFX90A-NEXT:    v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
9848; GFX90A-NEXT:    v_sub_f32_e32 v4, v4, v2
9849; GFX90A-NEXT:    v_bfe_u32 v7, v4, 16, 1
9850; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v4
9851; GFX90A-NEXT:    v_add3_u32 v7, v7, v4, s6
9852; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
9853; GFX90A-NEXT:    v_cndmask_b32_e32 v4, v7, v8, vcc
9854; GFX90A-NEXT:    v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
9855; GFX90A-NEXT:    v_and_or_b32 v4, v5, v6, v4
9856; GFX90A-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
9857; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9858; GFX90A-NEXT:    buffer_wbinvl1
9859; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
9860; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
9861; GFX90A-NEXT:    v_mov_b32_e32 v5, v4
9862; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
9863; GFX90A-NEXT:    s_cbranch_execnz .LBB35_1
9864; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
9865; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
9866; GFX90A-NEXT:    s_setpc_b64 s[30:31]
9867;
9868; GFX908-LABEL: flat_agent_atomic_fsub_noret_bf16:
9869; GFX908:       ; %bb.0:
9870; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9871; GFX908-NEXT:    v_mov_b32_e32 v3, v0
9872; GFX908-NEXT:    v_and_b32_e32 v0, -4, v3
9873; GFX908-NEXT:    flat_load_dword v4, v[0:1]
9874; GFX908-NEXT:    v_and_b32_e32 v3, 3, v3
9875; GFX908-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
9876; GFX908-NEXT:    s_mov_b32 s4, 0xffff
9877; GFX908-NEXT:    v_lshlrev_b32_e64 v3, v5, s4
9878; GFX908-NEXT:    v_not_b32_e32 v6, v3
9879; GFX908-NEXT:    s_mov_b64 s[4:5], 0
9880; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
9881; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
9882; GFX908-NEXT:  .LBB35_1: ; %atomicrmw.start
9883; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
9884; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9885; GFX908-NEXT:    v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
9886; GFX908-NEXT:    v_sub_f32_e32 v3, v3, v2
9887; GFX908-NEXT:    v_bfe_u32 v7, v3, 16, 1
9888; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v3
9889; GFX908-NEXT:    v_add3_u32 v7, v7, v3, s6
9890; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
9891; GFX908-NEXT:    v_cndmask_b32_e32 v3, v7, v8, vcc
9892; GFX908-NEXT:    v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
9893; GFX908-NEXT:    v_and_or_b32 v3, v4, v6, v3
9894; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
9895; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9896; GFX908-NEXT:    buffer_wbinvl1
9897; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
9898; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
9899; GFX908-NEXT:    v_mov_b32_e32 v4, v3
9900; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
9901; GFX908-NEXT:    s_cbranch_execnz .LBB35_1
9902; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
9903; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
9904; GFX908-NEXT:    s_setpc_b64 s[30:31]
9905;
9906; GFX8-LABEL: flat_agent_atomic_fsub_noret_bf16:
9907; GFX8:       ; %bb.0:
9908; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9909; GFX8-NEXT:    v_mov_b32_e32 v3, v0
9910; GFX8-NEXT:    v_and_b32_e32 v0, -4, v3
9911; GFX8-NEXT:    flat_load_dword v4, v[0:1]
9912; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
9913; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
9914; GFX8-NEXT:    s_mov_b32 s4, 0xffff
9915; GFX8-NEXT:    v_lshlrev_b32_e64 v3, v5, s4
9916; GFX8-NEXT:    v_not_b32_e32 v6, v3
9917; GFX8-NEXT:    s_mov_b64 s[4:5], 0
9918; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
9919; GFX8-NEXT:  .LBB35_1: ; %atomicrmw.start
9920; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
9921; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9922; GFX8-NEXT:    v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
9923; GFX8-NEXT:    v_sub_f32_e32 v3, v3, v2
9924; GFX8-NEXT:    v_bfe_u32 v8, v3, 16, 1
9925; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v3
9926; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x7fff, v8
9927; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v3
9928; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
9929; GFX8-NEXT:    v_cndmask_b32_e32 v3, v8, v9, vcc
9930; GFX8-NEXT:    v_and_b32_e32 v7, v4, v6
9931; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
9932; GFX8-NEXT:    v_or_b32_e32 v3, v7, v3
9933; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
9934; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9935; GFX8-NEXT:    buffer_wbinvl1
9936; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
9937; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
9938; GFX8-NEXT:    v_mov_b32_e32 v4, v3
9939; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
9940; GFX8-NEXT:    s_cbranch_execnz .LBB35_1
9941; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
9942; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
9943; GFX8-NEXT:    s_setpc_b64 s[30:31]
9944;
9945; GFX7-LABEL: flat_agent_atomic_fsub_noret_bf16:
9946; GFX7:       ; %bb.0:
9947; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9948; GFX7-NEXT:    v_mov_b32_e32 v3, v0
9949; GFX7-NEXT:    v_and_b32_e32 v0, -4, v3
9950; GFX7-NEXT:    flat_load_dword v4, v[0:1]
9951; GFX7-NEXT:    v_and_b32_e32 v3, 3, v3
9952; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
9953; GFX7-NEXT:    v_lshl_b32_e32 v3, 0xffff, v5
9954; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
9955; GFX7-NEXT:    v_not_b32_e32 v6, v3
9956; GFX7-NEXT:    s_mov_b64 s[4:5], 0
9957; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
9958; GFX7-NEXT:  .LBB35_1: ; %atomicrmw.start
9959; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
9960; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9961; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
9962; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
9963; GFX7-NEXT:    v_sub_f32_e32 v3, v3, v2
9964; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
9965; GFX7-NEXT:    v_and_b32_e32 v7, v4, v6
9966; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
9967; GFX7-NEXT:    v_or_b32_e32 v3, v7, v3
9968; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
9969; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9970; GFX7-NEXT:    buffer_wbinvl1
9971; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
9972; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
9973; GFX7-NEXT:    v_mov_b32_e32 v4, v3
9974; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
9975; GFX7-NEXT:    s_cbranch_execnz .LBB35_1
9976; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
9977; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
9978; GFX7-NEXT:    s_setpc_b64 s[30:31]
9979  %unused = atomicrmw fsub ptr %ptr, bfloat %val syncscope("agent") seq_cst
9980  ret void
9981}
9982
; Agent-scope, seq_cst bfloat atomicrmw fsub whose result is discarded
; (%unused), applied 1023 bfloat elements past %ptr. The expected output for
; every checked target adds 0x7fe to the pointer, masks the address down to
; the enclosing 4-byte-aligned dword (and with -4), and retries a 32-bit
; flat cmpswap in a loop until the value read back equals the value expected.
; The assertions are autogenerated (see the note on the first line of this
; file); regenerate them with the update script rather than editing by hand.
9983define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat %val) #0 {
9984; GFX12-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos:
9985; GFX12:       ; %bb.0:
9986; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
9987; GFX12-NEXT:    s_wait_expcnt 0x0
9988; GFX12-NEXT:    s_wait_samplecnt 0x0
9989; GFX12-NEXT:    s_wait_bvhcnt 0x0
9990; GFX12-NEXT:    s_wait_kmcnt 0x0
9991; GFX12-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
9992; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
9993; GFX12-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
9994; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
9995; GFX12-NEXT:    v_and_b32_e32 v0, -4, v4
9996; GFX12-NEXT:    v_and_b32_e32 v4, 3, v4
9997; GFX12-NEXT:    s_mov_b32 s0, 0
9998; GFX12-NEXT:    flat_load_b32 v3, v[0:1]
9999; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
10000; GFX12-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
10001; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
10002; GFX12-NEXT:    v_not_b32_e32 v5, v5
10003; GFX12-NEXT:  .LBB36_1: ; %atomicrmw.start
10004; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
10005; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
10006; GFX12-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
10007; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10008; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
10009; GFX12-NEXT:    v_sub_f32_e32 v2, v2, v6
10010; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
10011; GFX12-NEXT:    v_bfe_u32 v7, v2, 16, 1
10012; GFX12-NEXT:    v_or_b32_e32 v8, 0x400000, v2
10013; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
10014; GFX12-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
10015; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10016; GFX12-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc_lo
10017; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
10018; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10019; GFX12-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
10020; GFX12-NEXT:    v_and_or_b32 v2, v3, v5, v2
10021; GFX12-NEXT:    s_wait_storecnt 0x0
10022; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
10023; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
10024; GFX12-NEXT:    global_inv scope:SCOPE_DEV
10025; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
10026; GFX12-NEXT:    v_mov_b32_e32 v3, v2
10027; GFX12-NEXT:    s_wait_alu 0xfffe
10028; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
10029; GFX12-NEXT:    s_wait_alu 0xfffe
10030; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
10031; GFX12-NEXT:    s_cbranch_execnz .LBB36_1
10032; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
10033; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
10034; GFX12-NEXT:    s_wait_alu 0xfffe
10035; GFX12-NEXT:    s_setpc_b64 s[30:31]
10036;
10037; GFX940-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos:
10038; GFX940:       ; %bb.0:
10039; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10040; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
10041; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
10042; GFX940-NEXT:    v_and_b32_e32 v0, -4, v4
10043; GFX940-NEXT:    v_mov_b32_e32 v1, v5
10044; GFX940-NEXT:    flat_load_dword v3, v[0:1]
10045; GFX940-NEXT:    v_and_b32_e32 v4, 3, v4
10046; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
10047; GFX940-NEXT:    s_mov_b32 s0, 0xffff
10048; GFX940-NEXT:    v_lshlrev_b32_e64 v5, v4, s0
10049; GFX940-NEXT:    v_not_b32_e32 v5, v5
10050; GFX940-NEXT:    s_mov_b64 s[0:1], 0
10051; GFX940-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
10052; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
10053; GFX940-NEXT:  .LBB36_1: ; %atomicrmw.start
10054; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
10055; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10056; GFX940-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
10057; GFX940-NEXT:    s_nop 0
10058; GFX940-NEXT:    v_sub_f32_e32 v2, v2, v6
10059; GFX940-NEXT:    v_bfe_u32 v7, v2, 16, 1
10060; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v2
10061; GFX940-NEXT:    v_add3_u32 v7, v7, v2, s2
10062; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
10063; GFX940-NEXT:    s_nop 1
10064; GFX940-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc
10065; GFX940-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
10066; GFX940-NEXT:    v_and_or_b32 v2, v3, v5, v2
10067; GFX940-NEXT:    buffer_wbl2 sc1
10068; GFX940-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
10069; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10070; GFX940-NEXT:    buffer_inv sc1
10071; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
10072; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
10073; GFX940-NEXT:    v_mov_b32_e32 v3, v2
10074; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
10075; GFX940-NEXT:    s_cbranch_execnz .LBB36_1
10076; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
10077; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
10078; GFX940-NEXT:    s_setpc_b64 s[30:31]
10079;
10080; GFX11-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos:
10081; GFX11:       ; %bb.0:
10082; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10083; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
10084; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
10085; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
10086; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
10087; GFX11-NEXT:    v_and_b32_e32 v0, -4, v4
10088; GFX11-NEXT:    v_and_b32_e32 v4, 3, v4
10089; GFX11-NEXT:    s_mov_b32 s0, 0
10090; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
10091; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
10092; GFX11-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
10093; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
10094; GFX11-NEXT:    v_not_b32_e32 v5, v5
10095; GFX11-NEXT:    .p2align 6
10096; GFX11-NEXT:  .LBB36_1: ; %atomicrmw.start
10097; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
10098; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10099; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
10100; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10101; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
10102; GFX11-NEXT:    v_sub_f32_e32 v2, v2, v6
10103; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
10104; GFX11-NEXT:    v_bfe_u32 v7, v2, 16, 1
10105; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v2
10106; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
10107; GFX11-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
10108; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10109; GFX11-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc_lo
10110; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
10111; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10112; GFX11-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
10113; GFX11-NEXT:    v_and_or_b32 v2, v3, v5, v2
10114; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
10115; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
10116; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10117; GFX11-NEXT:    buffer_gl1_inv
10118; GFX11-NEXT:    buffer_gl0_inv
10119; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
10120; GFX11-NEXT:    v_mov_b32_e32 v3, v2
10121; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
10122; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
10123; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
10124; GFX11-NEXT:    s_cbranch_execnz .LBB36_1
10125; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
10126; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
10127; GFX11-NEXT:    s_setpc_b64 s[30:31]
10128;
10129; GFX10-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos:
10130; GFX10:       ; %bb.0:
10131; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10132; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
10133; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
10134; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
10135; GFX10-NEXT:    v_and_b32_e32 v0, -4, v4
10136; GFX10-NEXT:    v_and_b32_e32 v4, 3, v4
10137; GFX10-NEXT:    s_mov_b32 s4, 0
10138; GFX10-NEXT:    flat_load_dword v3, v[0:1]
10139; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
10140; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
10141; GFX10-NEXT:    v_not_b32_e32 v5, v5
10142; GFX10-NEXT:  .LBB36_1: ; %atomicrmw.start
10143; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
10144; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10145; GFX10-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
10146; GFX10-NEXT:    v_sub_f32_e32 v2, v2, v6
10147; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
10148; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v2
10149; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
10150; GFX10-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
10151; GFX10-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc_lo
10152; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
10153; GFX10-NEXT:    v_and_or_b32 v2, v3, v5, v2
10154; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
10155; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10156; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10157; GFX10-NEXT:    buffer_gl1_inv
10158; GFX10-NEXT:    buffer_gl0_inv
10159; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
10160; GFX10-NEXT:    v_mov_b32_e32 v3, v2
10161; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
10162; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
10163; GFX10-NEXT:    s_cbranch_execnz .LBB36_1
10164; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
10165; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
10166; GFX10-NEXT:    s_setpc_b64 s[30:31]
10167;
10168; GFX90A-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos:
10169; GFX90A:       ; %bb.0:
10170; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10171; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
10172; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
10173; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v4
10174; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
10175; GFX90A-NEXT:    v_and_b32_e32 v4, 3, v4
10176; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
10177; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
10178; GFX90A-NEXT:    v_lshlrev_b32_e64 v5, v4, s4
10179; GFX90A-NEXT:    v_not_b32_e32 v5, v5
10180; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
10181; GFX90A-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
10182; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
10183; GFX90A-NEXT:  .LBB36_1: ; %atomicrmw.start
10184; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
10185; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10186; GFX90A-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
10187; GFX90A-NEXT:    v_sub_f32_e32 v2, v2, v6
10188; GFX90A-NEXT:    v_bfe_u32 v7, v2, 16, 1
10189; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v2
10190; GFX90A-NEXT:    v_add3_u32 v7, v7, v2, s6
10191; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
10192; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc
10193; GFX90A-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
10194; GFX90A-NEXT:    v_and_or_b32 v2, v3, v5, v2
10195; GFX90A-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10196; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10197; GFX90A-NEXT:    buffer_wbinvl1
10198; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
10199; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10200; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
10201; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10202; GFX90A-NEXT:    s_cbranch_execnz .LBB36_1
10203; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
10204; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
10205; GFX90A-NEXT:    s_setpc_b64 s[30:31]
10206;
10207; GFX908-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos:
10208; GFX908:       ; %bb.0:
10209; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10210; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
10211; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
10212; GFX908-NEXT:    v_and_b32_e32 v0, -4, v4
10213; GFX908-NEXT:    flat_load_dword v3, v[0:1]
10214; GFX908-NEXT:    v_and_b32_e32 v4, 3, v4
10215; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
10216; GFX908-NEXT:    s_mov_b32 s4, 0xffff
10217; GFX908-NEXT:    v_lshlrev_b32_e64 v5, v4, s4
10218; GFX908-NEXT:    v_not_b32_e32 v5, v5
10219; GFX908-NEXT:    s_mov_b64 s[4:5], 0
10220; GFX908-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
10221; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
10222; GFX908-NEXT:  .LBB36_1: ; %atomicrmw.start
10223; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
10224; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10225; GFX908-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
10226; GFX908-NEXT:    v_sub_f32_e32 v2, v2, v6
10227; GFX908-NEXT:    v_bfe_u32 v7, v2, 16, 1
10228; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v2
10229; GFX908-NEXT:    v_add3_u32 v7, v7, v2, s6
10230; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
10231; GFX908-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc
10232; GFX908-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
10233; GFX908-NEXT:    v_and_or_b32 v2, v3, v5, v2
10234; GFX908-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10235; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10236; GFX908-NEXT:    buffer_wbinvl1
10237; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
10238; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10239; GFX908-NEXT:    v_mov_b32_e32 v3, v2
10240; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10241; GFX908-NEXT:    s_cbranch_execnz .LBB36_1
10242; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
10243; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
10244; GFX908-NEXT:    s_setpc_b64 s[30:31]
10245;
10246; GFX8-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos:
10247; GFX8:       ; %bb.0:
10248; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10249; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fe, v0
10250; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
10251; GFX8-NEXT:    v_and_b32_e32 v0, -4, v4
10252; GFX8-NEXT:    flat_load_dword v3, v[0:1]
10253; GFX8-NEXT:    v_and_b32_e32 v4, 3, v4
10254; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
10255; GFX8-NEXT:    s_mov_b32 s4, 0xffff
10256; GFX8-NEXT:    v_lshlrev_b32_e64 v5, v4, s4
10257; GFX8-NEXT:    v_not_b32_e32 v5, v5
10258; GFX8-NEXT:    s_mov_b64 s[4:5], 0
10259; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
10260; GFX8-NEXT:  .LBB36_1: ; %atomicrmw.start
10261; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
10262; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10263; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
10264; GFX8-NEXT:    v_sub_f32_e32 v2, v2, v6
10265; GFX8-NEXT:    v_bfe_u32 v8, v2, 16, 1
10266; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v2
10267; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x7fff, v8
10268; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v2
10269; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
10270; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v9, vcc
10271; GFX8-NEXT:    v_and_b32_e32 v7, v3, v5
10272; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
10273; GFX8-NEXT:    v_or_b32_e32 v2, v7, v2
10274; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10275; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10276; GFX8-NEXT:    buffer_wbinvl1
10277; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
10278; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10279; GFX8-NEXT:    v_mov_b32_e32 v3, v2
10280; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10281; GFX8-NEXT:    s_cbranch_execnz .LBB36_1
10282; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
10283; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
10284; GFX8-NEXT:    s_setpc_b64 s[30:31]
10285;
10286; GFX7-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos:
10287; GFX7:       ; %bb.0:
10288; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10289; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
10290; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
10291; GFX7-NEXT:    v_and_b32_e32 v0, -4, v4
10292; GFX7-NEXT:    flat_load_dword v3, v[0:1]
10293; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
10294; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
10295; GFX7-NEXT:    v_lshl_b32_e32 v5, 0xffff, v4
10296; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
10297; GFX7-NEXT:    v_not_b32_e32 v5, v5
10298; GFX7-NEXT:    s_mov_b64 s[4:5], 0
10299; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
10300; GFX7-NEXT:  .LBB36_1: ; %atomicrmw.start
10301; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
10302; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10303; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
10304; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
10305; GFX7-NEXT:    v_sub_f32_e32 v2, v2, v6
10306; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
10307; GFX7-NEXT:    v_and_b32_e32 v7, v3, v5
10308; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
10309; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
10310; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10311; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10312; GFX7-NEXT:    buffer_wbinvl1
10313; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
10314; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10315; GFX7-NEXT:    v_mov_b32_e32 v3, v2
10316; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10317; GFX7-NEXT:    s_cbranch_execnz .LBB36_1
10318; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
10319; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
10320; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; 1023 bfloat elements is a byte offset of 2046 (0x7fe), which is the
  ; immediate every target above adds to the incoming pointer.
10321  %gep = getelementptr bfloat, ptr %ptr, i64 1023
  ; The old value is bound to %unused and never read, making this the
  ; no-return form of the operation.
10322  %unused = atomicrmw fsub ptr %gep, bfloat %val syncscope("agent") seq_cst
10323  ret void
10324}
10325
10326define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat %val) #0 {
10327; GFX12-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg:
10328; GFX12:       ; %bb.0:
10329; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
10330; GFX12-NEXT:    s_wait_expcnt 0x0
10331; GFX12-NEXT:    s_wait_samplecnt 0x0
10332; GFX12-NEXT:    s_wait_bvhcnt 0x0
10333; GFX12-NEXT:    s_wait_kmcnt 0x0
10334; GFX12-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
10335; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
10336; GFX12-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
10337; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
10338; GFX12-NEXT:    v_and_b32_e32 v0, -4, v4
10339; GFX12-NEXT:    v_and_b32_e32 v4, 3, v4
10340; GFX12-NEXT:    s_mov_b32 s0, 0
10341; GFX12-NEXT:    flat_load_b32 v3, v[0:1]
10342; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
10343; GFX12-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
10344; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
10345; GFX12-NEXT:    v_not_b32_e32 v5, v5
10346; GFX12-NEXT:  .LBB37_1: ; %atomicrmw.start
10347; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
10348; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
10349; GFX12-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
10350; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10351; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
10352; GFX12-NEXT:    v_sub_f32_e32 v2, v2, v6
10353; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
10354; GFX12-NEXT:    v_bfe_u32 v7, v2, 16, 1
10355; GFX12-NEXT:    v_or_b32_e32 v8, 0x400000, v2
10356; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
10357; GFX12-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
10358; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10359; GFX12-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc_lo
10360; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
10361; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10362; GFX12-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
10363; GFX12-NEXT:    v_and_or_b32 v2, v3, v5, v2
10364; GFX12-NEXT:    s_wait_storecnt 0x0
10365; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
10366; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
10367; GFX12-NEXT:    global_inv scope:SCOPE_DEV
10368; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
10369; GFX12-NEXT:    v_mov_b32_e32 v3, v2
10370; GFX12-NEXT:    s_wait_alu 0xfffe
10371; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
10372; GFX12-NEXT:    s_wait_alu 0xfffe
10373; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
10374; GFX12-NEXT:    s_cbranch_execnz .LBB37_1
10375; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
10376; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
10377; GFX12-NEXT:    s_wait_alu 0xfffe
10378; GFX12-NEXT:    s_setpc_b64 s[30:31]
10379;
10380; GFX940-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg:
10381; GFX940:       ; %bb.0:
10382; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10383; GFX940-NEXT:    s_movk_i32 s0, 0xf800
10384; GFX940-NEXT:    s_mov_b32 s1, -1
10385; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
10386; GFX940-NEXT:    v_and_b32_e32 v0, -4, v4
10387; GFX940-NEXT:    v_mov_b32_e32 v1, v5
10388; GFX940-NEXT:    flat_load_dword v3, v[0:1]
10389; GFX940-NEXT:    v_and_b32_e32 v4, 3, v4
10390; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
10391; GFX940-NEXT:    s_mov_b32 s0, 0xffff
10392; GFX940-NEXT:    v_lshlrev_b32_e64 v5, v4, s0
10393; GFX940-NEXT:    v_not_b32_e32 v5, v5
10394; GFX940-NEXT:    s_mov_b64 s[0:1], 0
10395; GFX940-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
10396; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
10397; GFX940-NEXT:  .LBB37_1: ; %atomicrmw.start
10398; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
10399; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10400; GFX940-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
10401; GFX940-NEXT:    s_nop 0
10402; GFX940-NEXT:    v_sub_f32_e32 v2, v2, v6
10403; GFX940-NEXT:    v_bfe_u32 v7, v2, 16, 1
10404; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v2
10405; GFX940-NEXT:    v_add3_u32 v7, v7, v2, s2
10406; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
10407; GFX940-NEXT:    s_nop 1
10408; GFX940-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc
10409; GFX940-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
10410; GFX940-NEXT:    v_and_or_b32 v2, v3, v5, v2
10411; GFX940-NEXT:    buffer_wbl2 sc1
10412; GFX940-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
10413; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10414; GFX940-NEXT:    buffer_inv sc1
10415; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
10416; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
10417; GFX940-NEXT:    v_mov_b32_e32 v3, v2
10418; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
10419; GFX940-NEXT:    s_cbranch_execnz .LBB37_1
10420; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
10421; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
10422; GFX940-NEXT:    s_setpc_b64 s[30:31]
10423;
10424; GFX11-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg:
10425; GFX11:       ; %bb.0:
10426; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10427; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
10428; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
10429; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
10430; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
10431; GFX11-NEXT:    v_and_b32_e32 v0, -4, v4
10432; GFX11-NEXT:    v_and_b32_e32 v4, 3, v4
10433; GFX11-NEXT:    s_mov_b32 s0, 0
10434; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
10435; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
10436; GFX11-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
10437; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
10438; GFX11-NEXT:    v_not_b32_e32 v5, v5
10439; GFX11-NEXT:    .p2align 6
10440; GFX11-NEXT:  .LBB37_1: ; %atomicrmw.start
10441; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
10442; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10443; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
10444; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10445; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
10446; GFX11-NEXT:    v_sub_f32_e32 v2, v2, v6
10447; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
10448; GFX11-NEXT:    v_bfe_u32 v7, v2, 16, 1
10449; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v2
10450; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
10451; GFX11-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
10452; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10453; GFX11-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc_lo
10454; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
10455; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10456; GFX11-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
10457; GFX11-NEXT:    v_and_or_b32 v2, v3, v5, v2
10458; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
10459; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
10460; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10461; GFX11-NEXT:    buffer_gl1_inv
10462; GFX11-NEXT:    buffer_gl0_inv
10463; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
10464; GFX11-NEXT:    v_mov_b32_e32 v3, v2
10465; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
10466; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
10467; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
10468; GFX11-NEXT:    s_cbranch_execnz .LBB37_1
10469; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
10470; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
10471; GFX11-NEXT:    s_setpc_b64 s[30:31]
10472;
10473; GFX10-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg:
10474; GFX10:       ; %bb.0:
10475; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10476; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
10477; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
10478; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
10479; GFX10-NEXT:    v_and_b32_e32 v0, -4, v4
10480; GFX10-NEXT:    v_and_b32_e32 v4, 3, v4
10481; GFX10-NEXT:    s_mov_b32 s4, 0
10482; GFX10-NEXT:    flat_load_dword v3, v[0:1]
10483; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
10484; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
10485; GFX10-NEXT:    v_not_b32_e32 v5, v5
10486; GFX10-NEXT:  .LBB37_1: ; %atomicrmw.start
10487; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
10488; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10489; GFX10-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
10490; GFX10-NEXT:    v_sub_f32_e32 v2, v2, v6
10491; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
10492; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v2
10493; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
10494; GFX10-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
10495; GFX10-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc_lo
10496; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
10497; GFX10-NEXT:    v_and_or_b32 v2, v3, v5, v2
10498; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
10499; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10500; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10501; GFX10-NEXT:    buffer_gl1_inv
10502; GFX10-NEXT:    buffer_gl0_inv
10503; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
10504; GFX10-NEXT:    v_mov_b32_e32 v3, v2
10505; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
10506; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
10507; GFX10-NEXT:    s_cbranch_execnz .LBB37_1
10508; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
10509; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
10510; GFX10-NEXT:    s_setpc_b64 s[30:31]
10511;
10512; GFX90A-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg:
10513; GFX90A:       ; %bb.0:
10514; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10515; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
10516; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
10517; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v4
10518; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
10519; GFX90A-NEXT:    v_and_b32_e32 v4, 3, v4
10520; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
10521; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
10522; GFX90A-NEXT:    v_lshlrev_b32_e64 v5, v4, s4
10523; GFX90A-NEXT:    v_not_b32_e32 v5, v5
10524; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
10525; GFX90A-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
10526; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
10527; GFX90A-NEXT:  .LBB37_1: ; %atomicrmw.start
10528; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
10529; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10530; GFX90A-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
10531; GFX90A-NEXT:    v_sub_f32_e32 v2, v2, v6
10532; GFX90A-NEXT:    v_bfe_u32 v7, v2, 16, 1
10533; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v2
10534; GFX90A-NEXT:    v_add3_u32 v7, v7, v2, s6
10535; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
10536; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc
10537; GFX90A-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
10538; GFX90A-NEXT:    v_and_or_b32 v2, v3, v5, v2
10539; GFX90A-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10540; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10541; GFX90A-NEXT:    buffer_wbinvl1
10542; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
10543; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10544; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
10545; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10546; GFX90A-NEXT:    s_cbranch_execnz .LBB37_1
10547; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
10548; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
10549; GFX90A-NEXT:    s_setpc_b64 s[30:31]
10550;
10551; GFX908-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg:
10552; GFX908:       ; %bb.0:
10553; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10554; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
10555; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
10556; GFX908-NEXT:    v_and_b32_e32 v0, -4, v4
10557; GFX908-NEXT:    flat_load_dword v3, v[0:1]
10558; GFX908-NEXT:    v_and_b32_e32 v4, 3, v4
10559; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
10560; GFX908-NEXT:    s_mov_b32 s4, 0xffff
10561; GFX908-NEXT:    v_lshlrev_b32_e64 v5, v4, s4
10562; GFX908-NEXT:    v_not_b32_e32 v5, v5
10563; GFX908-NEXT:    s_mov_b64 s[4:5], 0
10564; GFX908-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
10565; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
10566; GFX908-NEXT:  .LBB37_1: ; %atomicrmw.start
10567; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
10568; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10569; GFX908-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
10570; GFX908-NEXT:    v_sub_f32_e32 v2, v2, v6
10571; GFX908-NEXT:    v_bfe_u32 v7, v2, 16, 1
10572; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v2
10573; GFX908-NEXT:    v_add3_u32 v7, v7, v2, s6
10574; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
10575; GFX908-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc
10576; GFX908-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
10577; GFX908-NEXT:    v_and_or_b32 v2, v3, v5, v2
10578; GFX908-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10579; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10580; GFX908-NEXT:    buffer_wbinvl1
10581; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
10582; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10583; GFX908-NEXT:    v_mov_b32_e32 v3, v2
10584; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10585; GFX908-NEXT:    s_cbranch_execnz .LBB37_1
10586; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
10587; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
10588; GFX908-NEXT:    s_setpc_b64 s[30:31]
10589;
10590; GFX8-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg:
10591; GFX8:       ; %bb.0:
10592; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10593; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xfffff800, v0
10594; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
10595; GFX8-NEXT:    v_and_b32_e32 v0, -4, v4
10596; GFX8-NEXT:    flat_load_dword v3, v[0:1]
10597; GFX8-NEXT:    v_and_b32_e32 v4, 3, v4
10598; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
10599; GFX8-NEXT:    s_mov_b32 s4, 0xffff
10600; GFX8-NEXT:    v_lshlrev_b32_e64 v5, v4, s4
10601; GFX8-NEXT:    v_not_b32_e32 v5, v5
10602; GFX8-NEXT:    s_mov_b64 s[4:5], 0
10603; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
10604; GFX8-NEXT:  .LBB37_1: ; %atomicrmw.start
10605; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
10606; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10607; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
10608; GFX8-NEXT:    v_sub_f32_e32 v2, v2, v6
10609; GFX8-NEXT:    v_bfe_u32 v8, v2, 16, 1
10610; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v2
10611; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x7fff, v8
10612; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v2
10613; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
10614; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v9, vcc
10615; GFX8-NEXT:    v_and_b32_e32 v7, v3, v5
10616; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
10617; GFX8-NEXT:    v_or_b32_e32 v2, v7, v2
10618; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10619; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10620; GFX8-NEXT:    buffer_wbinvl1
10621; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
10622; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10623; GFX8-NEXT:    v_mov_b32_e32 v3, v2
10624; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10625; GFX8-NEXT:    s_cbranch_execnz .LBB37_1
10626; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
10627; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
10628; GFX8-NEXT:    s_setpc_b64 s[30:31]
10629;
10630; GFX7-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg:
10631; GFX7:       ; %bb.0:
10632; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10633; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
10634; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
10635; GFX7-NEXT:    v_and_b32_e32 v0, -4, v4
10636; GFX7-NEXT:    flat_load_dword v3, v[0:1]
10637; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
10638; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
10639; GFX7-NEXT:    v_lshl_b32_e32 v5, 0xffff, v4
10640; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
10641; GFX7-NEXT:    v_not_b32_e32 v5, v5
10642; GFX7-NEXT:    s_mov_b64 s[4:5], 0
10643; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
10644; GFX7-NEXT:  .LBB37_1: ; %atomicrmw.start
10645; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
10646; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10647; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
10648; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
10649; GFX7-NEXT:    v_sub_f32_e32 v2, v2, v6
10650; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
10651; GFX7-NEXT:    v_and_b32_e32 v7, v3, v5
10652; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
10653; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
10654; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10655; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10656; GFX7-NEXT:    buffer_wbinvl1
10657; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
10658; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10659; GFX7-NEXT:    v_mov_b32_e32 v3, v2
10660; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10661; GFX7-NEXT:    s_cbranch_execnz .LBB37_1
10662; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
10663; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
10664; GFX7-NEXT:    s_setpc_b64 s[30:31]
10665  %gep = getelementptr bfloat, ptr %ptr, i64 -1024
10666  %unused = atomicrmw fsub ptr %gep, bfloat %val syncscope("agent") seq_cst
10667  ret void
10668}
10669
; Value-returning bfloat `atomicrmw fsub` at agent scope (seq_cst) on a flat
; pointer advanced by 1023 bfloat elements, which the checks below show as a
; byte offset of 2046 (0x7fe). The access is declared `align 4`, and every
; target's checks show a compare-and-swap retry loop on a full 32-bit word:
; the loaded dword's upper half is kept through the 0xffff0000 mask and the
; newly computed bf16 result is placed in the low 16 bits.
; In the checks, gfx1200, gfx940, gfx1100, gfx90a and gfx908 carry the offset
; on the flat instructions themselves, while gfx1010, tonga and hawaii add
; 0x7fe to the pointer before the initial load.
; NOTE: the assertions in the body are autogenerated (see the header of this
; file); regenerate them with the update script rather than editing by hand.
10670define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, bfloat %val) #0 {
10671; GFX12-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
10672; GFX12:       ; %bb.0:
10673; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
10674; GFX12-NEXT:    s_wait_expcnt 0x0
10675; GFX12-NEXT:    s_wait_samplecnt 0x0
10676; GFX12-NEXT:    s_wait_bvhcnt 0x0
10677; GFX12-NEXT:    s_wait_kmcnt 0x0
10678; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2046
10679; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
10680; GFX12-NEXT:    s_mov_b32 s0, 0
10681; GFX12-NEXT:  .LBB38_1: ; %atomicrmw.start
10682; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
10683; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
10684; GFX12-NEXT:    v_mov_b32_e32 v4, v3
10685; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10686; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
10687; GFX12-NEXT:    v_sub_f32_e32 v3, v3, v2
10688; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
10689; GFX12-NEXT:    v_bfe_u32 v5, v3, 16, 1
10690; GFX12-NEXT:    v_or_b32_e32 v6, 0x400000, v3
10691; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
10692; GFX12-NEXT:    v_add3_u32 v5, v5, v3, 0x7fff
10693; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10694; GFX12-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc_lo
10695; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
10696; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
10697; GFX12-NEXT:    v_and_or_b32 v3, 0xffff0000, v4, v3
10698; GFX12-NEXT:    s_wait_storecnt 0x0
10699; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
10700; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
10701; GFX12-NEXT:    global_inv scope:SCOPE_DEV
10702; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
10703; GFX12-NEXT:    s_wait_alu 0xfffe
10704; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
10705; GFX12-NEXT:    s_wait_alu 0xfffe
10706; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
10707; GFX12-NEXT:    s_cbranch_execnz .LBB38_1
10708; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
10709; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
10710; GFX12-NEXT:    v_mov_b32_e32 v0, v3
10711; GFX12-NEXT:    s_wait_alu 0xfffe
10712; GFX12-NEXT:    s_setpc_b64 s[30:31]
10713;
10714; GFX940-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
10715; GFX940:       ; %bb.0:
10716; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10717; GFX940-NEXT:    flat_load_dword v3, v[0:1] offset:2046
10718; GFX940-NEXT:    s_mov_b64 s[0:1], 0
10719; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
10720; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
10721; GFX940-NEXT:    s_mov_b32 s3, 0xffff0000
10722; GFX940-NEXT:  .LBB38_1: ; %atomicrmw.start
10723; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
10724; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10725; GFX940-NEXT:    v_mov_b32_e32 v5, v3
10726; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
10727; GFX940-NEXT:    v_sub_f32_e32 v3, v3, v2
10728; GFX940-NEXT:    v_bfe_u32 v4, v3, 16, 1
10729; GFX940-NEXT:    v_or_b32_e32 v6, 0x400000, v3
10730; GFX940-NEXT:    v_add3_u32 v4, v4, v3, s2
10731; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
10732; GFX940-NEXT:    s_nop 1
10733; GFX940-NEXT:    v_cndmask_b32_e32 v3, v4, v6, vcc
10734; GFX940-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
10735; GFX940-NEXT:    v_and_or_b32 v4, v5, s3, v3
10736; GFX940-NEXT:    buffer_wbl2 sc1
10737; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0
10738; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10739; GFX940-NEXT:    buffer_inv sc1
10740; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
10741; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
10742; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
10743; GFX940-NEXT:    s_cbranch_execnz .LBB38_1
10744; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
10745; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
10746; GFX940-NEXT:    v_mov_b32_e32 v0, v3
10747; GFX940-NEXT:    s_setpc_b64 s[30:31]
10748;
10749; GFX11-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
10750; GFX11:       ; %bb.0:
10751; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10752; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2046
10753; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
10754; GFX11-NEXT:    s_mov_b32 s0, 0
10755; GFX11-NEXT:    .p2align 6
10756; GFX11-NEXT:  .LBB38_1: ; %atomicrmw.start
10757; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
10758; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10759; GFX11-NEXT:    v_mov_b32_e32 v4, v3
10760; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10761; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
10762; GFX11-NEXT:    v_sub_f32_e32 v3, v3, v2
10763; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
10764; GFX11-NEXT:    v_bfe_u32 v5, v3, 16, 1
10765; GFX11-NEXT:    v_or_b32_e32 v6, 0x400000, v3
10766; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
10767; GFX11-NEXT:    v_add3_u32 v5, v5, v3, 0x7fff
10768; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10769; GFX11-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc_lo
10770; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
10771; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
10772; GFX11-NEXT:    v_and_or_b32 v3, 0xffff0000, v4, v3
10773; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
10774; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
10775; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10776; GFX11-NEXT:    buffer_gl1_inv
10777; GFX11-NEXT:    buffer_gl0_inv
10778; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
10779; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
10780; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
10781; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
10782; GFX11-NEXT:    s_cbranch_execnz .LBB38_1
10783; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
10784; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
10785; GFX11-NEXT:    v_mov_b32_e32 v0, v3
10786; GFX11-NEXT:    s_setpc_b64 s[30:31]
10787;
10788; GFX10-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
10789; GFX10:       ; %bb.0:
10790; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10791; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
10792; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
10793; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
10794; GFX10-NEXT:    s_mov_b32 s4, 0
10795; GFX10-NEXT:    flat_load_dword v0, v[3:4]
10796; GFX10-NEXT:  .LBB38_1: ; %atomicrmw.start
10797; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
10798; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10799; GFX10-NEXT:    v_mov_b32_e32 v6, v0
10800; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
10801; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v1
10802; GFX10-NEXT:    v_bfe_u32 v2, v0, 16, 1
10803; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v0
10804; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
10805; GFX10-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
10806; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc_lo
10807; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
10808; GFX10-NEXT:    v_and_or_b32 v5, 0xffff0000, v6, v0
10809; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
10810; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
10811; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10812; GFX10-NEXT:    buffer_gl1_inv
10813; GFX10-NEXT:    buffer_gl0_inv
10814; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
10815; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
10816; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
10817; GFX10-NEXT:    s_cbranch_execnz .LBB38_1
10818; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
10819; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
10820; GFX10-NEXT:    s_setpc_b64 s[30:31]
10821;
10822; GFX90A-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
10823; GFX90A:       ; %bb.0:
10824; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10825; GFX90A-NEXT:    flat_load_dword v3, v[0:1] offset:2046
10826; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
10827; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
10828; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
10829; GFX90A-NEXT:    s_mov_b32 s7, 0xffff0000
10830; GFX90A-NEXT:  .LBB38_1: ; %atomicrmw.start
10831; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
10832; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10833; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
10834; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
10835; GFX90A-NEXT:    v_sub_f32_e32 v3, v3, v2
10836; GFX90A-NEXT:    v_bfe_u32 v4, v3, 16, 1
10837; GFX90A-NEXT:    v_or_b32_e32 v6, 0x400000, v3
10838; GFX90A-NEXT:    v_add3_u32 v4, v4, v3, s6
10839; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
10840; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v4, v6, vcc
10841; GFX90A-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
10842; GFX90A-NEXT:    v_and_or_b32 v4, v5, s7, v3
10843; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc
10844; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10845; GFX90A-NEXT:    buffer_wbinvl1
10846; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
10847; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10848; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10849; GFX90A-NEXT:    s_cbranch_execnz .LBB38_1
10850; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
10851; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
10852; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
10853; GFX90A-NEXT:    s_setpc_b64 s[30:31]
10854;
10855; GFX908-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
10856; GFX908:       ; %bb.0:
10857; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10858; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2046
10859; GFX908-NEXT:    s_mov_b64 s[4:5], 0
10860; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
10861; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
10862; GFX908-NEXT:    s_mov_b32 s7, 0xffff0000
10863; GFX908-NEXT:  .LBB38_1: ; %atomicrmw.start
10864; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
10865; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10866; GFX908-NEXT:    v_mov_b32_e32 v4, v3
10867; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
10868; GFX908-NEXT:    v_sub_f32_e32 v3, v3, v2
10869; GFX908-NEXT:    v_bfe_u32 v5, v3, 16, 1
10870; GFX908-NEXT:    v_or_b32_e32 v6, 0x400000, v3
10871; GFX908-NEXT:    v_add3_u32 v5, v5, v3, s6
10872; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
10873; GFX908-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
10874; GFX908-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
10875; GFX908-NEXT:    v_and_or_b32 v3, v4, s7, v3
10876; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc
10877; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10878; GFX908-NEXT:    buffer_wbinvl1
10879; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
10880; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10881; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10882; GFX908-NEXT:    s_cbranch_execnz .LBB38_1
10883; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
10884; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
10885; GFX908-NEXT:    v_mov_b32_e32 v0, v3
10886; GFX908-NEXT:    s_setpc_b64 s[30:31]
10887;
10888; GFX8-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
10889; GFX8:       ; %bb.0:
10890; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10891; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
10892; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
10893; GFX8-NEXT:    flat_load_dword v0, v[3:4]
10894; GFX8-NEXT:    s_mov_b64 s[4:5], 0
10895; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
10896; GFX8-NEXT:  .LBB38_1: ; %atomicrmw.start
10897; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
10898; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10899; GFX8-NEXT:    v_mov_b32_e32 v6, v0
10900; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
10901; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
10902; GFX8-NEXT:    v_bfe_u32 v5, v0, 16, 1
10903; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v0
10904; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
10905; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v0
10906; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
10907; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
10908; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v7, vcc
10909; GFX8-NEXT:    v_or_b32_sdwa v5, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
10910; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
10911; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10912; GFX8-NEXT:    buffer_wbinvl1
10913; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
10914; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10915; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10916; GFX8-NEXT:    s_cbranch_execnz .LBB38_1
10917; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
10918; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
10919; GFX8-NEXT:    s_setpc_b64 s[30:31]
10920;
10921; GFX7-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
10922; GFX7:       ; %bb.0:
10923; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10924; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fe, v0
10925; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
10926; GFX7-NEXT:    flat_load_dword v3, v[0:1]
10927; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
10928; GFX7-NEXT:    s_mov_b64 s[4:5], 0
10929; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
10930; GFX7-NEXT:  .LBB38_1: ; %atomicrmw.start
10931; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
10932; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10933; GFX7-NEXT:    v_mov_b32_e32 v4, v3
10934; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
10935; GFX7-NEXT:    v_sub_f32_e32 v3, v3, v2
10936; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
10937; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
10938; GFX7-NEXT:    v_or_b32_e32 v3, v5, v3
10939; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
10940; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10941; GFX7-NEXT:    buffer_wbinvl1
10942; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
10943; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10944; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10945; GFX7-NEXT:    s_cbranch_execnz .LBB38_1
10946; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
10947; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
10948; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
10949; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; Address 1023 bfloat elements past %ptr, subtract %val there atomically, and
  ; return the value the atomicrmw produces.
10950  %gep = getelementptr bfloat, ptr %ptr, i64 1023
10951  %result = atomicrmw fsub ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4
10952  ret bfloat %result
10953}
10954
10955define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr, bfloat %val) #0 {
10956; GFX12-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos:
10957; GFX12:       ; %bb.0:
10958; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
10959; GFX12-NEXT:    s_wait_expcnt 0x0
10960; GFX12-NEXT:    s_wait_samplecnt 0x0
10961; GFX12-NEXT:    s_wait_bvhcnt 0x0
10962; GFX12-NEXT:    s_wait_kmcnt 0x0
10963; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2046
10964; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
10965; GFX12-NEXT:    s_mov_b32 s0, 0
10966; GFX12-NEXT:  .LBB39_1: ; %atomicrmw.start
10967; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
10968; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
10969; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
10970; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10971; GFX12-NEXT:    v_sub_f32_e32 v2, v2, v4
10972; GFX12-NEXT:    v_bfe_u32 v5, v2, 16, 1
10973; GFX12-NEXT:    v_or_b32_e32 v6, 0x400000, v2
10974; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
10975; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
10976; GFX12-NEXT:    v_add3_u32 v5, v5, v2, 0x7fff
10977; GFX12-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc_lo
10978; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10979; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
10980; GFX12-NEXT:    v_and_or_b32 v2, 0xffff0000, v3, v2
10981; GFX12-NEXT:    s_wait_storecnt 0x0
10982; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
10983; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
10984; GFX12-NEXT:    global_inv scope:SCOPE_DEV
10985; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
10986; GFX12-NEXT:    v_mov_b32_e32 v3, v2
10987; GFX12-NEXT:    s_wait_alu 0xfffe
10988; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
10989; GFX12-NEXT:    s_wait_alu 0xfffe
10990; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
10991; GFX12-NEXT:    s_cbranch_execnz .LBB39_1
10992; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
10993; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
10994; GFX12-NEXT:    s_wait_alu 0xfffe
10995; GFX12-NEXT:    s_setpc_b64 s[30:31]
10996;
10997; GFX940-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos:
10998; GFX940:       ; %bb.0:
10999; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11000; GFX940-NEXT:    flat_load_dword v3, v[0:1] offset:2046
11001; GFX940-NEXT:    s_mov_b64 s[0:1], 0
11002; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
11003; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
11004; GFX940-NEXT:    s_mov_b32 s3, 0xffff0000
11005; GFX940-NEXT:  .LBB39_1: ; %atomicrmw.start
11006; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
11007; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11008; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
11009; GFX940-NEXT:    v_sub_f32_e32 v2, v2, v4
11010; GFX940-NEXT:    v_bfe_u32 v5, v2, 16, 1
11011; GFX940-NEXT:    v_or_b32_e32 v6, 0x400000, v2
11012; GFX940-NEXT:    v_add3_u32 v5, v5, v2, s2
11013; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
11014; GFX940-NEXT:    s_nop 1
11015; GFX940-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
11016; GFX940-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
11017; GFX940-NEXT:    v_and_or_b32 v2, v3, s3, v2
11018; GFX940-NEXT:    buffer_wbl2 sc1
11019; GFX940-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0
11020; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11021; GFX940-NEXT:    buffer_inv sc1
11022; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
11023; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
11024; GFX940-NEXT:    v_mov_b32_e32 v3, v2
11025; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
11026; GFX940-NEXT:    s_cbranch_execnz .LBB39_1
11027; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
11028; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
11029; GFX940-NEXT:    s_setpc_b64 s[30:31]
11030;
11031; GFX11-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos:
11032; GFX11:       ; %bb.0:
11033; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11034; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2046
11035; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
11036; GFX11-NEXT:    s_mov_b32 s0, 0
11037; GFX11-NEXT:    .p2align 6
11038; GFX11-NEXT:  .LBB39_1: ; %atomicrmw.start
11039; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
11040; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11041; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
11042; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11043; GFX11-NEXT:    v_sub_f32_e32 v2, v2, v4
11044; GFX11-NEXT:    v_bfe_u32 v5, v2, 16, 1
11045; GFX11-NEXT:    v_or_b32_e32 v6, 0x400000, v2
11046; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
11047; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
11048; GFX11-NEXT:    v_add3_u32 v5, v5, v2, 0x7fff
11049; GFX11-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc_lo
11050; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11051; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
11052; GFX11-NEXT:    v_and_or_b32 v2, 0xffff0000, v3, v2
11053; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
11054; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc
11055; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11056; GFX11-NEXT:    buffer_gl1_inv
11057; GFX11-NEXT:    buffer_gl0_inv
11058; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
11059; GFX11-NEXT:    v_mov_b32_e32 v3, v2
11060; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
11061; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
11062; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
11063; GFX11-NEXT:    s_cbranch_execnz .LBB39_1
11064; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
11065; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
11066; GFX11-NEXT:    s_setpc_b64 s[30:31]
11067;
11068; GFX10-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos:
11069; GFX10:       ; %bb.0:
11070; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11071; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fe, v0
11072; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
11073; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
11074; GFX10-NEXT:    s_mov_b32 s4, 0
11075; GFX10-NEXT:    flat_load_dword v3, v[0:1]
11076; GFX10-NEXT:  .LBB39_1: ; %atomicrmw.start
11077; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
11078; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11079; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
11080; GFX10-NEXT:    v_sub_f32_e32 v2, v2, v4
11081; GFX10-NEXT:    v_bfe_u32 v5, v2, 16, 1
11082; GFX10-NEXT:    v_or_b32_e32 v6, 0x400000, v2
11083; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
11084; GFX10-NEXT:    v_add3_u32 v5, v5, v2, 0x7fff
11085; GFX10-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc_lo
11086; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
11087; GFX10-NEXT:    v_and_or_b32 v2, 0xffff0000, v3, v2
11088; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
11089; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
11090; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11091; GFX10-NEXT:    buffer_gl1_inv
11092; GFX10-NEXT:    buffer_gl0_inv
11093; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
11094; GFX10-NEXT:    v_mov_b32_e32 v3, v2
11095; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
11096; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
11097; GFX10-NEXT:    s_cbranch_execnz .LBB39_1
11098; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
11099; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
11100; GFX10-NEXT:    s_setpc_b64 s[30:31]
11101;
11102; GFX90A-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos:
11103; GFX90A:       ; %bb.0:
11104; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11105; GFX90A-NEXT:    flat_load_dword v3, v[0:1] offset:2046
11106; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
11107; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
11108; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
11109; GFX90A-NEXT:    s_mov_b32 s7, 0xffff0000
11110; GFX90A-NEXT:  .LBB39_1: ; %atomicrmw.start
11111; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
11112; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11113; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
11114; GFX90A-NEXT:    v_sub_f32_e32 v2, v2, v4
11115; GFX90A-NEXT:    v_bfe_u32 v5, v2, 16, 1
11116; GFX90A-NEXT:    v_or_b32_e32 v6, 0x400000, v2
11117; GFX90A-NEXT:    v_add3_u32 v5, v5, v2, s6
11118; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
11119; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
11120; GFX90A-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
11121; GFX90A-NEXT:    v_and_or_b32 v2, v3, s7, v2
11122; GFX90A-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc
11123; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11124; GFX90A-NEXT:    buffer_wbinvl1
11125; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
11126; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11127; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
11128; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11129; GFX90A-NEXT:    s_cbranch_execnz .LBB39_1
11130; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
11131; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
11132; GFX90A-NEXT:    s_setpc_b64 s[30:31]
11133;
11134; GFX908-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos:
11135; GFX908:       ; %bb.0:
11136; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11137; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2046
11138; GFX908-NEXT:    s_mov_b64 s[4:5], 0
11139; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
11140; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
11141; GFX908-NEXT:    s_mov_b32 s7, 0xffff0000
11142; GFX908-NEXT:  .LBB39_1: ; %atomicrmw.start
11143; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
11144; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11145; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
11146; GFX908-NEXT:    v_sub_f32_e32 v2, v2, v4
11147; GFX908-NEXT:    v_bfe_u32 v5, v2, 16, 1
11148; GFX908-NEXT:    v_or_b32_e32 v6, 0x400000, v2
11149; GFX908-NEXT:    v_add3_u32 v5, v5, v2, s6
11150; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
11151; GFX908-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
11152; GFX908-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
11153; GFX908-NEXT:    v_and_or_b32 v2, v3, s7, v2
11154; GFX908-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc
11155; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11156; GFX908-NEXT:    buffer_wbinvl1
11157; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
11158; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11159; GFX908-NEXT:    v_mov_b32_e32 v3, v2
11160; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11161; GFX908-NEXT:    s_cbranch_execnz .LBB39_1
11162; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
11163; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
11164; GFX908-NEXT:    s_setpc_b64 s[30:31]
11165;
11166; GFX8-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos:
11167; GFX8:       ; %bb.0:
11168; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11169; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fe, v0
11170; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
11171; GFX8-NEXT:    flat_load_dword v3, v[0:1]
11172; GFX8-NEXT:    s_mov_b64 s[4:5], 0
11173; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
11174; GFX8-NEXT:  .LBB39_1: ; %atomicrmw.start
11175; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
11176; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11177; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
11178; GFX8-NEXT:    v_sub_f32_e32 v2, v2, v4
11179; GFX8-NEXT:    v_bfe_u32 v6, v2, 16, 1
11180; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v2
11181; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x7fff, v6
11182; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v2
11183; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
11184; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
11185; GFX8-NEXT:    v_cndmask_b32_e32 v2, v6, v7, vcc
11186; GFX8-NEXT:    v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
11187; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
11188; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11189; GFX8-NEXT:    buffer_wbinvl1
11190; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
11191; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11192; GFX8-NEXT:    v_mov_b32_e32 v3, v2
11193; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11194; GFX8-NEXT:    s_cbranch_execnz .LBB39_1
11195; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
11196; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
11197; GFX8-NEXT:    s_setpc_b64 s[30:31]
11198;
11199; GFX7-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos:
11200; GFX7:       ; %bb.0:
11201; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11202; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fe, v0
11203; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
11204; GFX7-NEXT:    flat_load_dword v3, v[0:1]
11205; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
11206; GFX7-NEXT:    s_mov_b64 s[4:5], 0
11207; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
11208; GFX7-NEXT:  .LBB39_1: ; %atomicrmw.start
11209; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
11210; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11211; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
11212; GFX7-NEXT:    v_sub_f32_e32 v2, v2, v4
11213; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
11214; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
11215; GFX7-NEXT:    v_or_b32_e32 v2, v5, v2
11216; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
11217; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11218; GFX7-NEXT:    buffer_wbinvl1
11219; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
11220; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11221; GFX7-NEXT:    v_mov_b32_e32 v3, v2
11222; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11223; GFX7-NEXT:    s_cbranch_execnz .LBB39_1
11224; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
11225; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
11226; GFX7-NEXT:    s_setpc_b64 s[30:31]
11227  %gep = getelementptr bfloat, ptr %ptr, i64 1023
11228  %unused = atomicrmw fsub ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4
11229  ret void
11230}
11231
; Test: seq_cst `atomicrmw fsub` on a bfloat located 1023 elements (0x7fe bytes)
; past %ptr, with no syncscope (i.e. the default, system scope), returning the
; value that was previously in memory.
; The atomicrmw carries no explicit `align`. In the expected output for every
; run line the address is rounded down with `and -4`, the low two address bits
; are turned into a bit shift (`and 3` then `<< 3`), and the update is done by
; a 32-bit flat_atomic_cmpswap loop (%atomicrmw.start .. %atomicrmw.end) on the
; containing dword; the result is shifted back down before returning.
; The assertions below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script rather than editing them by hand.
11232define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat %val) #0 {
11233; GFX12-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos:
11234; GFX12:       ; %bb.0:
11235; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
11236; GFX12-NEXT:    s_wait_expcnt 0x0
11237; GFX12-NEXT:    s_wait_samplecnt 0x0
11238; GFX12-NEXT:    s_wait_bvhcnt 0x0
11239; GFX12-NEXT:    s_wait_kmcnt 0x0
11240; GFX12-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
11241; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
11242; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
11243; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
11244; GFX12-NEXT:    v_and_b32_e32 v0, -4, v3
11245; GFX12-NEXT:    v_and_b32_e32 v3, 3, v3
11246; GFX12-NEXT:    s_mov_b32 s0, 0
11247; GFX12-NEXT:    flat_load_b32 v5, v[0:1]
11248; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
11249; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
11250; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
11251; GFX12-NEXT:    v_not_b32_e32 v4, v4
11252; GFX12-NEXT:  .LBB40_1: ; %atomicrmw.start
11253; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
11254; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
11255; GFX12-NEXT:    v_mov_b32_e32 v6, v5
11256; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11257; GFX12-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
11258; GFX12-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
11259; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11260; GFX12-NEXT:    v_sub_f32_e32 v5, v5, v2
11261; GFX12-NEXT:    v_bfe_u32 v7, v5, 16, 1
11262; GFX12-NEXT:    v_or_b32_e32 v8, 0x400000, v5
11263; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
11264; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
11265; GFX12-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
11266; GFX12-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc_lo
11267; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11268; GFX12-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
11269; GFX12-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
11270; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
11271; GFX12-NEXT:    v_and_or_b32 v5, v6, v4, v5
11272; GFX12-NEXT:    global_wb scope:SCOPE_SYS
11273; GFX12-NEXT:    s_wait_storecnt 0x0
11274; GFX12-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
11275; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
11276; GFX12-NEXT:    global_inv scope:SCOPE_SYS
11277; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
11278; GFX12-NEXT:    s_wait_alu 0xfffe
11279; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
11280; GFX12-NEXT:    s_wait_alu 0xfffe
11281; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
11282; GFX12-NEXT:    s_cbranch_execnz .LBB40_1
11283; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
11284; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
11285; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
11286; GFX12-NEXT:    s_wait_alu 0xfffe
11287; GFX12-NEXT:    s_setpc_b64 s[30:31]
11288;
11289; GFX940-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos:
11290; GFX940:       ; %bb.0:
11291; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11292; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
11293; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
11294; GFX940-NEXT:    v_and_b32_e32 v0, -4, v4
11295; GFX940-NEXT:    v_mov_b32_e32 v1, v5
11296; GFX940-NEXT:    flat_load_dword v5, v[0:1]
11297; GFX940-NEXT:    v_and_b32_e32 v3, 3, v4
11298; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
11299; GFX940-NEXT:    s_mov_b32 s0, 0xffff
11300; GFX940-NEXT:    v_lshlrev_b32_e64 v4, v3, s0
11301; GFX940-NEXT:    v_not_b32_e32 v4, v4
11302; GFX940-NEXT:    s_mov_b64 s[0:1], 0
11303; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
11304; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
11305; GFX940-NEXT:  .LBB40_1: ; %atomicrmw.start
11306; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
11307; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11308; GFX940-NEXT:    v_mov_b32_e32 v7, v5
11309; GFX940-NEXT:    v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
11310; GFX940-NEXT:    s_nop 0
11311; GFX940-NEXT:    v_sub_f32_e32 v5, v5, v2
11312; GFX940-NEXT:    v_bfe_u32 v6, v5, 16, 1
11313; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v5
11314; GFX940-NEXT:    v_add3_u32 v6, v6, v5, s2
11315; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
11316; GFX940-NEXT:    s_nop 1
11317; GFX940-NEXT:    v_cndmask_b32_e32 v5, v6, v8, vcc
11318; GFX940-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
11319; GFX940-NEXT:    v_and_or_b32 v6, v7, v4, v5
11320; GFX940-NEXT:    buffer_wbl2 sc0 sc1
11321; GFX940-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 sc1
11322; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11323; GFX940-NEXT:    buffer_inv sc0 sc1
11324; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
11325; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
11326; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
11327; GFX940-NEXT:    s_cbranch_execnz .LBB40_1
11328; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
11329; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
11330; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
11331; GFX940-NEXT:    s_setpc_b64 s[30:31]
11332;
11333; GFX11-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos:
11334; GFX11:       ; %bb.0:
11335; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11336; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
11337; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
11338; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
11339; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
11340; GFX11-NEXT:    v_and_b32_e32 v0, -4, v3
11341; GFX11-NEXT:    v_and_b32_e32 v3, 3, v3
11342; GFX11-NEXT:    s_mov_b32 s0, 0
11343; GFX11-NEXT:    flat_load_b32 v5, v[0:1]
11344; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
11345; GFX11-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
11346; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
11347; GFX11-NEXT:    v_not_b32_e32 v4, v4
11348; GFX11-NEXT:    .p2align 6
11349; GFX11-NEXT:  .LBB40_1: ; %atomicrmw.start
11350; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
11351; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11352; GFX11-NEXT:    v_mov_b32_e32 v6, v5
11353; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11354; GFX11-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
11355; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
11356; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11357; GFX11-NEXT:    v_sub_f32_e32 v5, v5, v2
11358; GFX11-NEXT:    v_bfe_u32 v7, v5, 16, 1
11359; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v5
11360; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
11361; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
11362; GFX11-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
11363; GFX11-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc_lo
11364; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11365; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
11366; GFX11-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
11367; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
11368; GFX11-NEXT:    v_and_or_b32 v5, v6, v4, v5
11369; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
11370; GFX11-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
11371; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11372; GFX11-NEXT:    buffer_gl1_inv
11373; GFX11-NEXT:    buffer_gl0_inv
11374; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
11375; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
11376; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
11377; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
11378; GFX11-NEXT:    s_cbranch_execnz .LBB40_1
11379; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
11380; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
11381; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
11382; GFX11-NEXT:    s_setpc_b64 s[30:31]
11383;
11384; GFX10-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos:
11385; GFX10:       ; %bb.0:
11386; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11387; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
11388; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
11389; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
11390; GFX10-NEXT:    v_and_b32_e32 v0, -4, v3
11391; GFX10-NEXT:    v_and_b32_e32 v3, 3, v3
11392; GFX10-NEXT:    s_mov_b32 s4, 0
11393; GFX10-NEXT:    flat_load_dword v5, v[0:1]
11394; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
11395; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
11396; GFX10-NEXT:    v_not_b32_e32 v4, v4
11397; GFX10-NEXT:  .LBB40_1: ; %atomicrmw.start
11398; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
11399; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11400; GFX10-NEXT:    v_mov_b32_e32 v6, v5
11401; GFX10-NEXT:    v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
11402; GFX10-NEXT:    v_sub_f32_e32 v5, v5, v2
11403; GFX10-NEXT:    v_bfe_u32 v7, v5, 16, 1
11404; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v5
11405; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
11406; GFX10-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
11407; GFX10-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc_lo
11408; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
11409; GFX10-NEXT:    v_and_or_b32 v5, v6, v4, v5
11410; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
11411; GFX10-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
11412; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11413; GFX10-NEXT:    buffer_gl1_inv
11414; GFX10-NEXT:    buffer_gl0_inv
11415; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
11416; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
11417; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
11418; GFX10-NEXT:    s_cbranch_execnz .LBB40_1
11419; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
11420; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
11421; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
11422; GFX10-NEXT:    s_setpc_b64 s[30:31]
11423;
11424; GFX90A-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos:
11425; GFX90A:       ; %bb.0:
11426; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11427; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
11428; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
11429; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v3
11430; GFX90A-NEXT:    flat_load_dword v5, v[0:1]
11431; GFX90A-NEXT:    v_and_b32_e32 v3, 3, v3
11432; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
11433; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
11434; GFX90A-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
11435; GFX90A-NEXT:    v_not_b32_e32 v4, v4
11436; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
11437; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
11438; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
11439; GFX90A-NEXT:  .LBB40_1: ; %atomicrmw.start
11440; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
11441; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11442; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
11443; GFX90A-NEXT:    v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
11444; GFX90A-NEXT:    v_sub_f32_e32 v5, v5, v2
11445; GFX90A-NEXT:    v_bfe_u32 v6, v5, 16, 1
11446; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v5
11447; GFX90A-NEXT:    v_add3_u32 v6, v6, v5, s6
11448; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
11449; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v6, v8, vcc
11450; GFX90A-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
11451; GFX90A-NEXT:    v_and_or_b32 v6, v7, v4, v5
11452; GFX90A-NEXT:    buffer_wbl2
11453; GFX90A-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
11454; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11455; GFX90A-NEXT:    buffer_invl2
11456; GFX90A-NEXT:    buffer_wbinvl1
11457; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
11458; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11459; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11460; GFX90A-NEXT:    s_cbranch_execnz .LBB40_1
11461; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
11462; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
11463; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
11464; GFX90A-NEXT:    s_setpc_b64 s[30:31]
11465;
11466; GFX908-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos:
11467; GFX908:       ; %bb.0:
11468; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11469; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
11470; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
11471; GFX908-NEXT:    v_and_b32_e32 v0, -4, v3
11472; GFX908-NEXT:    flat_load_dword v5, v[0:1]
11473; GFX908-NEXT:    v_and_b32_e32 v3, 3, v3
11474; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
11475; GFX908-NEXT:    s_mov_b32 s4, 0xffff
11476; GFX908-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
11477; GFX908-NEXT:    v_not_b32_e32 v4, v4
11478; GFX908-NEXT:    s_mov_b64 s[4:5], 0
11479; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
11480; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
11481; GFX908-NEXT:  .LBB40_1: ; %atomicrmw.start
11482; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
11483; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11484; GFX908-NEXT:    v_mov_b32_e32 v6, v5
11485; GFX908-NEXT:    v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
11486; GFX908-NEXT:    v_sub_f32_e32 v5, v5, v2
11487; GFX908-NEXT:    v_bfe_u32 v7, v5, 16, 1
11488; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v5
11489; GFX908-NEXT:    v_add3_u32 v7, v7, v5, s6
11490; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
11491; GFX908-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc
11492; GFX908-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
11493; GFX908-NEXT:    v_and_or_b32 v5, v6, v4, v5
11494; GFX908-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
11495; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11496; GFX908-NEXT:    buffer_wbinvl1
11497; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
11498; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11499; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11500; GFX908-NEXT:    s_cbranch_execnz .LBB40_1
11501; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
11502; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
11503; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
11504; GFX908-NEXT:    s_setpc_b64 s[30:31]
11505;
11506; GFX8-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos:
11507; GFX8:       ; %bb.0:
11508; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11509; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
11510; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
11511; GFX8-NEXT:    v_and_b32_e32 v0, -4, v3
11512; GFX8-NEXT:    flat_load_dword v5, v[0:1]
11513; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
11514; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
11515; GFX8-NEXT:    s_mov_b32 s4, 0xffff
11516; GFX8-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
11517; GFX8-NEXT:    v_not_b32_e32 v4, v4
11518; GFX8-NEXT:    s_mov_b64 s[4:5], 0
11519; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
11520; GFX8-NEXT:  .LBB40_1: ; %atomicrmw.start
11521; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
11522; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11523; GFX8-NEXT:    v_mov_b32_e32 v6, v5
11524; GFX8-NEXT:    v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
11525; GFX8-NEXT:    v_sub_f32_e32 v5, v5, v2
11526; GFX8-NEXT:    v_bfe_u32 v8, v5, 16, 1
11527; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v5
11528; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x7fff, v8
11529; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v5
11530; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
11531; GFX8-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
11532; GFX8-NEXT:    v_and_b32_e32 v7, v6, v4
11533; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
11534; GFX8-NEXT:    v_or_b32_e32 v5, v7, v5
11535; GFX8-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
11536; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11537; GFX8-NEXT:    buffer_wbinvl1
11538; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
11539; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11540; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11541; GFX8-NEXT:    s_cbranch_execnz .LBB40_1
11542; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
11543; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
11544; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
11545; GFX8-NEXT:    s_setpc_b64 s[30:31]
11546;
11547; GFX7-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos:
11548; GFX7:       ; %bb.0:
11549; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11550; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 0x7fe, v0
11551; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
11552; GFX7-NEXT:    v_and_b32_e32 v0, -4, v3
11553; GFX7-NEXT:    flat_load_dword v5, v[0:1]
11554; GFX7-NEXT:    v_and_b32_e32 v3, 3, v3
11555; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
11556; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v3
11557; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
11558; GFX7-NEXT:    v_not_b32_e32 v4, v4
11559; GFX7-NEXT:    s_mov_b64 s[4:5], 0
11560; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
11561; GFX7-NEXT:  .LBB40_1: ; %atomicrmw.start
11562; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
11563; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11564; GFX7-NEXT:    v_mov_b32_e32 v6, v5
11565; GFX7-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
11566; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
11567; GFX7-NEXT:    v_sub_f32_e32 v5, v5, v2
11568; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
11569; GFX7-NEXT:    v_and_b32_e32 v7, v6, v4
11570; GFX7-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
11571; GFX7-NEXT:    v_or_b32_e32 v5, v7, v5
11572; GFX7-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
11573; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11574; GFX7-NEXT:    buffer_wbinvl1
11575; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
11576; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11577; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11578; GFX7-NEXT:    s_cbranch_execnz .LBB40_1
11579; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
11580; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
11581; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
11582; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
11583; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; Element index 1023 of a 2-byte bfloat is byte offset 2046 (0x7fe), the
  ; immediate that the expected code above adds to the incoming pointer.
11584  %gep = getelementptr bfloat, ptr %ptr, i64 1023
  ; No syncscope and no `align` operand: system scope, seq_cst ordering.
  ; %result is the bfloat that was in memory before %val was subtracted.
11585  %result = atomicrmw fsub ptr %gep, bfloat %val seq_cst
11586  ret bfloat %result
11587}
11588
11589define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat %val) #0 {
11590; GFX12-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos:
11591; GFX12:       ; %bb.0:
11592; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
11593; GFX12-NEXT:    s_wait_expcnt 0x0
11594; GFX12-NEXT:    s_wait_samplecnt 0x0
11595; GFX12-NEXT:    s_wait_bvhcnt 0x0
11596; GFX12-NEXT:    s_wait_kmcnt 0x0
11597; GFX12-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
11598; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
11599; GFX12-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
11600; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
11601; GFX12-NEXT:    v_and_b32_e32 v0, -4, v4
11602; GFX12-NEXT:    v_and_b32_e32 v4, 3, v4
11603; GFX12-NEXT:    s_mov_b32 s0, 0
11604; GFX12-NEXT:    flat_load_b32 v3, v[0:1]
11605; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
11606; GFX12-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
11607; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
11608; GFX12-NEXT:    v_not_b32_e32 v5, v5
11609; GFX12-NEXT:  .LBB41_1: ; %atomicrmw.start
11610; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
11611; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
11612; GFX12-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
11613; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11614; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
11615; GFX12-NEXT:    v_sub_f32_e32 v2, v2, v6
11616; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
11617; GFX12-NEXT:    v_bfe_u32 v7, v2, 16, 1
11618; GFX12-NEXT:    v_or_b32_e32 v8, 0x400000, v2
11619; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
11620; GFX12-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
11621; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11622; GFX12-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc_lo
11623; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
11624; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11625; GFX12-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
11626; GFX12-NEXT:    v_and_or_b32 v2, v3, v5, v2
11627; GFX12-NEXT:    global_wb scope:SCOPE_SYS
11628; GFX12-NEXT:    s_wait_storecnt 0x0
11629; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
11630; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
11631; GFX12-NEXT:    global_inv scope:SCOPE_SYS
11632; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
11633; GFX12-NEXT:    v_mov_b32_e32 v3, v2
11634; GFX12-NEXT:    s_wait_alu 0xfffe
11635; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
11636; GFX12-NEXT:    s_wait_alu 0xfffe
11637; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
11638; GFX12-NEXT:    s_cbranch_execnz .LBB41_1
11639; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
11640; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
11641; GFX12-NEXT:    s_wait_alu 0xfffe
11642; GFX12-NEXT:    s_setpc_b64 s[30:31]
11643;
11644; GFX940-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos:
11645; GFX940:       ; %bb.0:
11646; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11647; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
11648; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
11649; GFX940-NEXT:    v_and_b32_e32 v0, -4, v4
11650; GFX940-NEXT:    v_mov_b32_e32 v1, v5
11651; GFX940-NEXT:    flat_load_dword v3, v[0:1]
11652; GFX940-NEXT:    v_and_b32_e32 v4, 3, v4
11653; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
11654; GFX940-NEXT:    s_mov_b32 s0, 0xffff
11655; GFX940-NEXT:    v_lshlrev_b32_e64 v5, v4, s0
11656; GFX940-NEXT:    v_not_b32_e32 v5, v5
11657; GFX940-NEXT:    s_mov_b64 s[0:1], 0
11658; GFX940-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
11659; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
11660; GFX940-NEXT:  .LBB41_1: ; %atomicrmw.start
11661; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
11662; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11663; GFX940-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
11664; GFX940-NEXT:    s_nop 0
11665; GFX940-NEXT:    v_sub_f32_e32 v2, v2, v6
11666; GFX940-NEXT:    v_bfe_u32 v7, v2, 16, 1
11667; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v2
11668; GFX940-NEXT:    v_add3_u32 v7, v7, v2, s2
11669; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
11670; GFX940-NEXT:    s_nop 1
11671; GFX940-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc
11672; GFX940-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
11673; GFX940-NEXT:    v_and_or_b32 v2, v3, v5, v2
11674; GFX940-NEXT:    buffer_wbl2 sc0 sc1
11675; GFX940-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1
11676; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11677; GFX940-NEXT:    buffer_inv sc0 sc1
11678; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
11679; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
11680; GFX940-NEXT:    v_mov_b32_e32 v3, v2
11681; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
11682; GFX940-NEXT:    s_cbranch_execnz .LBB41_1
11683; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
11684; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
11685; GFX940-NEXT:    s_setpc_b64 s[30:31]
11686;
11687; GFX11-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos:
11688; GFX11:       ; %bb.0:
11689; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11690; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
11691; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
11692; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
11693; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
11694; GFX11-NEXT:    v_and_b32_e32 v0, -4, v4
11695; GFX11-NEXT:    v_and_b32_e32 v4, 3, v4
11696; GFX11-NEXT:    s_mov_b32 s0, 0
11697; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
11698; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
11699; GFX11-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
11700; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
11701; GFX11-NEXT:    v_not_b32_e32 v5, v5
11702; GFX11-NEXT:    .p2align 6
11703; GFX11-NEXT:  .LBB41_1: ; %atomicrmw.start
11704; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
11705; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11706; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
11707; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11708; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
11709; GFX11-NEXT:    v_sub_f32_e32 v2, v2, v6
11710; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
11711; GFX11-NEXT:    v_bfe_u32 v7, v2, 16, 1
11712; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v2
11713; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
11714; GFX11-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
11715; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11716; GFX11-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc_lo
11717; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
11718; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11719; GFX11-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
11720; GFX11-NEXT:    v_and_or_b32 v2, v3, v5, v2
11721; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
11722; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
11723; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11724; GFX11-NEXT:    buffer_gl1_inv
11725; GFX11-NEXT:    buffer_gl0_inv
11726; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
11727; GFX11-NEXT:    v_mov_b32_e32 v3, v2
11728; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
11729; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
11730; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
11731; GFX11-NEXT:    s_cbranch_execnz .LBB41_1
11732; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
11733; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
11734; GFX11-NEXT:    s_setpc_b64 s[30:31]
11735;
11736; GFX10-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos:
11737; GFX10:       ; %bb.0:
11738; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11739; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
11740; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
11741; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
11742; GFX10-NEXT:    v_and_b32_e32 v0, -4, v4
11743; GFX10-NEXT:    v_and_b32_e32 v4, 3, v4
11744; GFX10-NEXT:    s_mov_b32 s4, 0
11745; GFX10-NEXT:    flat_load_dword v3, v[0:1]
11746; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
11747; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
11748; GFX10-NEXT:    v_not_b32_e32 v5, v5
11749; GFX10-NEXT:  .LBB41_1: ; %atomicrmw.start
11750; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
11751; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11752; GFX10-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
11753; GFX10-NEXT:    v_sub_f32_e32 v2, v2, v6
11754; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
11755; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v2
11756; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
11757; GFX10-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
11758; GFX10-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc_lo
11759; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
11760; GFX10-NEXT:    v_and_or_b32 v2, v3, v5, v2
11761; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
11762; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
11763; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11764; GFX10-NEXT:    buffer_gl1_inv
11765; GFX10-NEXT:    buffer_gl0_inv
11766; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
11767; GFX10-NEXT:    v_mov_b32_e32 v3, v2
11768; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
11769; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
11770; GFX10-NEXT:    s_cbranch_execnz .LBB41_1
11771; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
11772; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
11773; GFX10-NEXT:    s_setpc_b64 s[30:31]
11774;
11775; GFX90A-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos:
11776; GFX90A:       ; %bb.0:
11777; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11778; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
11779; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
11780; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v4
11781; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
11782; GFX90A-NEXT:    v_and_b32_e32 v4, 3, v4
11783; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
11784; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
11785; GFX90A-NEXT:    v_lshlrev_b32_e64 v5, v4, s4
11786; GFX90A-NEXT:    v_not_b32_e32 v5, v5
11787; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
11788; GFX90A-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
11789; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
11790; GFX90A-NEXT:  .LBB41_1: ; %atomicrmw.start
11791; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
11792; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11793; GFX90A-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
11794; GFX90A-NEXT:    v_sub_f32_e32 v2, v2, v6
11795; GFX90A-NEXT:    v_bfe_u32 v7, v2, 16, 1
11796; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v2
11797; GFX90A-NEXT:    v_add3_u32 v7, v7, v2, s6
11798; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
11799; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc
11800; GFX90A-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
11801; GFX90A-NEXT:    v_and_or_b32 v2, v3, v5, v2
11802; GFX90A-NEXT:    buffer_wbl2
11803; GFX90A-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
11804; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11805; GFX90A-NEXT:    buffer_invl2
11806; GFX90A-NEXT:    buffer_wbinvl1
11807; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
11808; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11809; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
11810; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11811; GFX90A-NEXT:    s_cbranch_execnz .LBB41_1
11812; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
11813; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
11814; GFX90A-NEXT:    s_setpc_b64 s[30:31]
11815;
11816; GFX908-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos:
11817; GFX908:       ; %bb.0:
11818; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11819; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
11820; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
11821; GFX908-NEXT:    v_and_b32_e32 v0, -4, v4
11822; GFX908-NEXT:    flat_load_dword v3, v[0:1]
11823; GFX908-NEXT:    v_and_b32_e32 v4, 3, v4
11824; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
11825; GFX908-NEXT:    s_mov_b32 s4, 0xffff
11826; GFX908-NEXT:    v_lshlrev_b32_e64 v5, v4, s4
11827; GFX908-NEXT:    v_not_b32_e32 v5, v5
11828; GFX908-NEXT:    s_mov_b64 s[4:5], 0
11829; GFX908-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
11830; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
11831; GFX908-NEXT:  .LBB41_1: ; %atomicrmw.start
11832; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
11833; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11834; GFX908-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
11835; GFX908-NEXT:    v_sub_f32_e32 v2, v2, v6
11836; GFX908-NEXT:    v_bfe_u32 v7, v2, 16, 1
11837; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v2
11838; GFX908-NEXT:    v_add3_u32 v7, v7, v2, s6
11839; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
11840; GFX908-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc
11841; GFX908-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
11842; GFX908-NEXT:    v_and_or_b32 v2, v3, v5, v2
11843; GFX908-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
11844; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11845; GFX908-NEXT:    buffer_wbinvl1
11846; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
11847; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11848; GFX908-NEXT:    v_mov_b32_e32 v3, v2
11849; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11850; GFX908-NEXT:    s_cbranch_execnz .LBB41_1
11851; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
11852; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
11853; GFX908-NEXT:    s_setpc_b64 s[30:31]
11854;
11855; GFX8-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos:
11856; GFX8:       ; %bb.0:
11857; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11858; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fe, v0
11859; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
11860; GFX8-NEXT:    v_and_b32_e32 v0, -4, v4
11861; GFX8-NEXT:    flat_load_dword v3, v[0:1]
11862; GFX8-NEXT:    v_and_b32_e32 v4, 3, v4
11863; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
11864; GFX8-NEXT:    s_mov_b32 s4, 0xffff
11865; GFX8-NEXT:    v_lshlrev_b32_e64 v5, v4, s4
11866; GFX8-NEXT:    v_not_b32_e32 v5, v5
11867; GFX8-NEXT:    s_mov_b64 s[4:5], 0
11868; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
11869; GFX8-NEXT:  .LBB41_1: ; %atomicrmw.start
11870; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
11871; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11872; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
11873; GFX8-NEXT:    v_sub_f32_e32 v2, v2, v6
11874; GFX8-NEXT:    v_bfe_u32 v8, v2, 16, 1
11875; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v2
11876; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x7fff, v8
11877; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v2
11878; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
11879; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v9, vcc
11880; GFX8-NEXT:    v_and_b32_e32 v7, v3, v5
11881; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
11882; GFX8-NEXT:    v_or_b32_e32 v2, v7, v2
11883; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
11884; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11885; GFX8-NEXT:    buffer_wbinvl1
11886; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
11887; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11888; GFX8-NEXT:    v_mov_b32_e32 v3, v2
11889; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11890; GFX8-NEXT:    s_cbranch_execnz .LBB41_1
11891; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
11892; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
11893; GFX8-NEXT:    s_setpc_b64 s[30:31]
11894;
11895; GFX7-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos:
11896; GFX7:       ; %bb.0:
11897; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11898; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
11899; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
11900; GFX7-NEXT:    v_and_b32_e32 v0, -4, v4
11901; GFX7-NEXT:    flat_load_dword v3, v[0:1]
11902; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
11903; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
11904; GFX7-NEXT:    v_lshl_b32_e32 v5, 0xffff, v4
11905; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
11906; GFX7-NEXT:    v_not_b32_e32 v5, v5
11907; GFX7-NEXT:    s_mov_b64 s[4:5], 0
11908; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
11909; GFX7-NEXT:  .LBB41_1: ; %atomicrmw.start
11910; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
11911; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11912; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
11913; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
11914; GFX7-NEXT:    v_sub_f32_e32 v2, v2, v6
11915; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
11916; GFX7-NEXT:    v_and_b32_e32 v7, v3, v5
11917; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
11918; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
11919; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
11920; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11921; GFX7-NEXT:    buffer_wbinvl1
11922; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
11923; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11924; GFX7-NEXT:    v_mov_b32_e32 v3, v2
11925; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11926; GFX7-NEXT:    s_cbranch_execnz .LBB41_1
11927; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
11928; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
11929; GFX7-NEXT:    s_setpc_b64 s[30:31]
11930  %gep = getelementptr bfloat, ptr %ptr, i64 1023
11931  %unused = atomicrmw fsub ptr %gep, bfloat %val seq_cst
11932  ret void
11933}
11934
11935; --------------------------------------------------------------------
11936; <2 x half>
11937; --------------------------------------------------------------------
11938
; Returning atomicrmw fsub on a <2 x half> through a flat pointer with agent
; syncscope and seq_cst ordering, at no address offset.
; For every run line the checks show a compare-and-swap retry loop
; (%atomicrmw.start around flat_atomic_cmpswap) rather than a single atomic op.
; The subtraction inside the loop differs per target in the checks below:
;   - GFX12, GFX940, GFX11, GFX10, GFX90A and GFX908 use v_pk_add_f16 with
;     neg_lo/neg_hi set on the second source operand.
;   - GFX8 uses two v_sub_f16 (one SDWA on the high halves) and repacks the
;     result with a shift and an or.
;   - GFX7 converts each half to f32, uses v_sub_f32 and converts back; its
;     checks write both v0 and v1 before returning.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
11939define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) #0 {
11940; GFX12-LABEL: flat_agent_atomic_fsub_ret_v2f16:
11941; GFX12:       ; %bb.0:
11942; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
11943; GFX12-NEXT:    s_wait_expcnt 0x0
11944; GFX12-NEXT:    s_wait_samplecnt 0x0
11945; GFX12-NEXT:    s_wait_bvhcnt 0x0
11946; GFX12-NEXT:    s_wait_kmcnt 0x0
11947; GFX12-NEXT:    flat_load_b32 v3, v[0:1]
11948; GFX12-NEXT:    s_mov_b32 s0, 0
11949; GFX12-NEXT:  .LBB42_1: ; %atomicrmw.start
11950; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
11951; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
11952; GFX12-NEXT:    v_mov_b32_e32 v4, v3
11953; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
11954; GFX12-NEXT:    v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
11955; GFX12-NEXT:    s_wait_storecnt 0x0
11956; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
11957; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
11958; GFX12-NEXT:    global_inv scope:SCOPE_DEV
11959; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
11960; GFX12-NEXT:    s_wait_alu 0xfffe
11961; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
11962; GFX12-NEXT:    s_wait_alu 0xfffe
11963; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
11964; GFX12-NEXT:    s_cbranch_execnz .LBB42_1
11965; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
11966; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
11967; GFX12-NEXT:    v_mov_b32_e32 v0, v3
11968; GFX12-NEXT:    s_wait_alu 0xfffe
11969; GFX12-NEXT:    s_setpc_b64 s[30:31]
11970;
11971; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2f16:
11972; GFX940:       ; %bb.0:
11973; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11974; GFX940-NEXT:    flat_load_dword v3, v[0:1]
11975; GFX940-NEXT:    s_mov_b64 s[0:1], 0
11976; GFX940-NEXT:  .LBB42_1: ; %atomicrmw.start
11977; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
11978; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11979; GFX940-NEXT:    v_mov_b32_e32 v5, v3
11980; GFX940-NEXT:    v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1]
11981; GFX940-NEXT:    buffer_wbl2 sc1
11982; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
11983; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11984; GFX940-NEXT:    buffer_inv sc1
11985; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
11986; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
11987; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
11988; GFX940-NEXT:    s_cbranch_execnz .LBB42_1
11989; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
11990; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
11991; GFX940-NEXT:    v_mov_b32_e32 v0, v3
11992; GFX940-NEXT:    s_setpc_b64 s[30:31]
11993;
11994; GFX11-LABEL: flat_agent_atomic_fsub_ret_v2f16:
11995; GFX11:       ; %bb.0:
11996; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11997; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
11998; GFX11-NEXT:    s_mov_b32 s0, 0
11999; GFX11-NEXT:  .LBB42_1: ; %atomicrmw.start
12000; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
12001; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12002; GFX11-NEXT:    v_mov_b32_e32 v4, v3
12003; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
12004; GFX11-NEXT:    v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
12005; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
12006; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
12007; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12008; GFX11-NEXT:    buffer_gl1_inv
12009; GFX11-NEXT:    buffer_gl0_inv
12010; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
12011; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
12012; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
12013; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
12014; GFX11-NEXT:    s_cbranch_execnz .LBB42_1
12015; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
12016; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
12017; GFX11-NEXT:    v_mov_b32_e32 v0, v3
12018; GFX11-NEXT:    s_setpc_b64 s[30:31]
12019;
12020; GFX10-LABEL: flat_agent_atomic_fsub_ret_v2f16:
12021; GFX10:       ; %bb.0:
12022; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12023; GFX10-NEXT:    flat_load_dword v3, v[0:1]
12024; GFX10-NEXT:    s_mov_b32 s4, 0
12025; GFX10-NEXT:  .LBB42_1: ; %atomicrmw.start
12026; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
12027; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12028; GFX10-NEXT:    v_mov_b32_e32 v4, v3
12029; GFX10-NEXT:    v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
12030; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
12031; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
12032; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12033; GFX10-NEXT:    buffer_gl1_inv
12034; GFX10-NEXT:    buffer_gl0_inv
12035; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
12036; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
12037; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
12038; GFX10-NEXT:    s_cbranch_execnz .LBB42_1
12039; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
12040; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
12041; GFX10-NEXT:    v_mov_b32_e32 v0, v3
12042; GFX10-NEXT:    s_setpc_b64 s[30:31]
12043;
12044; GFX90A-LABEL: flat_agent_atomic_fsub_ret_v2f16:
12045; GFX90A:       ; %bb.0:
12046; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12047; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
12048; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
12049; GFX90A-NEXT:  .LBB42_1: ; %atomicrmw.start
12050; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
12051; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12052; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
12053; GFX90A-NEXT:    v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1]
12054; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
12055; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12056; GFX90A-NEXT:    buffer_wbinvl1
12057; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
12058; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12059; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12060; GFX90A-NEXT:    s_cbranch_execnz .LBB42_1
12061; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
12062; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
12063; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
12064; GFX90A-NEXT:    s_setpc_b64 s[30:31]
12065;
12066; GFX908-LABEL: flat_agent_atomic_fsub_ret_v2f16:
12067; GFX908:       ; %bb.0:
12068; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12069; GFX908-NEXT:    flat_load_dword v3, v[0:1]
12070; GFX908-NEXT:    s_mov_b64 s[4:5], 0
12071; GFX908-NEXT:  .LBB42_1: ; %atomicrmw.start
12072; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
12073; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12074; GFX908-NEXT:    v_mov_b32_e32 v4, v3
12075; GFX908-NEXT:    v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
12076; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
12077; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12078; GFX908-NEXT:    buffer_wbinvl1
12079; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
12080; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12081; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12082; GFX908-NEXT:    s_cbranch_execnz .LBB42_1
12083; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
12084; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
12085; GFX908-NEXT:    v_mov_b32_e32 v0, v3
12086; GFX908-NEXT:    s_setpc_b64 s[30:31]
12087;
12088; GFX8-LABEL: flat_agent_atomic_fsub_ret_v2f16:
12089; GFX8:       ; %bb.0:
12090; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12091; GFX8-NEXT:    flat_load_dword v3, v[0:1]
12092; GFX8-NEXT:    s_mov_b64 s[4:5], 0
12093; GFX8-NEXT:  .LBB42_1: ; %atomicrmw.start
12094; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
12095; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12096; GFX8-NEXT:    v_mov_b32_e32 v4, v3
12097; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
12098; GFX8-NEXT:    v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
12099; GFX8-NEXT:    v_sub_f16_e32 v5, v4, v2
12100; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
12101; GFX8-NEXT:    v_or_b32_e32 v3, v5, v3
12102; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
12103; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12104; GFX8-NEXT:    buffer_wbinvl1
12105; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
12106; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12107; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12108; GFX8-NEXT:    s_cbranch_execnz .LBB42_1
12109; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
12110; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
12111; GFX8-NEXT:    v_mov_b32_e32 v0, v3
12112; GFX8-NEXT:    s_setpc_b64 s[30:31]
12113;
12114; GFX7-LABEL: flat_agent_atomic_fsub_ret_v2f16:
12115; GFX7:       ; %bb.0:
12116; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12117; GFX7-NEXT:    flat_load_dword v5, v[0:1]
12118; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
12119; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v2
12120; GFX7-NEXT:    s_mov_b64 s[4:5], 0
12121; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v3
12122; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12123; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
12124; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v5
12125; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
12126; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v6
12127; GFX7-NEXT:  .LBB42_1: ; %atomicrmw.start
12128; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
12129; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
12130; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
12131; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v3
12132; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v2
12133; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
12134; GFX7-NEXT:    v_sub_f32_e32 v6, v6, v4
12135; GFX7-NEXT:    v_sub_f32_e32 v7, v7, v5
12136; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
12137; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v7
12138; GFX7-NEXT:    v_or_b32_e32 v7, v2, v3
12139; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
12140; GFX7-NEXT:    v_or_b32_e32 v6, v8, v2
12141; GFX7-NEXT:    flat_atomic_cmpswap v6, v[0:1], v[6:7] glc
12142; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12143; GFX7-NEXT:    buffer_wbinvl1
12144; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v6
12145; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v6
12146; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
12147; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v7
12148; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12149; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12150; GFX7-NEXT:    s_cbranch_execnz .LBB42_1
12151; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
12152; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
12153; GFX7-NEXT:    v_mov_b32_e32 v0, v2
12154; GFX7-NEXT:    v_mov_b32_e32 v1, v3
12155; GFX7-NEXT:    s_setpc_b64 s[30:31]
; IR under test: the atomic operates directly on %ptr and its old value is returned.
12156  %result = atomicrmw fsub ptr %ptr, <2 x half> %val syncscope("agent") seq_cst
12157  ret <2 x half> %result
12158}
12159
; Returning atomicrmw fsub on a <2 x half> through a flat pointer with agent
; syncscope and seq_cst ordering, addressed through a GEP of 511 <2 x half>
; elements. The checks show this as a byte offset of 2044 (0x7fc).
; How that offset is materialised differs per target in the checks below:
;   - GFX12, GFX940, GFX11, GFX90A and GFX908 carry it as offset:2044 on both
;     the initial flat load and the flat_atomic_cmpswap.
;   - GFX10, GFX8 and GFX7 add 0x7fc to the 64-bit pointer (add plus
;     add-with-carry) before the loop and use no instruction offset.
; In every case the operation is a compare-and-swap retry loop
; (%atomicrmw.start around flat_atomic_cmpswap).
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
12160define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 x half> %val) #0 {
12161; GFX12-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_pos:
12162; GFX12:       ; %bb.0:
12163; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
12164; GFX12-NEXT:    s_wait_expcnt 0x0
12165; GFX12-NEXT:    s_wait_samplecnt 0x0
12166; GFX12-NEXT:    s_wait_bvhcnt 0x0
12167; GFX12-NEXT:    s_wait_kmcnt 0x0
12168; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
12169; GFX12-NEXT:    s_mov_b32 s0, 0
12170; GFX12-NEXT:  .LBB43_1: ; %atomicrmw.start
12171; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
12172; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
12173; GFX12-NEXT:    v_mov_b32_e32 v4, v3
12174; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
12175; GFX12-NEXT:    v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
12176; GFX12-NEXT:    s_wait_storecnt 0x0
12177; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
12178; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
12179; GFX12-NEXT:    global_inv scope:SCOPE_DEV
12180; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
12181; GFX12-NEXT:    s_wait_alu 0xfffe
12182; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
12183; GFX12-NEXT:    s_wait_alu 0xfffe
12184; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
12185; GFX12-NEXT:    s_cbranch_execnz .LBB43_1
12186; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
12187; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
12188; GFX12-NEXT:    v_mov_b32_e32 v0, v3
12189; GFX12-NEXT:    s_wait_alu 0xfffe
12190; GFX12-NEXT:    s_setpc_b64 s[30:31]
12191;
12192; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_pos:
12193; GFX940:       ; %bb.0:
12194; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12195; GFX940-NEXT:    flat_load_dword v3, v[0:1] offset:2044
12196; GFX940-NEXT:    s_mov_b64 s[0:1], 0
12197; GFX940-NEXT:  .LBB43_1: ; %atomicrmw.start
12198; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
12199; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12200; GFX940-NEXT:    v_mov_b32_e32 v5, v3
12201; GFX940-NEXT:    v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1]
12202; GFX940-NEXT:    buffer_wbl2 sc1
12203; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0
12204; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12205; GFX940-NEXT:    buffer_inv sc1
12206; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
12207; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
12208; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
12209; GFX940-NEXT:    s_cbranch_execnz .LBB43_1
12210; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
12211; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
12212; GFX940-NEXT:    v_mov_b32_e32 v0, v3
12213; GFX940-NEXT:    s_setpc_b64 s[30:31]
12214;
12215; GFX11-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_pos:
12216; GFX11:       ; %bb.0:
12217; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12218; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
12219; GFX11-NEXT:    s_mov_b32 s0, 0
12220; GFX11-NEXT:  .LBB43_1: ; %atomicrmw.start
12221; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
12222; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12223; GFX11-NEXT:    v_mov_b32_e32 v4, v3
12224; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
12225; GFX11-NEXT:    v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
12226; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
12227; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
12228; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12229; GFX11-NEXT:    buffer_gl1_inv
12230; GFX11-NEXT:    buffer_gl0_inv
12231; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
12232; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
12233; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
12234; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
12235; GFX11-NEXT:    s_cbranch_execnz .LBB43_1
12236; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
12237; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
12238; GFX11-NEXT:    v_mov_b32_e32 v0, v3
12239; GFX11-NEXT:    s_setpc_b64 s[30:31]
12240;
12241; GFX10-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_pos:
12242; GFX10:       ; %bb.0:
12243; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12244; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fc, v0
12245; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
12246; GFX10-NEXT:    s_mov_b32 s4, 0
12247; GFX10-NEXT:    flat_load_dword v0, v[3:4]
12248; GFX10-NEXT:  .LBB43_1: ; %atomicrmw.start
12249; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
12250; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12251; GFX10-NEXT:    v_mov_b32_e32 v1, v0
12252; GFX10-NEXT:    v_pk_add_f16 v0, v1, v2 neg_lo:[0,1] neg_hi:[0,1]
12253; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
12254; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
12255; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12256; GFX10-NEXT:    buffer_gl1_inv
12257; GFX10-NEXT:    buffer_gl0_inv
12258; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
12259; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
12260; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
12261; GFX10-NEXT:    s_cbranch_execnz .LBB43_1
12262; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
12263; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
12264; GFX10-NEXT:    s_setpc_b64 s[30:31]
12265;
12266; GFX90A-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_pos:
12267; GFX90A:       ; %bb.0:
12268; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12269; GFX90A-NEXT:    flat_load_dword v3, v[0:1] offset:2044
12270; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
12271; GFX90A-NEXT:  .LBB43_1: ; %atomicrmw.start
12272; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
12273; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12274; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
12275; GFX90A-NEXT:    v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1]
12276; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
12277; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12278; GFX90A-NEXT:    buffer_wbinvl1
12279; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
12280; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12281; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12282; GFX90A-NEXT:    s_cbranch_execnz .LBB43_1
12283; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
12284; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
12285; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
12286; GFX90A-NEXT:    s_setpc_b64 s[30:31]
12287;
12288; GFX908-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_pos:
12289; GFX908:       ; %bb.0:
12290; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12291; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2044
12292; GFX908-NEXT:    s_mov_b64 s[4:5], 0
12293; GFX908-NEXT:  .LBB43_1: ; %atomicrmw.start
12294; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
12295; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12296; GFX908-NEXT:    v_mov_b32_e32 v4, v3
12297; GFX908-NEXT:    v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
12298; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
12299; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12300; GFX908-NEXT:    buffer_wbinvl1
12301; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
12302; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12303; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12304; GFX908-NEXT:    s_cbranch_execnz .LBB43_1
12305; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
12306; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
12307; GFX908-NEXT:    v_mov_b32_e32 v0, v3
12308; GFX908-NEXT:    s_setpc_b64 s[30:31]
12309;
12310; GFX8-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_pos:
12311; GFX8:       ; %bb.0:
12312; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12313; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
12314; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
12315; GFX8-NEXT:    flat_load_dword v0, v[3:4]
12316; GFX8-NEXT:    s_mov_b64 s[4:5], 0
12317; GFX8-NEXT:  .LBB43_1: ; %atomicrmw.start
12318; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
12319; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12320; GFX8-NEXT:    v_mov_b32_e32 v1, v0
12321; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
12322; GFX8-NEXT:    v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
12323; GFX8-NEXT:    v_sub_f16_e32 v5, v1, v2
12324; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
12325; GFX8-NEXT:    v_or_b32_e32 v0, v5, v0
12326; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
12327; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12328; GFX8-NEXT:    buffer_wbinvl1
12329; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
12330; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12331; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12332; GFX8-NEXT:    s_cbranch_execnz .LBB43_1
12333; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
12334; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
12335; GFX8-NEXT:    s_setpc_b64 s[30:31]
12336;
12337; GFX7-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_pos:
12338; GFX7:       ; %bb.0:
12339; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12340; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fc, v0
12341; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
12342; GFX7-NEXT:    flat_load_dword v1, v[4:5]
12343; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v3
12344; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v2
12345; GFX7-NEXT:    s_mov_b64 s[4:5], 0
12346; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v0
12347; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
12348; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12349; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v1
12350; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
12351; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
12352; GFX7-NEXT:  .LBB43_1: ; %atomicrmw.start
12353; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
12354; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
12355; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
12356; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v1
12357; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v0
12358; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
12359; GFX7-NEXT:    v_sub_f32_e32 v6, v6, v2
12360; GFX7-NEXT:    v_sub_f32_e32 v7, v7, v3
12361; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
12362; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v7
12363; GFX7-NEXT:    v_or_b32_e32 v7, v0, v1
12364; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
12365; GFX7-NEXT:    v_or_b32_e32 v6, v8, v0
12366; GFX7-NEXT:    flat_atomic_cmpswap v6, v[4:5], v[6:7] glc
12367; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12368; GFX7-NEXT:    buffer_wbinvl1
12369; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
12370; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v6
12371; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
12372; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v7
12373; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12374; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12375; GFX7-NEXT:    s_cbranch_execnz .LBB43_1
12376; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
12377; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
12378; GFX7-NEXT:    s_setpc_b64 s[30:31]
; IR under test: element index 511 appears in the checks above as offset 2044 / 0x7fc.
12379  %gep = getelementptr <2 x half>, ptr %ptr, i64 511
12380  %result = atomicrmw fsub ptr %gep, <2 x half> %val syncscope("agent") seq_cst
12381  ret <2 x half> %result
12382}
12383
; Value-returning atomicrmw fsub of <2 x half> with syncscope("agent") and
; seq_cst ordering, through a flat pointer displaced by -512 elements
; (-2048 bytes).
;
; What the expectations below show, per -mcpu from the RUN lines:
;   * every target expands the operation into an initial flat load followed by
;     a flat_atomic_cmpswap retry loop (%atomicrmw.start / %atomicrmw.end);
;   * only gfx1200 encodes the negative displacement in the instruction's
;     offset field; all other targets first add 0xfffff800 (with -1 in the
;     high half) to the 64-bit address;
;   * gfx1200, gfx940, gfx1100, gfx1010, gfx90a and gfx908 subtract with
;     v_pk_add_f16 and neg_lo/neg_hi on the second operand, tonga uses
;     v_sub_f16 on each half, and hawaii converts to f32 and uses v_sub_f32.
;
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing them by hand.
12384define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2 x half> %val) #0 {
12385; GFX12-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_neg:
12386; GFX12:       ; %bb.0:
12387; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
12388; GFX12-NEXT:    s_wait_expcnt 0x0
12389; GFX12-NEXT:    s_wait_samplecnt 0x0
12390; GFX12-NEXT:    s_wait_bvhcnt 0x0
12391; GFX12-NEXT:    s_wait_kmcnt 0x0
12392; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:-2048
12393; GFX12-NEXT:    s_mov_b32 s0, 0
12394; GFX12-NEXT:  .LBB44_1: ; %atomicrmw.start
12395; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
12396; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
12397; GFX12-NEXT:    v_mov_b32_e32 v4, v3
12398; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
12399; GFX12-NEXT:    v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
12400; GFX12-NEXT:    s_wait_storecnt 0x0
12401; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
12402; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
12403; GFX12-NEXT:    global_inv scope:SCOPE_DEV
12404; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
12405; GFX12-NEXT:    s_wait_alu 0xfffe
12406; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
12407; GFX12-NEXT:    s_wait_alu 0xfffe
12408; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
12409; GFX12-NEXT:    s_cbranch_execnz .LBB44_1
12410; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
12411; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
12412; GFX12-NEXT:    v_mov_b32_e32 v0, v3
12413; GFX12-NEXT:    s_wait_alu 0xfffe
12414; GFX12-NEXT:    s_setpc_b64 s[30:31]
12415;
12416; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_neg:
12417; GFX940:       ; %bb.0:
12418; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12419; GFX940-NEXT:    v_mov_b32_e32 v4, v0
12420; GFX940-NEXT:    v_mov_b32_e32 v5, v1
12421; GFX940-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v4
12422; GFX940-NEXT:    s_movk_i32 s0, 0xf800
12423; GFX940-NEXT:    s_nop 0
12424; GFX940-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v5, vcc
12425; GFX940-NEXT:    flat_load_dword v0, v[0:1]
12426; GFX940-NEXT:    s_mov_b32 s1, -1
12427; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1]
12428; GFX940-NEXT:    s_mov_b64 s[0:1], 0
12429; GFX940-NEXT:  .LBB44_1: ; %atomicrmw.start
12430; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
12431; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12432; GFX940-NEXT:    v_mov_b32_e32 v1, v0
12433; GFX940-NEXT:    v_pk_add_f16 v0, v1, v2 neg_lo:[0,1] neg_hi:[0,1]
12434; GFX940-NEXT:    buffer_wbl2 sc1
12435; GFX940-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0
12436; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12437; GFX940-NEXT:    buffer_inv sc1
12438; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
12439; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
12440; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
12441; GFX940-NEXT:    s_cbranch_execnz .LBB44_1
12442; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
12443; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
12444; GFX940-NEXT:    s_setpc_b64 s[30:31]
12445;
12446; GFX11-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_neg:
12447; GFX11:       ; %bb.0:
12448; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12449; GFX11-NEXT:    v_mov_b32_e32 v3, v0
12450; GFX11-NEXT:    s_mov_b32 s0, 0
12451; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
12452; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
12453; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
12454; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
12455; GFX11-NEXT:    flat_load_b32 v0, v[4:5]
12456; GFX11-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
12457; GFX11-NEXT:  .LBB44_1: ; %atomicrmw.start
12458; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
12459; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12460; GFX11-NEXT:    v_mov_b32_e32 v1, v0
12461; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
12462; GFX11-NEXT:    v_pk_add_f16 v0, v1, v2 neg_lo:[0,1] neg_hi:[0,1]
12463; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
12464; GFX11-NEXT:    flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc
12465; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12466; GFX11-NEXT:    buffer_gl1_inv
12467; GFX11-NEXT:    buffer_gl0_inv
12468; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
12469; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
12470; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
12471; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
12472; GFX11-NEXT:    s_cbranch_execnz .LBB44_1
12473; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
12474; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
12475; GFX11-NEXT:    s_setpc_b64 s[30:31]
12476;
12477; GFX10-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_neg:
12478; GFX10:       ; %bb.0:
12479; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12480; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
12481; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
12482; GFX10-NEXT:    s_mov_b32 s4, 0
12483; GFX10-NEXT:    flat_load_dword v0, v[3:4]
12484; GFX10-NEXT:  .LBB44_1: ; %atomicrmw.start
12485; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
12486; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12487; GFX10-NEXT:    v_mov_b32_e32 v1, v0
12488; GFX10-NEXT:    v_pk_add_f16 v0, v1, v2 neg_lo:[0,1] neg_hi:[0,1]
12489; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
12490; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
12491; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12492; GFX10-NEXT:    buffer_gl1_inv
12493; GFX10-NEXT:    buffer_gl0_inv
12494; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
12495; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
12496; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
12497; GFX10-NEXT:    s_cbranch_execnz .LBB44_1
12498; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
12499; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
12500; GFX10-NEXT:    s_setpc_b64 s[30:31]
12501;
12502; GFX90A-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_neg:
12503; GFX90A:       ; %bb.0:
12504; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12505; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
12506; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
12507; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
12508; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
12509; GFX90A-NEXT:    flat_load_dword v0, v[0:1]
12510; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
12511; GFX90A-NEXT:  .LBB44_1: ; %atomicrmw.start
12512; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
12513; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12514; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
12515; GFX90A-NEXT:    v_pk_add_f16 v0, v1, v2 neg_lo:[0,1] neg_hi:[0,1]
12516; GFX90A-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
12517; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12518; GFX90A-NEXT:    buffer_wbinvl1
12519; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
12520; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12521; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12522; GFX90A-NEXT:    s_cbranch_execnz .LBB44_1
12523; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
12524; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
12525; GFX90A-NEXT:    s_setpc_b64 s[30:31]
12526;
12527; GFX908-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_neg:
12528; GFX908:       ; %bb.0:
12529; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12530; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
12531; GFX908-NEXT:    v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
12532; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
12533; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
12534; GFX908-NEXT:    flat_load_dword v0, v[0:1]
12535; GFX908-NEXT:    s_mov_b64 s[4:5], 0
12536; GFX908-NEXT:  .LBB44_1: ; %atomicrmw.start
12537; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
12538; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12539; GFX908-NEXT:    v_mov_b32_e32 v1, v0
12540; GFX908-NEXT:    v_pk_add_f16 v0, v1, v2 neg_lo:[0,1] neg_hi:[0,1]
12541; GFX908-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
12542; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12543; GFX908-NEXT:    buffer_wbinvl1
12544; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
12545; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12546; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12547; GFX908-NEXT:    s_cbranch_execnz .LBB44_1
12548; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
12549; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
12550; GFX908-NEXT:    s_setpc_b64 s[30:31]
12551;
12552; GFX8-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_neg:
12553; GFX8:       ; %bb.0:
12554; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12555; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
12556; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, -1, v1, vcc
12557; GFX8-NEXT:    flat_load_dword v0, v[3:4]
12558; GFX8-NEXT:    s_mov_b64 s[4:5], 0
12559; GFX8-NEXT:  .LBB44_1: ; %atomicrmw.start
12560; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
12561; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12562; GFX8-NEXT:    v_mov_b32_e32 v1, v0
12563; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
12564; GFX8-NEXT:    v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
12565; GFX8-NEXT:    v_sub_f16_e32 v5, v1, v2
12566; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
12567; GFX8-NEXT:    v_or_b32_e32 v0, v5, v0
12568; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
12569; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12570; GFX8-NEXT:    buffer_wbinvl1
12571; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
12572; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12573; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12574; GFX8-NEXT:    s_cbranch_execnz .LBB44_1
12575; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
12576; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
12577; GFX8-NEXT:    s_setpc_b64 s[30:31]
12578;
12579; GFX7-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_neg:
12580; GFX7:       ; %bb.0:
12581; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12582; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
12583; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, -1, v1, vcc
12584; GFX7-NEXT:    flat_load_dword v1, v[4:5]
12585; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v3
12586; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v2
12587; GFX7-NEXT:    s_mov_b64 s[4:5], 0
12588; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v0
12589; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
12590; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12591; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v1
12592; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
12593; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
12594; GFX7-NEXT:  .LBB44_1: ; %atomicrmw.start
12595; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
12596; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
12597; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
12598; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v1
12599; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v0
12600; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
12601; GFX7-NEXT:    v_sub_f32_e32 v6, v6, v2
12602; GFX7-NEXT:    v_sub_f32_e32 v7, v7, v3
12603; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
12604; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v7
12605; GFX7-NEXT:    v_or_b32_e32 v7, v0, v1
12606; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
12607; GFX7-NEXT:    v_or_b32_e32 v6, v8, v0
12608; GFX7-NEXT:    flat_atomic_cmpswap v6, v[4:5], v[6:7] glc
12609; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12610; GFX7-NEXT:    buffer_wbinvl1
12611; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
12612; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v6
12613; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
12614; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v7
12615; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12616; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12617; GFX7-NEXT:    s_cbranch_execnz .LBB44_1
12618; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
12619; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
12620; GFX7-NEXT:    s_setpc_b64 s[30:31]
; -512 elements of <2 x half> (4 bytes each) is the -2048 byte displacement
; seen in the expectations above.
12621  %gep = getelementptr <2 x half>, ptr %ptr, i64 -512
12622  %result = atomicrmw fsub ptr %gep, <2 x half> %val syncscope("agent") seq_cst
12623  ret <2 x half> %result
12624}
12625
; atomicrmw fsub of <2 x half> with syncscope("agent") and seq_cst ordering,
; applied directly to %ptr (no displacement). The old value is not used
; (%unused, ret void).
;
; What the expectations below show, per -mcpu from the RUN lines:
;   * every target expands the operation into an initial flat load followed by
;     a flat_atomic_cmpswap retry loop; the compare-and-swap still returns the
;     value it read, because the loop compares it against the expected value
;     and feeds it into the next iteration;
;   * gfx1200, gfx940, gfx1100, gfx1010, gfx90a and gfx908 subtract with
;     v_pk_add_f16 and neg_lo/neg_hi on the second operand, tonga uses
;     v_sub_f16 on each half, and hawaii converts to f32 and uses v_sub_f32.
;
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing them by hand.
12626define void @flat_agent_atomic_fsub_noret_v2f16(ptr %ptr, <2 x half> %val) #0 {
12627; GFX12-LABEL: flat_agent_atomic_fsub_noret_v2f16:
12628; GFX12:       ; %bb.0:
12629; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
12630; GFX12-NEXT:    s_wait_expcnt 0x0
12631; GFX12-NEXT:    s_wait_samplecnt 0x0
12632; GFX12-NEXT:    s_wait_bvhcnt 0x0
12633; GFX12-NEXT:    s_wait_kmcnt 0x0
12634; GFX12-NEXT:    flat_load_b32 v4, v[0:1]
12635; GFX12-NEXT:    s_mov_b32 s0, 0
12636; GFX12-NEXT:  .LBB45_1: ; %atomicrmw.start
12637; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
12638; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
12639; GFX12-NEXT:    v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
12640; GFX12-NEXT:    s_wait_storecnt 0x0
12641; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
12642; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
12643; GFX12-NEXT:    global_inv scope:SCOPE_DEV
12644; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
12645; GFX12-NEXT:    v_mov_b32_e32 v4, v3
12646; GFX12-NEXT:    s_wait_alu 0xfffe
12647; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
12648; GFX12-NEXT:    s_wait_alu 0xfffe
12649; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
12650; GFX12-NEXT:    s_cbranch_execnz .LBB45_1
12651; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
12652; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
12653; GFX12-NEXT:    s_wait_alu 0xfffe
12654; GFX12-NEXT:    s_setpc_b64 s[30:31]
12655;
12656; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2f16:
12657; GFX940:       ; %bb.0:
12658; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12659; GFX940-NEXT:    flat_load_dword v5, v[0:1]
12660; GFX940-NEXT:    s_mov_b64 s[0:1], 0
12661; GFX940-NEXT:  .LBB45_1: ; %atomicrmw.start
12662; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
12663; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12664; GFX940-NEXT:    v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1]
12665; GFX940-NEXT:    buffer_wbl2 sc1
12666; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
12667; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12668; GFX940-NEXT:    buffer_inv sc1
12669; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
12670; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
12671; GFX940-NEXT:    v_mov_b32_e32 v5, v3
12672; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
12673; GFX940-NEXT:    s_cbranch_execnz .LBB45_1
12674; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
12675; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
12676; GFX940-NEXT:    s_setpc_b64 s[30:31]
12677;
12678; GFX11-LABEL: flat_agent_atomic_fsub_noret_v2f16:
12679; GFX11:       ; %bb.0:
12680; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12681; GFX11-NEXT:    flat_load_b32 v4, v[0:1]
12682; GFX11-NEXT:    s_mov_b32 s0, 0
12683; GFX11-NEXT:  .LBB45_1: ; %atomicrmw.start
12684; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
12685; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12686; GFX11-NEXT:    v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
12687; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
12688; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
12689; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12690; GFX11-NEXT:    buffer_gl1_inv
12691; GFX11-NEXT:    buffer_gl0_inv
12692; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
12693; GFX11-NEXT:    v_mov_b32_e32 v4, v3
12694; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
12695; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
12696; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
12697; GFX11-NEXT:    s_cbranch_execnz .LBB45_1
12698; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
12699; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
12700; GFX11-NEXT:    s_setpc_b64 s[30:31]
12701;
12702; GFX10-LABEL: flat_agent_atomic_fsub_noret_v2f16:
12703; GFX10:       ; %bb.0:
12704; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12705; GFX10-NEXT:    flat_load_dword v4, v[0:1]
12706; GFX10-NEXT:    s_mov_b32 s4, 0
12707; GFX10-NEXT:  .LBB45_1: ; %atomicrmw.start
12708; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
12709; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12710; GFX10-NEXT:    v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
12711; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
12712; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
12713; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12714; GFX10-NEXT:    buffer_gl1_inv
12715; GFX10-NEXT:    buffer_gl0_inv
12716; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
12717; GFX10-NEXT:    v_mov_b32_e32 v4, v3
12718; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
12719; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
12720; GFX10-NEXT:    s_cbranch_execnz .LBB45_1
12721; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
12722; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
12723; GFX10-NEXT:    s_setpc_b64 s[30:31]
12724;
12725; GFX90A-LABEL: flat_agent_atomic_fsub_noret_v2f16:
12726; GFX90A:       ; %bb.0:
12727; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12728; GFX90A-NEXT:    flat_load_dword v5, v[0:1]
12729; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
12730; GFX90A-NEXT:  .LBB45_1: ; %atomicrmw.start
12731; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
12732; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12733; GFX90A-NEXT:    v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1]
12734; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
12735; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12736; GFX90A-NEXT:    buffer_wbinvl1
12737; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
12738; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12739; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
12740; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12741; GFX90A-NEXT:    s_cbranch_execnz .LBB45_1
12742; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
12743; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
12744; GFX90A-NEXT:    s_setpc_b64 s[30:31]
12745;
12746; GFX908-LABEL: flat_agent_atomic_fsub_noret_v2f16:
12747; GFX908:       ; %bb.0:
12748; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12749; GFX908-NEXT:    flat_load_dword v4, v[0:1]
12750; GFX908-NEXT:    s_mov_b64 s[4:5], 0
12751; GFX908-NEXT:  .LBB45_1: ; %atomicrmw.start
12752; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
12753; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12754; GFX908-NEXT:    v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
12755; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
12756; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12757; GFX908-NEXT:    buffer_wbinvl1
12758; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
12759; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12760; GFX908-NEXT:    v_mov_b32_e32 v4, v3
12761; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12762; GFX908-NEXT:    s_cbranch_execnz .LBB45_1
12763; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
12764; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
12765; GFX908-NEXT:    s_setpc_b64 s[30:31]
12766;
12767; GFX8-LABEL: flat_agent_atomic_fsub_noret_v2f16:
12768; GFX8:       ; %bb.0:
12769; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12770; GFX8-NEXT:    flat_load_dword v4, v[0:1]
12771; GFX8-NEXT:    s_mov_b64 s[4:5], 0
12772; GFX8-NEXT:  .LBB45_1: ; %atomicrmw.start
12773; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
12774; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12775; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
12776; GFX8-NEXT:    v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
12777; GFX8-NEXT:    v_sub_f16_e32 v5, v4, v2
12778; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
12779; GFX8-NEXT:    v_or_b32_e32 v3, v5, v3
12780; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
12781; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12782; GFX8-NEXT:    buffer_wbinvl1
12783; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
12784; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12785; GFX8-NEXT:    v_mov_b32_e32 v4, v3
12786; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12787; GFX8-NEXT:    s_cbranch_execnz .LBB45_1
12788; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
12789; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
12790; GFX8-NEXT:    s_setpc_b64 s[30:31]
12791;
12792; GFX7-LABEL: flat_agent_atomic_fsub_noret_v2f16:
12793; GFX7:       ; %bb.0:
12794; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12795; GFX7-NEXT:    flat_load_dword v5, v[0:1]
12796; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
12797; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v2
12798; GFX7-NEXT:    s_mov_b64 s[4:5], 0
12799; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v3
12800; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12801; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
12802; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v5
12803; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v3
12804; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v6
12805; GFX7-NEXT:  .LBB45_1: ; %atomicrmw.start
12806; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
12807; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
12808; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
12809; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v5
12810; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v4
12811; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
12812; GFX7-NEXT:    v_sub_f32_e32 v6, v6, v2
12813; GFX7-NEXT:    v_sub_f32_e32 v7, v7, v3
12814; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v6
12815; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
12816; GFX7-NEXT:    v_or_b32_e32 v6, v4, v5
12817; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
12818; GFX7-NEXT:    v_or_b32_e32 v5, v7, v4
12819; GFX7-NEXT:    flat_atomic_cmpswap v7, v[0:1], v[5:6] glc
12820; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12821; GFX7-NEXT:    buffer_wbinvl1
12822; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
12823; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v7
12824; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
12825; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
12826; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12827; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12828; GFX7-NEXT:    s_cbranch_execnz .LBB45_1
12829; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
12830; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
12831; GFX7-NEXT:    s_setpc_b64 s[30:31]
; The atomicrmw result is bound to %unused and never read.
12832  %unused = atomicrmw fsub ptr %ptr, <2 x half> %val syncscope("agent") seq_cst
12833  ret void
12834}
12835
; atomicrmw fsub of <2 x half> with syncscope("agent") and seq_cst ordering,
; through a flat pointer displaced by +511 elements (+2044 bytes, 0x7fc).
; The old value is not used (%unused, ret void).
;
; What the expectations below show, per -mcpu from the RUN lines:
;   * every target expands the operation into an initial flat load followed by
;     a flat_atomic_cmpswap retry loop;
;   * gfx1200, gfx940, gfx1100, gfx90a and gfx908 encode the displacement as
;     offset:2044 on both the load and the compare-and-swap, while gfx1010,
;     tonga and hawaii add 0x7fc to the 64-bit address first;
;   * gfx1200, gfx940, gfx1100, gfx1010, gfx90a and gfx908 subtract with
;     v_pk_add_f16 and neg_lo/neg_hi on the second operand, tonga uses
;     v_sub_f16 on each half, and hawaii converts to f32 and uses v_sub_f32.
;
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing them by hand.
12836define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x half> %val) #0 {
12837; GFX12-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_pos:
12838; GFX12:       ; %bb.0:
12839; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
12840; GFX12-NEXT:    s_wait_expcnt 0x0
12841; GFX12-NEXT:    s_wait_samplecnt 0x0
12842; GFX12-NEXT:    s_wait_bvhcnt 0x0
12843; GFX12-NEXT:    s_wait_kmcnt 0x0
12844; GFX12-NEXT:    flat_load_b32 v4, v[0:1] offset:2044
12845; GFX12-NEXT:    s_mov_b32 s0, 0
12846; GFX12-NEXT:  .LBB46_1: ; %atomicrmw.start
12847; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
12848; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
12849; GFX12-NEXT:    v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
12850; GFX12-NEXT:    s_wait_storecnt 0x0
12851; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
12852; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
12853; GFX12-NEXT:    global_inv scope:SCOPE_DEV
12854; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
12855; GFX12-NEXT:    v_mov_b32_e32 v4, v3
12856; GFX12-NEXT:    s_wait_alu 0xfffe
12857; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
12858; GFX12-NEXT:    s_wait_alu 0xfffe
12859; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
12860; GFX12-NEXT:    s_cbranch_execnz .LBB46_1
12861; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
12862; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
12863; GFX12-NEXT:    s_wait_alu 0xfffe
12864; GFX12-NEXT:    s_setpc_b64 s[30:31]
12865;
12866; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_pos:
12867; GFX940:       ; %bb.0:
12868; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12869; GFX940-NEXT:    flat_load_dword v5, v[0:1] offset:2044
12870; GFX940-NEXT:    s_mov_b64 s[0:1], 0
12871; GFX940-NEXT:  .LBB46_1: ; %atomicrmw.start
12872; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
12873; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12874; GFX940-NEXT:    v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1]
12875; GFX940-NEXT:    buffer_wbl2 sc1
12876; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0
12877; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12878; GFX940-NEXT:    buffer_inv sc1
12879; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
12880; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
12881; GFX940-NEXT:    v_mov_b32_e32 v5, v3
12882; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
12883; GFX940-NEXT:    s_cbranch_execnz .LBB46_1
12884; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
12885; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
12886; GFX940-NEXT:    s_setpc_b64 s[30:31]
12887;
12888; GFX11-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_pos:
12889; GFX11:       ; %bb.0:
12890; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12891; GFX11-NEXT:    flat_load_b32 v4, v[0:1] offset:2044
12892; GFX11-NEXT:    s_mov_b32 s0, 0
12893; GFX11-NEXT:  .LBB46_1: ; %atomicrmw.start
12894; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
12895; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12896; GFX11-NEXT:    v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
12897; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
12898; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
12899; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12900; GFX11-NEXT:    buffer_gl1_inv
12901; GFX11-NEXT:    buffer_gl0_inv
12902; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
12903; GFX11-NEXT:    v_mov_b32_e32 v4, v3
12904; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
12905; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
12906; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
12907; GFX11-NEXT:    s_cbranch_execnz .LBB46_1
12908; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
12909; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
12910; GFX11-NEXT:    s_setpc_b64 s[30:31]
12911;
12912; GFX10-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_pos:
12913; GFX10:       ; %bb.0:
12914; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12915; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
12916; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
12917; GFX10-NEXT:    s_mov_b32 s4, 0
12918; GFX10-NEXT:    flat_load_dword v4, v[0:1]
12919; GFX10-NEXT:  .LBB46_1: ; %atomicrmw.start
12920; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
12921; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12922; GFX10-NEXT:    v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
12923; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
12924; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
12925; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12926; GFX10-NEXT:    buffer_gl1_inv
12927; GFX10-NEXT:    buffer_gl0_inv
12928; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
12929; GFX10-NEXT:    v_mov_b32_e32 v4, v3
12930; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
12931; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
12932; GFX10-NEXT:    s_cbranch_execnz .LBB46_1
12933; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
12934; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
12935; GFX10-NEXT:    s_setpc_b64 s[30:31]
12936;
12937; GFX90A-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_pos:
12938; GFX90A:       ; %bb.0:
12939; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12940; GFX90A-NEXT:    flat_load_dword v5, v[0:1] offset:2044
12941; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
12942; GFX90A-NEXT:  .LBB46_1: ; %atomicrmw.start
12943; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
12944; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12945; GFX90A-NEXT:    v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1]
12946; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
12947; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12948; GFX90A-NEXT:    buffer_wbinvl1
12949; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
12950; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12951; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
12952; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12953; GFX90A-NEXT:    s_cbranch_execnz .LBB46_1
12954; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
12955; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
12956; GFX90A-NEXT:    s_setpc_b64 s[30:31]
12957;
12958; GFX908-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_pos:
12959; GFX908:       ; %bb.0:
12960; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12961; GFX908-NEXT:    flat_load_dword v4, v[0:1] offset:2044
12962; GFX908-NEXT:    s_mov_b64 s[4:5], 0
12963; GFX908-NEXT:  .LBB46_1: ; %atomicrmw.start
12964; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
12965; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12966; GFX908-NEXT:    v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
12967; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
12968; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12969; GFX908-NEXT:    buffer_wbinvl1
12970; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
12971; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12972; GFX908-NEXT:    v_mov_b32_e32 v4, v3
12973; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12974; GFX908-NEXT:    s_cbranch_execnz .LBB46_1
12975; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
12976; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
12977; GFX908-NEXT:    s_setpc_b64 s[30:31]
12978;
12979; GFX8-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_pos:
12980; GFX8:       ; %bb.0:
12981; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12982; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
12983; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
12984; GFX8-NEXT:    flat_load_dword v4, v[0:1]
12985; GFX8-NEXT:    s_mov_b64 s[4:5], 0
12986; GFX8-NEXT:  .LBB46_1: ; %atomicrmw.start
12987; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
12988; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12989; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
12990; GFX8-NEXT:    v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
12991; GFX8-NEXT:    v_sub_f16_e32 v5, v4, v2
12992; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
12993; GFX8-NEXT:    v_or_b32_e32 v3, v5, v3
12994; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
12995; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12996; GFX8-NEXT:    buffer_wbinvl1
12997; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
12998; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12999; GFX8-NEXT:    v_mov_b32_e32 v4, v3
13000; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
13001; GFX8-NEXT:    s_cbranch_execnz .LBB46_1
13002; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
13003; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
13004; GFX8-NEXT:    s_setpc_b64 s[30:31]
13005;
13006; GFX7-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_pos:
13007; GFX7:       ; %bb.0:
13008; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13009; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
13010; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
13011; GFX7-NEXT:    flat_load_dword v5, v[0:1]
13012; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
13013; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v2
13014; GFX7-NEXT:    s_mov_b64 s[4:5], 0
13015; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v3
13016; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13017; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
13018; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v5
13019; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v3
13020; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v6
13021; GFX7-NEXT:  .LBB46_1: ; %atomicrmw.start
13022; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
13023; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
13024; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
13025; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v5
13026; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v4
13027; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
13028; GFX7-NEXT:    v_sub_f32_e32 v6, v6, v2
13029; GFX7-NEXT:    v_sub_f32_e32 v7, v7, v3
13030; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v6
13031; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
13032; GFX7-NEXT:    v_or_b32_e32 v6, v4, v5
13033; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
13034; GFX7-NEXT:    v_or_b32_e32 v5, v7, v4
13035; GFX7-NEXT:    flat_atomic_cmpswap v7, v[0:1], v[5:6] glc
13036; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13037; GFX7-NEXT:    buffer_wbinvl1
13038; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
13039; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v7
13040; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
13041; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
13042; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
13043; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
13044; GFX7-NEXT:    s_cbranch_execnz .LBB46_1
13045; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
13046; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
13047; GFX7-NEXT:    s_setpc_b64 s[30:31]
; 511 elements of <2 x half> (4 bytes each) is the 2044 byte (0x7fc)
; displacement seen in the expectations above.
13048  %gep = getelementptr <2 x half>, ptr %ptr, i64 511
13049  %unused = atomicrmw fsub ptr %gep, <2 x half> %val syncscope("agent") seq_cst
13050  ret void
13051}
13052
; Flat-pointer `atomicrmw fsub` of a <2 x half> value at agent scope, seq_cst,
; with the result unused. The address is %ptr displaced by -512 elements, which
; the checked output shows as a -2048 byte displacement.
; Every checked target expands the operation into a flat_atomic_cmpswap retry
; loop (%atomicrmw.start). The GFX12 output encodes the displacement on the
; memory instructions (offset:-2048); the other targets add 0xfffff800 / -1 to
; the 64-bit address before the loop. The subtraction itself is v_pk_add_f16
; with neg_lo/neg_hi on the second operand where checked, per-half v_sub_f16 on
; GFX8, and f32 conversion plus v_sub_f32 on GFX7.
; The assertions below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing by hand.
13053define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x half> %val) #0 {
13054; GFX12-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_neg:
13055; GFX12:       ; %bb.0:
13056; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
13057; GFX12-NEXT:    s_wait_expcnt 0x0
13058; GFX12-NEXT:    s_wait_samplecnt 0x0
13059; GFX12-NEXT:    s_wait_bvhcnt 0x0
13060; GFX12-NEXT:    s_wait_kmcnt 0x0
13061; GFX12-NEXT:    flat_load_b32 v4, v[0:1] offset:-2048
13062; GFX12-NEXT:    s_mov_b32 s0, 0
13063; GFX12-NEXT:  .LBB47_1: ; %atomicrmw.start
13064; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
13065; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
13066; GFX12-NEXT:    v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
13067; GFX12-NEXT:    s_wait_storecnt 0x0
13068; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
13069; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
13070; GFX12-NEXT:    global_inv scope:SCOPE_DEV
13071; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
13072; GFX12-NEXT:    v_mov_b32_e32 v4, v3
13073; GFX12-NEXT:    s_wait_alu 0xfffe
13074; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
13075; GFX12-NEXT:    s_wait_alu 0xfffe
13076; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
13077; GFX12-NEXT:    s_cbranch_execnz .LBB47_1
13078; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
13079; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
13080; GFX12-NEXT:    s_wait_alu 0xfffe
13081; GFX12-NEXT:    s_setpc_b64 s[30:31]
13082;
13083; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_neg:
13084; GFX940:       ; %bb.0:
13085; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13086; GFX940-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
13087; GFX940-NEXT:    s_movk_i32 s0, 0xf800
13088; GFX940-NEXT:    s_nop 0
13089; GFX940-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
13090; GFX940-NEXT:    flat_load_dword v5, v[4:5]
13091; GFX940-NEXT:    s_mov_b32 s1, -1
13092; GFX940-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
13093; GFX940-NEXT:    s_mov_b64 s[0:1], 0
13094; GFX940-NEXT:  .LBB47_1: ; %atomicrmw.start
13095; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
13096; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13097; GFX940-NEXT:    v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1]
13098; GFX940-NEXT:    buffer_wbl2 sc1
13099; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
13100; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13101; GFX940-NEXT:    buffer_inv sc1
13102; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
13103; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
13104; GFX940-NEXT:    v_mov_b32_e32 v5, v3
13105; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
13106; GFX940-NEXT:    s_cbranch_execnz .LBB47_1
13107; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
13108; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
13109; GFX940-NEXT:    s_setpc_b64 s[30:31]
13110;
13111; GFX11-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_neg:
13112; GFX11:       ; %bb.0:
13113; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13114; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
13115; GFX11-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
13116; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
13117; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
13118; GFX11-NEXT:    flat_load_b32 v4, v[3:4]
13119; GFX11-NEXT:    s_mov_b32 s0, 0
13120; GFX11-NEXT:  .LBB47_1: ; %atomicrmw.start
13121; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
13122; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13123; GFX11-NEXT:    v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
13124; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
13125; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
13126; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13127; GFX11-NEXT:    buffer_gl1_inv
13128; GFX11-NEXT:    buffer_gl0_inv
13129; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
13130; GFX11-NEXT:    v_mov_b32_e32 v4, v3
13131; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
13132; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
13133; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
13134; GFX11-NEXT:    s_cbranch_execnz .LBB47_1
13135; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
13136; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
13137; GFX11-NEXT:    s_setpc_b64 s[30:31]
13138;
13139; GFX10-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_neg:
13140; GFX10:       ; %bb.0:
13141; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13142; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
13143; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
13144; GFX10-NEXT:    s_mov_b32 s4, 0
13145; GFX10-NEXT:    flat_load_dword v4, v[0:1]
13146; GFX10-NEXT:  .LBB47_1: ; %atomicrmw.start
13147; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
13148; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13149; GFX10-NEXT:    v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
13150; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
13151; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
13152; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13153; GFX10-NEXT:    buffer_gl1_inv
13154; GFX10-NEXT:    buffer_gl0_inv
13155; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
13156; GFX10-NEXT:    v_mov_b32_e32 v4, v3
13157; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
13158; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
13159; GFX10-NEXT:    s_cbranch_execnz .LBB47_1
13160; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
13161; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
13162; GFX10-NEXT:    s_setpc_b64 s[30:31]
13163;
13164; GFX90A-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_neg:
13165; GFX90A:       ; %bb.0:
13166; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13167; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
13168; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
13169; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
13170; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
13171; GFX90A-NEXT:    flat_load_dword v1, v[0:1]
13172; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
13173; GFX90A-NEXT:  .LBB47_1: ; %atomicrmw.start
13174; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
13175; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13176; GFX90A-NEXT:    v_pk_add_f16 v0, v1, v2 neg_lo:[0,1] neg_hi:[0,1]
13177; GFX90A-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
13178; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13179; GFX90A-NEXT:    buffer_wbinvl1
13180; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
13181; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
13182; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
13183; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
13184; GFX90A-NEXT:    s_cbranch_execnz .LBB47_1
13185; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
13186; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
13187; GFX90A-NEXT:    s_setpc_b64 s[30:31]
13188;
13189; GFX908-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_neg:
13190; GFX908:       ; %bb.0:
13191; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13192; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
13193; GFX908-NEXT:    v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
13194; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
13195; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
13196; GFX908-NEXT:    flat_load_dword v1, v[0:1]
13197; GFX908-NEXT:    s_mov_b64 s[4:5], 0
13198; GFX908-NEXT:  .LBB47_1: ; %atomicrmw.start
13199; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
13200; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13201; GFX908-NEXT:    v_pk_add_f16 v0, v1, v2 neg_lo:[0,1] neg_hi:[0,1]
13202; GFX908-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
13203; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13204; GFX908-NEXT:    buffer_wbinvl1
13205; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
13206; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
13207; GFX908-NEXT:    v_mov_b32_e32 v1, v0
13208; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
13209; GFX908-NEXT:    s_cbranch_execnz .LBB47_1
13210; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
13211; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
13212; GFX908-NEXT:    s_setpc_b64 s[30:31]
13213;
13214; GFX8-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_neg:
13215; GFX8:       ; %bb.0:
13216; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13217; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
13218; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
13219; GFX8-NEXT:    flat_load_dword v4, v[0:1]
13220; GFX8-NEXT:    s_mov_b64 s[4:5], 0
13221; GFX8-NEXT:  .LBB47_1: ; %atomicrmw.start
13222; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
13223; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13224; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
13225; GFX8-NEXT:    v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
13226; GFX8-NEXT:    v_sub_f16_e32 v5, v4, v2
13227; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
13228; GFX8-NEXT:    v_or_b32_e32 v3, v5, v3
13229; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
13230; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13231; GFX8-NEXT:    buffer_wbinvl1
13232; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
13233; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
13234; GFX8-NEXT:    v_mov_b32_e32 v4, v3
13235; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
13236; GFX8-NEXT:    s_cbranch_execnz .LBB47_1
13237; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
13238; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
13239; GFX8-NEXT:    s_setpc_b64 s[30:31]
13240;
13241; GFX7-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_neg:
13242; GFX7:       ; %bb.0:
13243; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13244; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0xfffff800, v0
13245; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
13246; GFX7-NEXT:    flat_load_dword v5, v[0:1]
13247; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
13248; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v2
13249; GFX7-NEXT:    s_mov_b64 s[4:5], 0
13250; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v3
13251; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13252; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
13253; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v5
13254; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v3
13255; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v6
13256; GFX7-NEXT:  .LBB47_1: ; %atomicrmw.start
13257; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
13258; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
13259; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
13260; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v5
13261; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v4
13262; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
13263; GFX7-NEXT:    v_sub_f32_e32 v6, v6, v2
13264; GFX7-NEXT:    v_sub_f32_e32 v7, v7, v3
13265; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v6
13266; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
13267; GFX7-NEXT:    v_or_b32_e32 v6, v4, v5
13268; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
13269; GFX7-NEXT:    v_or_b32_e32 v5, v7, v4
13270; GFX7-NEXT:    flat_atomic_cmpswap v7, v[0:1], v[5:6] glc
13271; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13272; GFX7-NEXT:    buffer_wbinvl1
13273; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
13274; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v7
13275; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
13276; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
13277; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
13278; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
13279; GFX7-NEXT:    s_cbranch_execnz .LBB47_1
13280; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
13281; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
13282; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; Index -512 over 4-byte <2 x half> elements gives the -2048 byte displacement seen in the checks above.
13283  %gep = getelementptr <2 x half>, ptr %ptr, i64 -512
13284  %unused = atomicrmw fsub ptr %gep, <2 x half> %val syncscope("agent") seq_cst
13285  ret void
13286}
13287
; Flat-pointer `atomicrmw fsub` of a <2 x half> value, seq_cst, with no
; syncscope given (the GFX12 output uses scope:SCOPE_SYS); the atomicrmw result
; is returned. The address is %ptr displaced by 511 elements, which the checked
; output shows as a 2044 byte displacement.
; Every checked target expands the operation into a flat_atomic_cmpswap retry
; loop (%atomicrmw.start). The GFX12, GFX940, GFX11, GFX90A and GFX908 outputs
; encode the displacement as offset:2044 on the memory instructions, while the
; GFX10, GFX8 and GFX7 outputs add 0x7fc to the 64-bit address before the loop.
; The assertions below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing by hand.
13288define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 x half> %val) #0 {
13289; GFX12-LABEL: flat_system_atomic_fsub_ret_v2f16__offset12b_pos:
13290; GFX12:       ; %bb.0:
13291; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
13292; GFX12-NEXT:    s_wait_expcnt 0x0
13293; GFX12-NEXT:    s_wait_samplecnt 0x0
13294; GFX12-NEXT:    s_wait_bvhcnt 0x0
13295; GFX12-NEXT:    s_wait_kmcnt 0x0
13296; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
13297; GFX12-NEXT:    s_mov_b32 s0, 0
13298; GFX12-NEXT:  .LBB48_1: ; %atomicrmw.start
13299; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
13300; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
13301; GFX12-NEXT:    v_mov_b32_e32 v4, v3
13302; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
13303; GFX12-NEXT:    v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
13304; GFX12-NEXT:    global_wb scope:SCOPE_SYS
13305; GFX12-NEXT:    s_wait_storecnt 0x0
13306; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
13307; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
13308; GFX12-NEXT:    global_inv scope:SCOPE_SYS
13309; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
13310; GFX12-NEXT:    s_wait_alu 0xfffe
13311; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
13312; GFX12-NEXT:    s_wait_alu 0xfffe
13313; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
13314; GFX12-NEXT:    s_cbranch_execnz .LBB48_1
13315; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
13316; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
13317; GFX12-NEXT:    v_mov_b32_e32 v0, v3
13318; GFX12-NEXT:    s_wait_alu 0xfffe
13319; GFX12-NEXT:    s_setpc_b64 s[30:31]
13320;
13321; GFX940-LABEL: flat_system_atomic_fsub_ret_v2f16__offset12b_pos:
13322; GFX940:       ; %bb.0:
13323; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13324; GFX940-NEXT:    flat_load_dword v3, v[0:1] offset:2044
13325; GFX940-NEXT:    s_mov_b64 s[0:1], 0
13326; GFX940-NEXT:  .LBB48_1: ; %atomicrmw.start
13327; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
13328; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13329; GFX940-NEXT:    v_mov_b32_e32 v5, v3
13330; GFX940-NEXT:    v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1]
13331; GFX940-NEXT:    buffer_wbl2 sc0 sc1
13332; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1
13333; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13334; GFX940-NEXT:    buffer_inv sc0 sc1
13335; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
13336; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
13337; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
13338; GFX940-NEXT:    s_cbranch_execnz .LBB48_1
13339; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
13340; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
13341; GFX940-NEXT:    v_mov_b32_e32 v0, v3
13342; GFX940-NEXT:    s_setpc_b64 s[30:31]
13343;
13344; GFX11-LABEL: flat_system_atomic_fsub_ret_v2f16__offset12b_pos:
13345; GFX11:       ; %bb.0:
13346; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13347; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
13348; GFX11-NEXT:    s_mov_b32 s0, 0
13349; GFX11-NEXT:  .LBB48_1: ; %atomicrmw.start
13350; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
13351; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13352; GFX11-NEXT:    v_mov_b32_e32 v4, v3
13353; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
13354; GFX11-NEXT:    v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
13355; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
13356; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
13357; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13358; GFX11-NEXT:    buffer_gl1_inv
13359; GFX11-NEXT:    buffer_gl0_inv
13360; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
13361; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
13362; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
13363; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
13364; GFX11-NEXT:    s_cbranch_execnz .LBB48_1
13365; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
13366; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
13367; GFX11-NEXT:    v_mov_b32_e32 v0, v3
13368; GFX11-NEXT:    s_setpc_b64 s[30:31]
13369;
13370; GFX10-LABEL: flat_system_atomic_fsub_ret_v2f16__offset12b_pos:
13371; GFX10:       ; %bb.0:
13372; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13373; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fc, v0
13374; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
13375; GFX10-NEXT:    s_mov_b32 s4, 0
13376; GFX10-NEXT:    flat_load_dword v0, v[3:4]
13377; GFX10-NEXT:  .LBB48_1: ; %atomicrmw.start
13378; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
13379; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13380; GFX10-NEXT:    v_mov_b32_e32 v1, v0
13381; GFX10-NEXT:    v_pk_add_f16 v0, v1, v2 neg_lo:[0,1] neg_hi:[0,1]
13382; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
13383; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
13384; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13385; GFX10-NEXT:    buffer_gl1_inv
13386; GFX10-NEXT:    buffer_gl0_inv
13387; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
13388; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
13389; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
13390; GFX10-NEXT:    s_cbranch_execnz .LBB48_1
13391; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
13392; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
13393; GFX10-NEXT:    s_setpc_b64 s[30:31]
13394;
13395; GFX90A-LABEL: flat_system_atomic_fsub_ret_v2f16__offset12b_pos:
13396; GFX90A:       ; %bb.0:
13397; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13398; GFX90A-NEXT:    flat_load_dword v3, v[0:1] offset:2044
13399; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
13400; GFX90A-NEXT:  .LBB48_1: ; %atomicrmw.start
13401; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
13402; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13403; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
13404; GFX90A-NEXT:    v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1]
13405; GFX90A-NEXT:    buffer_wbl2
13406; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
13407; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13408; GFX90A-NEXT:    buffer_invl2
13409; GFX90A-NEXT:    buffer_wbinvl1
13410; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
13411; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
13412; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
13413; GFX90A-NEXT:    s_cbranch_execnz .LBB48_1
13414; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
13415; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
13416; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
13417; GFX90A-NEXT:    s_setpc_b64 s[30:31]
13418;
13419; GFX908-LABEL: flat_system_atomic_fsub_ret_v2f16__offset12b_pos:
13420; GFX908:       ; %bb.0:
13421; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13422; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2044
13423; GFX908-NEXT:    s_mov_b64 s[4:5], 0
13424; GFX908-NEXT:  .LBB48_1: ; %atomicrmw.start
13425; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
13426; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13427; GFX908-NEXT:    v_mov_b32_e32 v4, v3
13428; GFX908-NEXT:    v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
13429; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
13430; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13431; GFX908-NEXT:    buffer_wbinvl1
13432; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
13433; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
13434; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
13435; GFX908-NEXT:    s_cbranch_execnz .LBB48_1
13436; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
13437; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
13438; GFX908-NEXT:    v_mov_b32_e32 v0, v3
13439; GFX908-NEXT:    s_setpc_b64 s[30:31]
13440;
13441; GFX8-LABEL: flat_system_atomic_fsub_ret_v2f16__offset12b_pos:
13442; GFX8:       ; %bb.0:
13443; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13444; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
13445; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
13446; GFX8-NEXT:    flat_load_dword v0, v[3:4]
13447; GFX8-NEXT:    s_mov_b64 s[4:5], 0
13448; GFX8-NEXT:  .LBB48_1: ; %atomicrmw.start
13449; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
13450; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13451; GFX8-NEXT:    v_mov_b32_e32 v1, v0
13452; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
13453; GFX8-NEXT:    v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
13454; GFX8-NEXT:    v_sub_f16_e32 v5, v1, v2
13455; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
13456; GFX8-NEXT:    v_or_b32_e32 v0, v5, v0
13457; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
13458; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13459; GFX8-NEXT:    buffer_wbinvl1
13460; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
13461; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
13462; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
13463; GFX8-NEXT:    s_cbranch_execnz .LBB48_1
13464; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
13465; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
13466; GFX8-NEXT:    s_setpc_b64 s[30:31]
13467;
13468; GFX7-LABEL: flat_system_atomic_fsub_ret_v2f16__offset12b_pos:
13469; GFX7:       ; %bb.0:
13470; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13471; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fc, v0
13472; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
13473; GFX7-NEXT:    flat_load_dword v1, v[4:5]
13474; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v3
13475; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v2
13476; GFX7-NEXT:    s_mov_b64 s[4:5], 0
13477; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v0
13478; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
13479; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13480; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v1
13481; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
13482; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
13483; GFX7-NEXT:  .LBB48_1: ; %atomicrmw.start
13484; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
13485; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
13486; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
13487; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v1
13488; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v0
13489; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
13490; GFX7-NEXT:    v_sub_f32_e32 v6, v6, v2
13491; GFX7-NEXT:    v_sub_f32_e32 v7, v7, v3
13492; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
13493; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v7
13494; GFX7-NEXT:    v_or_b32_e32 v7, v0, v1
13495; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
13496; GFX7-NEXT:    v_or_b32_e32 v6, v8, v0
13497; GFX7-NEXT:    flat_atomic_cmpswap v6, v[4:5], v[6:7] glc
13498; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13499; GFX7-NEXT:    buffer_wbinvl1
13500; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
13501; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v6
13502; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
13503; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v7
13504; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
13505; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
13506; GFX7-NEXT:    s_cbranch_execnz .LBB48_1
13507; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
13508; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
13509; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; Index 511 over 4-byte <2 x half> elements gives the 2044 (0x7fc) byte displacement seen in the checks above.
13510  %gep = getelementptr <2 x half>, ptr %ptr, i64 511
13511  %result = atomicrmw fsub ptr %gep, <2 x half> %val seq_cst
13512  ret <2 x half> %result
13513}
13514
; Flat-pointer `atomicrmw fsub` of a <2 x half> value, seq_cst, with no
; syncscope given (the GFX12 output uses scope:SCOPE_SYS) and the result unused.
; The address is %ptr displaced by 511 elements, which the checked output shows
; as a 2044 byte displacement.
; Every checked target expands the operation into a flat_atomic_cmpswap retry
; loop (%atomicrmw.start). The GFX12, GFX940, GFX11, GFX90A and GFX908 outputs
; encode the displacement as offset:2044 on the memory instructions, while the
; GFX10, GFX8 and GFX7 outputs add 0x7fc to the 64-bit address before the loop.
; The assertions below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing by hand.
13515define void @flat_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x half> %val) #0 {
13516; GFX12-LABEL: flat_system_atomic_fsub_noret_v2f16__offset12b_pos:
13517; GFX12:       ; %bb.0:
13518; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
13519; GFX12-NEXT:    s_wait_expcnt 0x0
13520; GFX12-NEXT:    s_wait_samplecnt 0x0
13521; GFX12-NEXT:    s_wait_bvhcnt 0x0
13522; GFX12-NEXT:    s_wait_kmcnt 0x0
13523; GFX12-NEXT:    flat_load_b32 v4, v[0:1] offset:2044
13524; GFX12-NEXT:    s_mov_b32 s0, 0
13525; GFX12-NEXT:  .LBB49_1: ; %atomicrmw.start
13526; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
13527; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
13528; GFX12-NEXT:    v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
13529; GFX12-NEXT:    global_wb scope:SCOPE_SYS
13530; GFX12-NEXT:    s_wait_storecnt 0x0
13531; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
13532; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
13533; GFX12-NEXT:    global_inv scope:SCOPE_SYS
13534; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
13535; GFX12-NEXT:    v_mov_b32_e32 v4, v3
13536; GFX12-NEXT:    s_wait_alu 0xfffe
13537; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
13538; GFX12-NEXT:    s_wait_alu 0xfffe
13539; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
13540; GFX12-NEXT:    s_cbranch_execnz .LBB49_1
13541; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
13542; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
13543; GFX12-NEXT:    s_wait_alu 0xfffe
13544; GFX12-NEXT:    s_setpc_b64 s[30:31]
13545;
13546; GFX940-LABEL: flat_system_atomic_fsub_noret_v2f16__offset12b_pos:
13547; GFX940:       ; %bb.0:
13548; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13549; GFX940-NEXT:    flat_load_dword v5, v[0:1] offset:2044
13550; GFX940-NEXT:    s_mov_b64 s[0:1], 0
13551; GFX940-NEXT:  .LBB49_1: ; %atomicrmw.start
13552; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
13553; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13554; GFX940-NEXT:    v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1]
13555; GFX940-NEXT:    buffer_wbl2 sc0 sc1
13556; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1
13557; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13558; GFX940-NEXT:    buffer_inv sc0 sc1
13559; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
13560; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
13561; GFX940-NEXT:    v_mov_b32_e32 v5, v3
13562; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
13563; GFX940-NEXT:    s_cbranch_execnz .LBB49_1
13564; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
13565; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
13566; GFX940-NEXT:    s_setpc_b64 s[30:31]
13567;
13568; GFX11-LABEL: flat_system_atomic_fsub_noret_v2f16__offset12b_pos:
13569; GFX11:       ; %bb.0:
13570; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13571; GFX11-NEXT:    flat_load_b32 v4, v[0:1] offset:2044
13572; GFX11-NEXT:    s_mov_b32 s0, 0
13573; GFX11-NEXT:  .LBB49_1: ; %atomicrmw.start
13574; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
13575; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13576; GFX11-NEXT:    v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
13577; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
13578; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
13579; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13580; GFX11-NEXT:    buffer_gl1_inv
13581; GFX11-NEXT:    buffer_gl0_inv
13582; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
13583; GFX11-NEXT:    v_mov_b32_e32 v4, v3
13584; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
13585; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
13586; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
13587; GFX11-NEXT:    s_cbranch_execnz .LBB49_1
13588; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
13589; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
13590; GFX11-NEXT:    s_setpc_b64 s[30:31]
13591;
13592; GFX10-LABEL: flat_system_atomic_fsub_noret_v2f16__offset12b_pos:
13593; GFX10:       ; %bb.0:
13594; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13595; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
13596; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
13597; GFX10-NEXT:    s_mov_b32 s4, 0
13598; GFX10-NEXT:    flat_load_dword v4, v[0:1]
13599; GFX10-NEXT:  .LBB49_1: ; %atomicrmw.start
13600; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
13601; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13602; GFX10-NEXT:    v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
13603; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
13604; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
13605; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13606; GFX10-NEXT:    buffer_gl1_inv
13607; GFX10-NEXT:    buffer_gl0_inv
13608; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
13609; GFX10-NEXT:    v_mov_b32_e32 v4, v3
13610; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
13611; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
13612; GFX10-NEXT:    s_cbranch_execnz .LBB49_1
13613; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
13614; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
13615; GFX10-NEXT:    s_setpc_b64 s[30:31]
13616;
13617; GFX90A-LABEL: flat_system_atomic_fsub_noret_v2f16__offset12b_pos:
13618; GFX90A:       ; %bb.0:
13619; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13620; GFX90A-NEXT:    flat_load_dword v5, v[0:1] offset:2044
13621; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
13622; GFX90A-NEXT:  .LBB49_1: ; %atomicrmw.start
13623; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
13624; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13625; GFX90A-NEXT:    v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1]
13626; GFX90A-NEXT:    buffer_wbl2
13627; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
13628; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13629; GFX90A-NEXT:    buffer_invl2
13630; GFX90A-NEXT:    buffer_wbinvl1
13631; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
13632; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
13633; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
13634; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
13635; GFX90A-NEXT:    s_cbranch_execnz .LBB49_1
13636; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
13637; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
13638; GFX90A-NEXT:    s_setpc_b64 s[30:31]
13639;
13640; GFX908-LABEL: flat_system_atomic_fsub_noret_v2f16__offset12b_pos:
13641; GFX908:       ; %bb.0:
13642; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13643; GFX908-NEXT:    flat_load_dword v4, v[0:1] offset:2044
13644; GFX908-NEXT:    s_mov_b64 s[4:5], 0
13645; GFX908-NEXT:  .LBB49_1: ; %atomicrmw.start
13646; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
13647; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13648; GFX908-NEXT:    v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
13649; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
13650; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13651; GFX908-NEXT:    buffer_wbinvl1
13652; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
13653; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
13654; GFX908-NEXT:    v_mov_b32_e32 v4, v3
13655; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
13656; GFX908-NEXT:    s_cbranch_execnz .LBB49_1
13657; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
13658; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
13659; GFX908-NEXT:    s_setpc_b64 s[30:31]
13660;
13661; GFX8-LABEL: flat_system_atomic_fsub_noret_v2f16__offset12b_pos:
13662; GFX8:       ; %bb.0:
13663; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13664; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
13665; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
13666; GFX8-NEXT:    flat_load_dword v4, v[0:1]
13667; GFX8-NEXT:    s_mov_b64 s[4:5], 0
13668; GFX8-NEXT:  .LBB49_1: ; %atomicrmw.start
13669; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
13670; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13671; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
13672; GFX8-NEXT:    v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
13673; GFX8-NEXT:    v_sub_f16_e32 v5, v4, v2
13674; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
13675; GFX8-NEXT:    v_or_b32_e32 v3, v5, v3
13676; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
13677; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13678; GFX8-NEXT:    buffer_wbinvl1
13679; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
13680; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
13681; GFX8-NEXT:    v_mov_b32_e32 v4, v3
13682; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
13683; GFX8-NEXT:    s_cbranch_execnz .LBB49_1
13684; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
13685; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
13686; GFX8-NEXT:    s_setpc_b64 s[30:31]
13687;
13688; GFX7-LABEL: flat_system_atomic_fsub_noret_v2f16__offset12b_pos:
13689; GFX7:       ; %bb.0:
13690; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13691; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
13692; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
13693; GFX7-NEXT:    flat_load_dword v5, v[0:1]
13694; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
13695; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v2
13696; GFX7-NEXT:    s_mov_b64 s[4:5], 0
13697; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v3
13698; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13699; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
13700; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v5
13701; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v3
13702; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v6
13703; GFX7-NEXT:  .LBB49_1: ; %atomicrmw.start
13704; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
13705; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
13706; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
13707; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v5
13708; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v4
13709; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
13710; GFX7-NEXT:    v_sub_f32_e32 v6, v6, v2
13711; GFX7-NEXT:    v_sub_f32_e32 v7, v7, v3
13712; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v6
13713; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
13714; GFX7-NEXT:    v_or_b32_e32 v6, v4, v5
13715; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
13716; GFX7-NEXT:    v_or_b32_e32 v5, v7, v4
13717; GFX7-NEXT:    flat_atomic_cmpswap v7, v[0:1], v[5:6] glc
13718; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13719; GFX7-NEXT:    buffer_wbinvl1
13720; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
13721; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v7
13722; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
13723; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
13724; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
13725; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
13726; GFX7-NEXT:    s_cbranch_execnz .LBB49_1
13727; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
13728; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
13729; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; Index 511 over 4-byte <2 x half> elements gives the 2044 (0x7fc) byte displacement seen in the checks above.
13730  %gep = getelementptr <2 x half>, ptr %ptr, i64 511
13731  %unused = atomicrmw fsub ptr %gep, <2 x half> %val seq_cst
13732  ret void
13733}
13734
13735; --------------------------------------------------------------------
13736; <2 x bfloat>
13737; --------------------------------------------------------------------
13738
; Agent-scope, seq_cst `atomicrmw fsub` of a <2 x bfloat> through a flat
; pointer with no address offset; the value fetched by the atomic is returned.
;
; In the expected output below, every checked target implements the operation
; as a retry loop (%atomicrmw.start) built around a 32-bit flat compare-and-swap
; that exits once the returned dword equals the previously observed one.
; Each bf16 lane is widened to f32 (shift left by 16 for the low lane, mask with
; 0xffff0000 for the high lane) and subtracted with v_sub_f32.
; On the tonga-and-newer targets the f32 results are narrowed back with a
; v_bfe_u32 / add 0x7fff / v_cmp_u_f32 / v_cndmask sequence (this looks like
; round-to-nearest-even with NaN quieting - TODO confirm against the bf16
; lowering) and repacked into one dword with v_perm_b32 or v_alignbit_b32.
; The hawaii expectations differ: the operand arrives in two registers (v2 and
; v3), the halves are packed with v_lshrrev_b32 / v_alignbit_b32 without that
; rounding sequence, and the result is returned in v0 and v1.
;
; Maintenance: the check lines are autogenerated by
; utils/update_llc_test_checks.py; regenerate them instead of editing by hand.
13739define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 {
13740; GFX12-LABEL: flat_agent_atomic_fsub_ret_v2bf16:
13741; GFX12:       ; %bb.0:
13742; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
13743; GFX12-NEXT:    s_wait_expcnt 0x0
13744; GFX12-NEXT:    s_wait_samplecnt 0x0
13745; GFX12-NEXT:    s_wait_bvhcnt 0x0
13746; GFX12-NEXT:    s_wait_kmcnt 0x0
13747; GFX12-NEXT:    flat_load_b32 v3, v[0:1]
13748; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
13749; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
13750; GFX12-NEXT:    s_mov_b32 s1, 0
13751; GFX12-NEXT:  .LBB50_1: ; %atomicrmw.start
13752; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
13753; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
13754; GFX12-NEXT:    v_mov_b32_e32 v6, v3
13755; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
13756; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
13757; GFX12-NEXT:    v_sub_f32_e32 v5, v5, v2
13758; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
13759; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
13760; GFX12-NEXT:    v_bfe_u32 v8, v5, 16, 1
13761; GFX12-NEXT:    v_sub_f32_e32 v3, v3, v4
13762; GFX12-NEXT:    v_or_b32_e32 v10, 0x400000, v5
13763; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
13764; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
13765; GFX12-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
13766; GFX12-NEXT:    v_bfe_u32 v7, v3, 16, 1
13767; GFX12-NEXT:    v_or_b32_e32 v9, 0x400000, v3
13768; GFX12-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
13769; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
13770; GFX12-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
13771; GFX12-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
13772; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
13773; GFX12-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
13774; GFX12-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
13775; GFX12-NEXT:    s_wait_storecnt 0x0
13776; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
13777; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
13778; GFX12-NEXT:    global_inv scope:SCOPE_DEV
13779; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
13780; GFX12-NEXT:    s_wait_alu 0xfffe
13781; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
13782; GFX12-NEXT:    s_wait_alu 0xfffe
13783; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
13784; GFX12-NEXT:    s_cbranch_execnz .LBB50_1
13785; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
13786; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
13787; GFX12-NEXT:    v_mov_b32_e32 v0, v3
13788; GFX12-NEXT:    s_wait_alu 0xfffe
13789; GFX12-NEXT:    s_setpc_b64 s[30:31]
13790;
13791; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2bf16:
13792; GFX940:       ; %bb.0:
13793; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13794; GFX940-NEXT:    flat_load_dword v3, v[0:1]
13795; GFX940-NEXT:    s_mov_b64 s[2:3], 0
13796; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
13797; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
13798; GFX940-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
13799; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
13800; GFX940-NEXT:  .LBB50_1: ; %atomicrmw.start
13801; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
13802; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13803; GFX940-NEXT:    v_mov_b32_e32 v7, v3
13804; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
13805; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
13806; GFX940-NEXT:    v_sub_f32_e32 v3, v3, v4
13807; GFX940-NEXT:    v_sub_f32_e32 v5, v5, v2
13808; GFX940-NEXT:    v_bfe_u32 v6, v3, 16, 1
13809; GFX940-NEXT:    v_bfe_u32 v9, v5, 16, 1
13810; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v3
13811; GFX940-NEXT:    v_or_b32_e32 v10, 0x400000, v5
13812; GFX940-NEXT:    v_add3_u32 v6, v6, v3, s4
13813; GFX940-NEXT:    v_add3_u32 v9, v9, v5, s4
13814; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
13815; GFX940-NEXT:    v_cmp_u_f32_e64 s[0:1], v3, v3
13816; GFX940-NEXT:    s_nop 0
13817; GFX940-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
13818; GFX940-NEXT:    v_cndmask_b32_e64 v3, v6, v8, s[0:1]
13819; GFX940-NEXT:    v_perm_b32 v6, v5, v3, s5
13820; GFX940-NEXT:    buffer_wbl2 sc1
13821; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[6:7] sc0
13822; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13823; GFX940-NEXT:    buffer_inv sc1
13824; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
13825; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
13826; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
13827; GFX940-NEXT:    s_cbranch_execnz .LBB50_1
13828; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
13829; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
13830; GFX940-NEXT:    v_mov_b32_e32 v0, v3
13831; GFX940-NEXT:    s_setpc_b64 s[30:31]
13832;
13833; GFX11-LABEL: flat_agent_atomic_fsub_ret_v2bf16:
13834; GFX11:       ; %bb.0:
13835; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13836; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
13837; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
13838; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
13839; GFX11-NEXT:    s_mov_b32 s1, 0
13840; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
13841; GFX11-NEXT:    .p2align 6
13842; GFX11-NEXT:  .LBB50_1: ; %atomicrmw.start
13843; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
13844; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13845; GFX11-NEXT:    v_mov_b32_e32 v6, v3
13846; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
13847; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
13848; GFX11-NEXT:    v_sub_f32_e32 v5, v5, v2
13849; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
13850; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
13851; GFX11-NEXT:    v_bfe_u32 v8, v5, 16, 1
13852; GFX11-NEXT:    v_sub_f32_e32 v3, v3, v4
13853; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v5
13854; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
13855; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
13856; GFX11-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
13857; GFX11-NEXT:    v_bfe_u32 v7, v3, 16, 1
13858; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v3
13859; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
13860; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
13861; GFX11-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
13862; GFX11-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
13863; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
13864; GFX11-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
13865; GFX11-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
13866; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
13867; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc
13868; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13869; GFX11-NEXT:    buffer_gl1_inv
13870; GFX11-NEXT:    buffer_gl0_inv
13871; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
13872; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
13873; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
13874; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
13875; GFX11-NEXT:    s_cbranch_execnz .LBB50_1
13876; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
13877; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
13878; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
13879; GFX11-NEXT:    v_mov_b32_e32 v0, v3
13880; GFX11-NEXT:    s_setpc_b64 s[30:31]
13881;
13882; GFX10-LABEL: flat_agent_atomic_fsub_ret_v2bf16:
13883; GFX10:       ; %bb.0:
13884; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13885; GFX10-NEXT:    flat_load_dword v3, v[0:1]
13886; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
13887; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
13888; GFX10-NEXT:    s_mov_b32 s5, 0
13889; GFX10-NEXT:  .LBB50_1: ; %atomicrmw.start
13890; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
13891; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13892; GFX10-NEXT:    v_mov_b32_e32 v6, v3
13893; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
13894; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
13895; GFX10-NEXT:    v_sub_f32_e32 v3, v3, v4
13896; GFX10-NEXT:    v_sub_f32_e32 v5, v5, v2
13897; GFX10-NEXT:    v_bfe_u32 v7, v3, 16, 1
13898; GFX10-NEXT:    v_bfe_u32 v8, v5, 16, 1
13899; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v3
13900; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v5
13901; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
13902; GFX10-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
13903; GFX10-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
13904; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v3, v3
13905; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
13906; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s4
13907; GFX10-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
13908; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
13909; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
13910; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13911; GFX10-NEXT:    buffer_gl1_inv
13912; GFX10-NEXT:    buffer_gl0_inv
13913; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
13914; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
13915; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
13916; GFX10-NEXT:    s_cbranch_execnz .LBB50_1
13917; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
13918; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
13919; GFX10-NEXT:    v_mov_b32_e32 v0, v3
13920; GFX10-NEXT:    s_setpc_b64 s[30:31]
13921;
13922; GFX90A-LABEL: flat_agent_atomic_fsub_ret_v2bf16:
13923; GFX90A:       ; %bb.0:
13924; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13925; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
13926; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
13927; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
13928; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
13929; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
13930; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
13931; GFX90A-NEXT:  .LBB50_1: ; %atomicrmw.start
13932; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
13933; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13934; GFX90A-NEXT:    v_mov_b32_e32 v7, v3
13935; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
13936; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
13937; GFX90A-NEXT:    v_sub_f32_e32 v3, v3, v4
13938; GFX90A-NEXT:    v_sub_f32_e32 v5, v5, v2
13939; GFX90A-NEXT:    v_bfe_u32 v6, v3, 16, 1
13940; GFX90A-NEXT:    v_bfe_u32 v9, v5, 16, 1
13941; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v3
13942; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v5
13943; GFX90A-NEXT:    v_add3_u32 v6, v6, v3, s8
13944; GFX90A-NEXT:    v_add3_u32 v9, v9, v5, s8
13945; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
13946; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
13947; GFX90A-NEXT:    v_cndmask_b32_e64 v3, v6, v8, s[4:5]
13948; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
13949; GFX90A-NEXT:    v_perm_b32 v6, v5, v3, s9
13950; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[6:7] glc
13951; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13952; GFX90A-NEXT:    buffer_wbinvl1
13953; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
13954; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
13955; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
13956; GFX90A-NEXT:    s_cbranch_execnz .LBB50_1
13957; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
13958; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
13959; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
13960; GFX90A-NEXT:    s_setpc_b64 s[30:31]
13961;
13962; GFX908-LABEL: flat_agent_atomic_fsub_ret_v2bf16:
13963; GFX908:       ; %bb.0:
13964; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13965; GFX908-NEXT:    flat_load_dword v3, v[0:1]
13966; GFX908-NEXT:    s_mov_b64 s[6:7], 0
13967; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
13968; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
13969; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
13970; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
13971; GFX908-NEXT:  .LBB50_1: ; %atomicrmw.start
13972; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
13973; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13974; GFX908-NEXT:    v_mov_b32_e32 v6, v3
13975; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
13976; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
13977; GFX908-NEXT:    v_sub_f32_e32 v3, v3, v4
13978; GFX908-NEXT:    v_sub_f32_e32 v5, v5, v2
13979; GFX908-NEXT:    v_bfe_u32 v7, v3, 16, 1
13980; GFX908-NEXT:    v_bfe_u32 v9, v5, 16, 1
13981; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v3
13982; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v5
13983; GFX908-NEXT:    v_add3_u32 v7, v7, v3, s8
13984; GFX908-NEXT:    v_add3_u32 v9, v9, v5, s8
13985; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
13986; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
13987; GFX908-NEXT:    v_cndmask_b32_e64 v3, v7, v8, s[4:5]
13988; GFX908-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
13989; GFX908-NEXT:    v_perm_b32 v5, v5, v3, s9
13990; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
13991; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13992; GFX908-NEXT:    buffer_wbinvl1
13993; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
13994; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
13995; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
13996; GFX908-NEXT:    s_cbranch_execnz .LBB50_1
13997; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
13998; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
13999; GFX908-NEXT:    v_mov_b32_e32 v0, v3
14000; GFX908-NEXT:    s_setpc_b64 s[30:31]
14001;
14002; GFX8-LABEL: flat_agent_atomic_fsub_ret_v2bf16:
14003; GFX8:       ; %bb.0:
14004; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14005; GFX8-NEXT:    flat_load_dword v3, v[0:1]
14006; GFX8-NEXT:    s_mov_b64 s[6:7], 0
14007; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
14008; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14009; GFX8-NEXT:  .LBB50_1: ; %atomicrmw.start
14010; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
14011; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14012; GFX8-NEXT:    v_mov_b32_e32 v6, v3
14013; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
14014; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
14015; GFX8-NEXT:    v_sub_f32_e32 v3, v3, v4
14016; GFX8-NEXT:    v_sub_f32_e32 v5, v5, v2
14017; GFX8-NEXT:    v_bfe_u32 v7, v3, 16, 1
14018; GFX8-NEXT:    v_bfe_u32 v9, v5, 16, 1
14019; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v3
14020; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v5
14021; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
14022; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
14023; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v5
14024; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
14025; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v3
14026; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
14027; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
14028; GFX8-NEXT:    v_cndmask_b32_e64 v3, v7, v8, s[4:5]
14029; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
14030; GFX8-NEXT:    v_alignbit_b32 v5, v5, v3, 16
14031; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
14032; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14033; GFX8-NEXT:    buffer_wbinvl1
14034; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
14035; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
14036; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
14037; GFX8-NEXT:    s_cbranch_execnz .LBB50_1
14038; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
14039; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
14040; GFX8-NEXT:    v_mov_b32_e32 v0, v3
14041; GFX8-NEXT:    s_setpc_b64 s[30:31]
14042;
14043; GFX7-LABEL: flat_agent_atomic_fsub_ret_v2bf16:
14044; GFX7:       ; %bb.0:
14045; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14046; GFX7-NEXT:    flat_load_dword v5, v[0:1]
14047; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
14048; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v3
14049; GFX7-NEXT:    s_mov_b64 s[4:5], 0
14050; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
14051; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14052; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
14053; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
14054; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
14055; GFX7-NEXT:  .LBB50_1: ; %atomicrmw.start
14056; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
14057; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
14058; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
14059; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
14060; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
14061; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
14062; GFX7-NEXT:    v_sub_f32_e32 v7, v7, v5
14063; GFX7-NEXT:    v_sub_f32_e32 v6, v6, v4
14064; GFX7-NEXT:    v_alignbit_b32 v3, v2, v3, 16
14065; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v7
14066; GFX7-NEXT:    v_alignbit_b32 v2, v2, v6, 16
14067; GFX7-NEXT:    flat_atomic_cmpswap v6, v[0:1], v[2:3] glc
14068; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14069; GFX7-NEXT:    buffer_wbinvl1
14070; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v3
14071; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
14072; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
14073; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
14074; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
14075; GFX7-NEXT:    s_cbranch_execnz .LBB50_1
14076; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
14077; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
14078; GFX7-NEXT:    v_mov_b32_e32 v0, v3
14079; GFX7-NEXT:    v_mov_b32_e32 v1, v2
14080; GFX7-NEXT:    s_setpc_b64 s[30:31]
; IR under test: one atomicrmw on %ptr itself; its result is returned unchanged.
14081  %result = atomicrmw fsub ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst
14082  ret <2 x bfloat> %result
14083}
14084
14085define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, <2 x bfloat> %val) #0 {
14086; GFX12-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos:
14087; GFX12:       ; %bb.0:
14088; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
14089; GFX12-NEXT:    s_wait_expcnt 0x0
14090; GFX12-NEXT:    s_wait_samplecnt 0x0
14091; GFX12-NEXT:    s_wait_bvhcnt 0x0
14092; GFX12-NEXT:    s_wait_kmcnt 0x0
14093; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
14094; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
14095; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14096; GFX12-NEXT:    s_mov_b32 s1, 0
14097; GFX12-NEXT:  .LBB51_1: ; %atomicrmw.start
14098; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
14099; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
14100; GFX12-NEXT:    v_mov_b32_e32 v6, v3
14101; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
14102; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
14103; GFX12-NEXT:    v_sub_f32_e32 v5, v5, v2
14104; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
14105; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
14106; GFX12-NEXT:    v_bfe_u32 v8, v5, 16, 1
14107; GFX12-NEXT:    v_sub_f32_e32 v3, v3, v4
14108; GFX12-NEXT:    v_or_b32_e32 v10, 0x400000, v5
14109; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
14110; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
14111; GFX12-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
14112; GFX12-NEXT:    v_bfe_u32 v7, v3, 16, 1
14113; GFX12-NEXT:    v_or_b32_e32 v9, 0x400000, v3
14114; GFX12-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
14115; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
14116; GFX12-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
14117; GFX12-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
14118; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
14119; GFX12-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
14120; GFX12-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
14121; GFX12-NEXT:    s_wait_storecnt 0x0
14122; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
14123; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
14124; GFX12-NEXT:    global_inv scope:SCOPE_DEV
14125; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
14126; GFX12-NEXT:    s_wait_alu 0xfffe
14127; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
14128; GFX12-NEXT:    s_wait_alu 0xfffe
14129; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
14130; GFX12-NEXT:    s_cbranch_execnz .LBB51_1
14131; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
14132; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
14133; GFX12-NEXT:    v_mov_b32_e32 v0, v3
14134; GFX12-NEXT:    s_wait_alu 0xfffe
14135; GFX12-NEXT:    s_setpc_b64 s[30:31]
14136;
14137; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos:
14138; GFX940:       ; %bb.0:
14139; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14140; GFX940-NEXT:    flat_load_dword v3, v[0:1] offset:2044
14141; GFX940-NEXT:    s_mov_b64 s[2:3], 0
14142; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
14143; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
14144; GFX940-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14145; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
14146; GFX940-NEXT:  .LBB51_1: ; %atomicrmw.start
14147; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
14148; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14149; GFX940-NEXT:    v_mov_b32_e32 v7, v3
14150; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
14151; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
14152; GFX940-NEXT:    v_sub_f32_e32 v3, v3, v4
14153; GFX940-NEXT:    v_sub_f32_e32 v5, v5, v2
14154; GFX940-NEXT:    v_bfe_u32 v6, v3, 16, 1
14155; GFX940-NEXT:    v_bfe_u32 v9, v5, 16, 1
14156; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v3
14157; GFX940-NEXT:    v_or_b32_e32 v10, 0x400000, v5
14158; GFX940-NEXT:    v_add3_u32 v6, v6, v3, s4
14159; GFX940-NEXT:    v_add3_u32 v9, v9, v5, s4
14160; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
14161; GFX940-NEXT:    v_cmp_u_f32_e64 s[0:1], v3, v3
14162; GFX940-NEXT:    s_nop 0
14163; GFX940-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
14164; GFX940-NEXT:    v_cndmask_b32_e64 v3, v6, v8, s[0:1]
14165; GFX940-NEXT:    v_perm_b32 v6, v5, v3, s5
14166; GFX940-NEXT:    buffer_wbl2 sc1
14167; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0
14168; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14169; GFX940-NEXT:    buffer_inv sc1
14170; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
14171; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
14172; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
14173; GFX940-NEXT:    s_cbranch_execnz .LBB51_1
14174; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
14175; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
14176; GFX940-NEXT:    v_mov_b32_e32 v0, v3
14177; GFX940-NEXT:    s_setpc_b64 s[30:31]
14178;
14179; GFX11-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos:
14180; GFX11:       ; %bb.0:
14181; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14182; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
14183; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
14184; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14185; GFX11-NEXT:    s_mov_b32 s1, 0
14186; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
14187; GFX11-NEXT:    .p2align 6
14188; GFX11-NEXT:  .LBB51_1: ; %atomicrmw.start
14189; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
14190; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14191; GFX11-NEXT:    v_mov_b32_e32 v6, v3
14192; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
14193; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
14194; GFX11-NEXT:    v_sub_f32_e32 v5, v5, v2
14195; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
14196; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
14197; GFX11-NEXT:    v_bfe_u32 v8, v5, 16, 1
14198; GFX11-NEXT:    v_sub_f32_e32 v3, v3, v4
14199; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v5
14200; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
14201; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
14202; GFX11-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
14203; GFX11-NEXT:    v_bfe_u32 v7, v3, 16, 1
14204; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v3
14205; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
14206; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
14207; GFX11-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
14208; GFX11-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
14209; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
14210; GFX11-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
14211; GFX11-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
14212; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
14213; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
14214; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14215; GFX11-NEXT:    buffer_gl1_inv
14216; GFX11-NEXT:    buffer_gl0_inv
14217; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
14218; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
14219; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
14220; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
14221; GFX11-NEXT:    s_cbranch_execnz .LBB51_1
14222; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
14223; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
14224; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
14225; GFX11-NEXT:    v_mov_b32_e32 v0, v3
14226; GFX11-NEXT:    s_setpc_b64 s[30:31]
14227;
14228; GFX10-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos:
14229; GFX10:       ; %bb.0:
14230; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14231; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fc, v0
14232; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
14233; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
14234; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14235; GFX10-NEXT:    s_mov_b32 s5, 0
14236; GFX10-NEXT:    flat_load_dword v0, v[3:4]
14237; GFX10-NEXT:  .LBB51_1: ; %atomicrmw.start
14238; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
14239; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14240; GFX10-NEXT:    v_mov_b32_e32 v6, v0
14241; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
14242; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
14243; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v1
14244; GFX10-NEXT:    v_sub_f32_e32 v5, v5, v2
14245; GFX10-NEXT:    v_bfe_u32 v7, v0, 16, 1
14246; GFX10-NEXT:    v_bfe_u32 v8, v5, 16, 1
14247; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v0
14248; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v5
14249; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
14250; GFX10-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
14251; GFX10-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
14252; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
14253; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
14254; GFX10-NEXT:    v_cndmask_b32_e64 v0, v7, v9, s4
14255; GFX10-NEXT:    v_perm_b32 v5, v5, v0, 0x7060302
14256; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
14257; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
14258; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14259; GFX10-NEXT:    buffer_gl1_inv
14260; GFX10-NEXT:    buffer_gl0_inv
14261; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
14262; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
14263; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
14264; GFX10-NEXT:    s_cbranch_execnz .LBB51_1
14265; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
14266; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
14267; GFX10-NEXT:    s_setpc_b64 s[30:31]
14268;
14269; GFX90A-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos:
14270; GFX90A:       ; %bb.0:
14271; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14272; GFX90A-NEXT:    flat_load_dword v3, v[0:1] offset:2044
14273; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
14274; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
14275; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
14276; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14277; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
14278; GFX90A-NEXT:  .LBB51_1: ; %atomicrmw.start
14279; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
14280; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14281; GFX90A-NEXT:    v_mov_b32_e32 v7, v3
14282; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
14283; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
14284; GFX90A-NEXT:    v_sub_f32_e32 v3, v3, v4
14285; GFX90A-NEXT:    v_sub_f32_e32 v5, v5, v2
14286; GFX90A-NEXT:    v_bfe_u32 v6, v3, 16, 1
14287; GFX90A-NEXT:    v_bfe_u32 v9, v5, 16, 1
14288; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v3
14289; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v5
14290; GFX90A-NEXT:    v_add3_u32 v6, v6, v3, s8
14291; GFX90A-NEXT:    v_add3_u32 v9, v9, v5, s8
14292; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
14293; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
14294; GFX90A-NEXT:    v_cndmask_b32_e64 v3, v6, v8, s[4:5]
14295; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
14296; GFX90A-NEXT:    v_perm_b32 v6, v5, v3, s9
14297; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc
14298; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14299; GFX90A-NEXT:    buffer_wbinvl1
14300; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
14301; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
14302; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
14303; GFX90A-NEXT:    s_cbranch_execnz .LBB51_1
14304; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
14305; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
14306; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
14307; GFX90A-NEXT:    s_setpc_b64 s[30:31]
14308;
14309; GFX908-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos:
14310; GFX908:       ; %bb.0:
14311; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14312; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2044
14313; GFX908-NEXT:    s_mov_b64 s[6:7], 0
14314; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
14315; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
14316; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14317; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
14318; GFX908-NEXT:  .LBB51_1: ; %atomicrmw.start
14319; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
14320; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14321; GFX908-NEXT:    v_mov_b32_e32 v6, v3
14322; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
14323; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
14324; GFX908-NEXT:    v_sub_f32_e32 v3, v3, v4
14325; GFX908-NEXT:    v_sub_f32_e32 v5, v5, v2
14326; GFX908-NEXT:    v_bfe_u32 v7, v3, 16, 1
14327; GFX908-NEXT:    v_bfe_u32 v9, v5, 16, 1
14328; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v3
14329; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v5
14330; GFX908-NEXT:    v_add3_u32 v7, v7, v3, s8
14331; GFX908-NEXT:    v_add3_u32 v9, v9, v5, s8
14332; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
14333; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
14334; GFX908-NEXT:    v_cndmask_b32_e64 v3, v7, v8, s[4:5]
14335; GFX908-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
14336; GFX908-NEXT:    v_perm_b32 v5, v5, v3, s9
14337; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc
14338; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14339; GFX908-NEXT:    buffer_wbinvl1
14340; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
14341; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
14342; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
14343; GFX908-NEXT:    s_cbranch_execnz .LBB51_1
14344; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
14345; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
14346; GFX908-NEXT:    v_mov_b32_e32 v0, v3
14347; GFX908-NEXT:    s_setpc_b64 s[30:31]
14348;
14349; GFX8-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos:
14350; GFX8:       ; %bb.0:
14351; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14352; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
14353; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
14354; GFX8-NEXT:    flat_load_dword v0, v[3:4]
14355; GFX8-NEXT:    s_mov_b64 s[6:7], 0
14356; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
14357; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14358; GFX8-NEXT:  .LBB51_1: ; %atomicrmw.start
14359; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
14360; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14361; GFX8-NEXT:    v_mov_b32_e32 v6, v0
14362; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
14363; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
14364; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
14365; GFX8-NEXT:    v_sub_f32_e32 v5, v5, v2
14366; GFX8-NEXT:    v_bfe_u32 v7, v0, 16, 1
14367; GFX8-NEXT:    v_bfe_u32 v9, v5, 16, 1
14368; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v0
14369; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v5
14370; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
14371; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
14372; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v5
14373; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
14374; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v0
14375; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
14376; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
14377; GFX8-NEXT:    v_cndmask_b32_e64 v0, v7, v8, s[4:5]
14378; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
14379; GFX8-NEXT:    v_alignbit_b32 v5, v5, v0, 16
14380; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
14381; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14382; GFX8-NEXT:    buffer_wbinvl1
14383; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
14384; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
14385; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
14386; GFX8-NEXT:    s_cbranch_execnz .LBB51_1
14387; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
14388; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
14389; GFX8-NEXT:    s_setpc_b64 s[30:31]
14390;
14391; GFX7-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos:
14392; GFX7:       ; %bb.0:
14393; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14394; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fc, v0
14395; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
14396; GFX7-NEXT:    flat_load_dword v0, v[4:5]
14397; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v2
14398; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
14399; GFX7-NEXT:    s_mov_b64 s[4:5], 0
14400; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
14401; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
14402; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14403; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
14404; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
14405; GFX7-NEXT:  .LBB51_1: ; %atomicrmw.start
14406; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
14407; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
14408; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
14409; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
14410; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
14411; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
14412; GFX7-NEXT:    v_sub_f32_e32 v7, v7, v3
14413; GFX7-NEXT:    v_sub_f32_e32 v6, v6, v2
14414; GFX7-NEXT:    v_alignbit_b32 v1, v1, v0, 16
14415; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v7
14416; GFX7-NEXT:    v_alignbit_b32 v0, v0, v6, 16
14417; GFX7-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
14418; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14419; GFX7-NEXT:    buffer_wbinvl1
14420; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
14421; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
14422; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
14423; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
14424; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
14425; GFX7-NEXT:    s_cbranch_execnz .LBB51_1
14426; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
14427; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
14428; GFX7-NEXT:    s_setpc_b64 s[30:31]
14429  %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511
14430  %result = atomicrmw fsub ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst
14431  ret <2 x bfloat> %result
14432}
14433
; Returning form of an agent-scope, seq_cst `atomicrmw fsub` on a <2 x bfloat>
; reached through a flat pointer at a negative constant offset: the GEP below
; indexes element -512 of <2 x bfloat>, i.e. byte offset -2048.
;
; What the recorded output below shows, per RUN line in the file header:
;   * Every target expands the operation into a loop (%atomicrmw.start) built
;     around a 32-bit flat compare-and-swap that retries until the value
;     returned by the swap equals the value the subtraction was based on.
;   * Only the gfx1200 output folds the displacement into the memory
;     instructions (offset:-2048). All other targets add 0xfffff800 with a -1
;     high-half carry to the pointer before the access.
;   * NOTE(review): the gfx940, gfx1100, gfx90a and gfx908 outputs compute that
;     adjusted address twice (once for the initial load, once for the cmpswap).
;     This is recorded codegen, not something to change in the test.
;
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py, do not hand-edit.
14434define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, <2 x bfloat> %val) #0 {
14435; GFX12-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg:
14436; GFX12:       ; %bb.0:
14437; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
14438; GFX12-NEXT:    s_wait_expcnt 0x0
14439; GFX12-NEXT:    s_wait_samplecnt 0x0
14440; GFX12-NEXT:    s_wait_bvhcnt 0x0
14441; GFX12-NEXT:    s_wait_kmcnt 0x0
14442; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:-2048
14443; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
14444; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14445; GFX12-NEXT:    s_mov_b32 s1, 0
14446; GFX12-NEXT:  .LBB52_1: ; %atomicrmw.start
14447; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
14448; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
14449; GFX12-NEXT:    v_mov_b32_e32 v6, v3
14450; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
14451; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
14452; GFX12-NEXT:    v_sub_f32_e32 v5, v5, v2
14453; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
14454; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
14455; GFX12-NEXT:    v_bfe_u32 v8, v5, 16, 1
14456; GFX12-NEXT:    v_sub_f32_e32 v3, v3, v4
14457; GFX12-NEXT:    v_or_b32_e32 v10, 0x400000, v5
14458; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
14459; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
14460; GFX12-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
14461; GFX12-NEXT:    v_bfe_u32 v7, v3, 16, 1
14462; GFX12-NEXT:    v_or_b32_e32 v9, 0x400000, v3
14463; GFX12-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
14464; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
14465; GFX12-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
14466; GFX12-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
14467; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
14468; GFX12-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
14469; GFX12-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
14470; GFX12-NEXT:    s_wait_storecnt 0x0
14471; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
14472; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
14473; GFX12-NEXT:    global_inv scope:SCOPE_DEV
14474; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
14475; GFX12-NEXT:    s_wait_alu 0xfffe
14476; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
14477; GFX12-NEXT:    s_wait_alu 0xfffe
14478; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
14479; GFX12-NEXT:    s_cbranch_execnz .LBB52_1
14480; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
14481; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
14482; GFX12-NEXT:    v_mov_b32_e32 v0, v3
14483; GFX12-NEXT:    s_wait_alu 0xfffe
14484; GFX12-NEXT:    s_setpc_b64 s[30:31]
14485;
14486; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg:
14487; GFX940:       ; %bb.0:
14488; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14489; GFX940-NEXT:    v_mov_b32_e32 v4, v0
14490; GFX940-NEXT:    v_mov_b32_e32 v5, v1
14491; GFX940-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v4
14492; GFX940-NEXT:    s_movk_i32 s0, 0xf800
14493; GFX940-NEXT:    s_nop 0
14494; GFX940-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v5, vcc
14495; GFX940-NEXT:    flat_load_dword v0, v[0:1]
14496; GFX940-NEXT:    s_mov_b32 s1, -1
14497; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1]
14498; GFX940-NEXT:    s_mov_b64 s[2:3], 0
14499; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
14500; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
14501; GFX940-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14502; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
14503; GFX940-NEXT:  .LBB52_1: ; %atomicrmw.start
14504; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
14505; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14506; GFX940-NEXT:    v_mov_b32_e32 v7, v0
14507; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 16, v7
14508; GFX940-NEXT:    v_and_b32_e32 v3, 0xffff0000, v7
14509; GFX940-NEXT:    v_sub_f32_e32 v0, v0, v1
14510; GFX940-NEXT:    v_sub_f32_e32 v3, v3, v2
14511; GFX940-NEXT:    v_bfe_u32 v6, v0, 16, 1
14512; GFX940-NEXT:    v_bfe_u32 v9, v3, 16, 1
14513; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v0
14514; GFX940-NEXT:    v_or_b32_e32 v10, 0x400000, v3
14515; GFX940-NEXT:    v_add3_u32 v6, v6, v0, s4
14516; GFX940-NEXT:    v_add3_u32 v9, v9, v3, s4
14517; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
14518; GFX940-NEXT:    v_cmp_u_f32_e64 s[0:1], v0, v0
14519; GFX940-NEXT:    s_nop 0
14520; GFX940-NEXT:    v_cndmask_b32_e32 v3, v9, v10, vcc
14521; GFX940-NEXT:    v_cndmask_b32_e64 v0, v6, v8, s[0:1]
14522; GFX940-NEXT:    v_perm_b32 v6, v3, v0, s5
14523; GFX940-NEXT:    buffer_wbl2 sc1
14524; GFX940-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0
14525; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14526; GFX940-NEXT:    buffer_inv sc1
14527; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v7
14528; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
14529; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
14530; GFX940-NEXT:    s_cbranch_execnz .LBB52_1
14531; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
14532; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
14533; GFX940-NEXT:    s_setpc_b64 s[30:31]
14534;
14535; GFX11-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg:
14536; GFX11:       ; %bb.0:
14537; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14538; GFX11-NEXT:    v_mov_b32_e32 v3, v0
14539; GFX11-NEXT:    s_mov_b32 s1, 0
14540; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
14541; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
14542; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
14543; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
14544; GFX11-NEXT:    flat_load_b32 v0, v[4:5]
14545; GFX11-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
14546; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
14547; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14548; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
14549; GFX11-NEXT:    .p2align 6
14550; GFX11-NEXT:  .LBB52_1: ; %atomicrmw.start
14551; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
14552; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14553; GFX11-NEXT:    v_mov_b32_e32 v6, v0
14554; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
14555; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
14556; GFX11-NEXT:    v_sub_f32_e32 v5, v5, v2
14557; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
14558; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
14559; GFX11-NEXT:    v_bfe_u32 v8, v5, 16, 1
14560; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
14561; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v5
14562; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
14563; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
14564; GFX11-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
14565; GFX11-NEXT:    v_bfe_u32 v7, v0, 16, 1
14566; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v0
14567; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v0, v0
14568; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
14569; GFX11-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
14570; GFX11-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
14571; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
14572; GFX11-NEXT:    v_cndmask_b32_e64 v0, v7, v9, s0
14573; GFX11-NEXT:    v_perm_b32 v5, v5, v0, 0x7060302
14574; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
14575; GFX11-NEXT:    flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc
14576; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14577; GFX11-NEXT:    buffer_gl1_inv
14578; GFX11-NEXT:    buffer_gl0_inv
14579; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
14580; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
14581; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
14582; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
14583; GFX11-NEXT:    s_cbranch_execnz .LBB52_1
14584; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
14585; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
14586; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
14587; GFX11-NEXT:    s_setpc_b64 s[30:31]
14588;
14589; GFX10-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg:
14590; GFX10:       ; %bb.0:
14591; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14592; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
14593; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
14594; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
14595; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14596; GFX10-NEXT:    s_mov_b32 s5, 0
14597; GFX10-NEXT:    flat_load_dword v0, v[3:4]
14598; GFX10-NEXT:  .LBB52_1: ; %atomicrmw.start
14599; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
14600; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14601; GFX10-NEXT:    v_mov_b32_e32 v6, v0
14602; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
14603; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
14604; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v1
14605; GFX10-NEXT:    v_sub_f32_e32 v5, v5, v2
14606; GFX10-NEXT:    v_bfe_u32 v7, v0, 16, 1
14607; GFX10-NEXT:    v_bfe_u32 v8, v5, 16, 1
14608; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v0
14609; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v5
14610; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
14611; GFX10-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
14612; GFX10-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
14613; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
14614; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
14615; GFX10-NEXT:    v_cndmask_b32_e64 v0, v7, v9, s4
14616; GFX10-NEXT:    v_perm_b32 v5, v5, v0, 0x7060302
14617; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
14618; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
14619; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14620; GFX10-NEXT:    buffer_gl1_inv
14621; GFX10-NEXT:    buffer_gl0_inv
14622; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
14623; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
14624; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
14625; GFX10-NEXT:    s_cbranch_execnz .LBB52_1
14626; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
14627; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
14628; GFX10-NEXT:    s_setpc_b64 s[30:31]
14629;
14630; GFX90A-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg:
14631; GFX90A:       ; %bb.0:
14632; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14633; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
14634; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
14635; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
14636; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
14637; GFX90A-NEXT:    flat_load_dword v0, v[0:1]
14638; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
14639; GFX90A-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
14640; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
14641; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14642; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
14643; GFX90A-NEXT:  .LBB52_1: ; %atomicrmw.start
14644; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
14645; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14646; GFX90A-NEXT:    v_mov_b32_e32 v7, v0
14647; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v7
14648; GFX90A-NEXT:    v_and_b32_e32 v3, 0xffff0000, v7
14649; GFX90A-NEXT:    v_sub_f32_e32 v0, v0, v1
14650; GFX90A-NEXT:    v_sub_f32_e32 v3, v3, v2
14651; GFX90A-NEXT:    v_bfe_u32 v6, v0, 16, 1
14652; GFX90A-NEXT:    v_bfe_u32 v9, v3, 16, 1
14653; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v0
14654; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v3
14655; GFX90A-NEXT:    v_add3_u32 v6, v6, v0, s8
14656; GFX90A-NEXT:    v_add3_u32 v9, v9, v3, s8
14657; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
14658; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
14659; GFX90A-NEXT:    v_cndmask_b32_e64 v0, v6, v8, s[4:5]
14660; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v9, v10, vcc
14661; GFX90A-NEXT:    v_perm_b32 v6, v3, v0, s9
14662; GFX90A-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[6:7] glc
14663; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14664; GFX90A-NEXT:    buffer_wbinvl1
14665; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v7
14666; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
14667; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
14668; GFX90A-NEXT:    s_cbranch_execnz .LBB52_1
14669; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
14670; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
14671; GFX90A-NEXT:    s_setpc_b64 s[30:31]
14672;
14673; GFX908-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg:
14674; GFX908:       ; %bb.0:
14675; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14676; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
14677; GFX908-NEXT:    v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
14678; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
14679; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
14680; GFX908-NEXT:    flat_load_dword v0, v[0:1]
14681; GFX908-NEXT:    s_mov_b64 s[6:7], 0
14682; GFX908-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
14683; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
14684; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14685; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
14686; GFX908-NEXT:  .LBB52_1: ; %atomicrmw.start
14687; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
14688; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14689; GFX908-NEXT:    v_mov_b32_e32 v6, v0
14690; GFX908-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
14691; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
14692; GFX908-NEXT:    v_sub_f32_e32 v0, v0, v1
14693; GFX908-NEXT:    v_sub_f32_e32 v5, v5, v2
14694; GFX908-NEXT:    v_bfe_u32 v7, v0, 16, 1
14695; GFX908-NEXT:    v_bfe_u32 v9, v5, 16, 1
14696; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v0
14697; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v5
14698; GFX908-NEXT:    v_add3_u32 v7, v7, v0, s8
14699; GFX908-NEXT:    v_add3_u32 v9, v9, v5, s8
14700; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
14701; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
14702; GFX908-NEXT:    v_cndmask_b32_e64 v0, v7, v8, s[4:5]
14703; GFX908-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
14704; GFX908-NEXT:    v_perm_b32 v5, v5, v0, s9
14705; GFX908-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
14706; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14707; GFX908-NEXT:    buffer_wbinvl1
14708; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
14709; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
14710; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
14711; GFX908-NEXT:    s_cbranch_execnz .LBB52_1
14712; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
14713; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
14714; GFX908-NEXT:    s_setpc_b64 s[30:31]
14715;
14716; GFX8-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg:
14717; GFX8:       ; %bb.0:
14718; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14719; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
14720; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, -1, v1, vcc
14721; GFX8-NEXT:    flat_load_dword v0, v[3:4]
14722; GFX8-NEXT:    s_mov_b64 s[6:7], 0
14723; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
14724; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14725; GFX8-NEXT:  .LBB52_1: ; %atomicrmw.start
14726; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
14727; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14728; GFX8-NEXT:    v_mov_b32_e32 v6, v0
14729; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
14730; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
14731; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
14732; GFX8-NEXT:    v_sub_f32_e32 v5, v5, v2
14733; GFX8-NEXT:    v_bfe_u32 v7, v0, 16, 1
14734; GFX8-NEXT:    v_bfe_u32 v9, v5, 16, 1
14735; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v0
14736; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v5
14737; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
14738; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
14739; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v5
14740; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
14741; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v0
14742; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
14743; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
14744; GFX8-NEXT:    v_cndmask_b32_e64 v0, v7, v8, s[4:5]
14745; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
14746; GFX8-NEXT:    v_alignbit_b32 v5, v5, v0, 16
14747; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
14748; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14749; GFX8-NEXT:    buffer_wbinvl1
14750; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
14751; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
14752; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
14753; GFX8-NEXT:    s_cbranch_execnz .LBB52_1
14754; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
14755; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
14756; GFX8-NEXT:    s_setpc_b64 s[30:31]
14757;
14758; GFX7-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg:
14759; GFX7:       ; %bb.0:
14760; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14761; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
14762; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, -1, v1, vcc
14763; GFX7-NEXT:    flat_load_dword v0, v[4:5]
14764; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v2
14765; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
14766; GFX7-NEXT:    s_mov_b64 s[4:5], 0
14767; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
14768; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
14769; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14770; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
14771; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
14772; GFX7-NEXT:  .LBB52_1: ; %atomicrmw.start
14773; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
14774; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
14775; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
14776; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
14777; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
14778; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
14779; GFX7-NEXT:    v_sub_f32_e32 v7, v7, v3
14780; GFX7-NEXT:    v_sub_f32_e32 v6, v6, v2
14781; GFX7-NEXT:    v_alignbit_b32 v1, v1, v0, 16
14782; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v7
14783; GFX7-NEXT:    v_alignbit_b32 v0, v0, v6, 16
14784; GFX7-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
14785; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14786; GFX7-NEXT:    buffer_wbinvl1
14787; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
14788; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
14789; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
14790; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
14791; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
14792; GFX7-NEXT:    s_cbranch_execnz .LBB52_1
14793; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
14794; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
14795; GFX7-NEXT:    s_setpc_b64 s[30:31]
; Element index -512 of <2 x bfloat> (two 16-bit lanes, 4 bytes per element)
; puts the access at byte offset -2048 from %ptr.
14796  %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512
; Agent-scope, sequentially consistent fsub; the value previously in memory is
; the function result.
14797  %result = atomicrmw fsub ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst
14798  ret <2 x bfloat> %result
14799}
14800
14801define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 {
14802; GFX12-LABEL: flat_agent_atomic_fsub_noret_v2bf16:
14803; GFX12:       ; %bb.0:
14804; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
14805; GFX12-NEXT:    s_wait_expcnt 0x0
14806; GFX12-NEXT:    s_wait_samplecnt 0x0
14807; GFX12-NEXT:    s_wait_bvhcnt 0x0
14808; GFX12-NEXT:    s_wait_kmcnt 0x0
14809; GFX12-NEXT:    flat_load_b32 v3, v[0:1]
14810; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
14811; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
14812; GFX12-NEXT:    s_mov_b32 s1, 0
14813; GFX12-NEXT:  .LBB53_1: ; %atomicrmw.start
14814; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
14815; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
14816; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
14817; GFX12-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
14818; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
14819; GFX12-NEXT:    v_sub_f32_e32 v2, v2, v4
14820; GFX12-NEXT:    v_sub_f32_e32 v6, v6, v5
14821; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
14822; GFX12-NEXT:    v_bfe_u32 v7, v2, 16, 1
14823; GFX12-NEXT:    v_bfe_u32 v8, v6, 16, 1
14824; GFX12-NEXT:    v_or_b32_e32 v9, 0x400000, v2
14825; GFX12-NEXT:    v_or_b32_e32 v10, 0x400000, v6
14826; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
14827; GFX12-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
14828; GFX12-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
14829; GFX12-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
14830; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
14831; GFX12-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
14832; GFX12-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s0
14833; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
14834; GFX12-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
14835; GFX12-NEXT:    s_wait_storecnt 0x0
14836; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
14837; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
14838; GFX12-NEXT:    global_inv scope:SCOPE_DEV
14839; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
14840; GFX12-NEXT:    v_mov_b32_e32 v3, v2
14841; GFX12-NEXT:    s_wait_alu 0xfffe
14842; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
14843; GFX12-NEXT:    s_wait_alu 0xfffe
14844; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
14845; GFX12-NEXT:    s_cbranch_execnz .LBB53_1
14846; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
14847; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
14848; GFX12-NEXT:    s_wait_alu 0xfffe
14849; GFX12-NEXT:    s_setpc_b64 s[30:31]
14850;
14851; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2bf16:
14852; GFX940:       ; %bb.0:
14853; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14854; GFX940-NEXT:    flat_load_dword v3, v[0:1]
14855; GFX940-NEXT:    s_mov_b64 s[2:3], 0
14856; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
14857; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
14858; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
14859; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
14860; GFX940-NEXT:  .LBB53_1: ; %atomicrmw.start
14861; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
14862; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14863; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
14864; GFX940-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
14865; GFX940-NEXT:    v_sub_f32_e32 v2, v2, v4
14866; GFX940-NEXT:    v_sub_f32_e32 v6, v6, v5
14867; GFX940-NEXT:    v_bfe_u32 v7, v2, 16, 1
14868; GFX940-NEXT:    v_bfe_u32 v9, v6, 16, 1
14869; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v2
14870; GFX940-NEXT:    v_or_b32_e32 v10, 0x400000, v6
14871; GFX940-NEXT:    v_add3_u32 v7, v7, v2, s4
14872; GFX940-NEXT:    v_add3_u32 v9, v9, v6, s4
14873; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
14874; GFX940-NEXT:    v_cmp_u_f32_e64 s[0:1], v2, v2
14875; GFX940-NEXT:    s_nop 0
14876; GFX940-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
14877; GFX940-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[0:1]
14878; GFX940-NEXT:    v_perm_b32 v2, v6, v2, s5
14879; GFX940-NEXT:    buffer_wbl2 sc1
14880; GFX940-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
14881; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14882; GFX940-NEXT:    buffer_inv sc1
14883; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
14884; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
14885; GFX940-NEXT:    v_mov_b32_e32 v3, v2
14886; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
14887; GFX940-NEXT:    s_cbranch_execnz .LBB53_1
14888; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
14889; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
14890; GFX940-NEXT:    s_setpc_b64 s[30:31]
14891;
14892; GFX11-LABEL: flat_agent_atomic_fsub_noret_v2bf16:
14893; GFX11:       ; %bb.0:
14894; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14895; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
14896; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
14897; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
14898; GFX11-NEXT:    s_mov_b32 s1, 0
14899; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
14900; GFX11-NEXT:    .p2align 6
14901; GFX11-NEXT:  .LBB53_1: ; %atomicrmw.start
14902; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
14903; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14904; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
14905; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
14906; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
14907; GFX11-NEXT:    v_sub_f32_e32 v2, v2, v4
14908; GFX11-NEXT:    v_sub_f32_e32 v6, v6, v5
14909; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
14910; GFX11-NEXT:    v_bfe_u32 v7, v2, 16, 1
14911; GFX11-NEXT:    v_bfe_u32 v8, v6, 16, 1
14912; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v2
14913; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v6
14914; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
14915; GFX11-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
14916; GFX11-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
14917; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
14918; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
14919; GFX11-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
14920; GFX11-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s0
14921; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
14922; GFX11-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
14923; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
14924; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
14925; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14926; GFX11-NEXT:    buffer_gl1_inv
14927; GFX11-NEXT:    buffer_gl0_inv
14928; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
14929; GFX11-NEXT:    v_mov_b32_e32 v3, v2
14930; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
14931; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
14932; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
14933; GFX11-NEXT:    s_cbranch_execnz .LBB53_1
14934; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
14935; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
14936; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
14937; GFX11-NEXT:    s_setpc_b64 s[30:31]
14938;
14939; GFX10-LABEL: flat_agent_atomic_fsub_noret_v2bf16:
14940; GFX10:       ; %bb.0:
14941; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14942; GFX10-NEXT:    flat_load_dword v3, v[0:1]
14943; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
14944; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
14945; GFX10-NEXT:    s_mov_b32 s5, 0
14946; GFX10-NEXT:  .LBB53_1: ; %atomicrmw.start
14947; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
14948; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14949; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
14950; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
14951; GFX10-NEXT:    v_sub_f32_e32 v2, v2, v4
14952; GFX10-NEXT:    v_sub_f32_e32 v6, v6, v5
14953; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
14954; GFX10-NEXT:    v_bfe_u32 v8, v6, 16, 1
14955; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v2
14956; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v6
14957; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
14958; GFX10-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
14959; GFX10-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
14960; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v2, v2
14961; GFX10-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
14962; GFX10-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s4
14963; GFX10-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
14964; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
14965; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
14966; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14967; GFX10-NEXT:    buffer_gl1_inv
14968; GFX10-NEXT:    buffer_gl0_inv
14969; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
14970; GFX10-NEXT:    v_mov_b32_e32 v3, v2
14971; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
14972; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
14973; GFX10-NEXT:    s_cbranch_execnz .LBB53_1
14974; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
14975; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
14976; GFX10-NEXT:    s_setpc_b64 s[30:31]
14977;
14978; GFX90A-LABEL: flat_agent_atomic_fsub_noret_v2bf16:
14979; GFX90A:       ; %bb.0:
14980; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14981; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
14982; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
14983; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
14984; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
14985; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
14986; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
14987; GFX90A-NEXT:  .LBB53_1: ; %atomicrmw.start
14988; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
14989; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14990; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
14991; GFX90A-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
14992; GFX90A-NEXT:    v_sub_f32_e32 v2, v2, v4
14993; GFX90A-NEXT:    v_sub_f32_e32 v6, v6, v5
14994; GFX90A-NEXT:    v_bfe_u32 v7, v2, 16, 1
14995; GFX90A-NEXT:    v_bfe_u32 v9, v6, 16, 1
14996; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v2
14997; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v6
14998; GFX90A-NEXT:    v_add3_u32 v7, v7, v2, s8
14999; GFX90A-NEXT:    v_add3_u32 v9, v9, v6, s8
15000; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
15001; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
15002; GFX90A-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
15003; GFX90A-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
15004; GFX90A-NEXT:    v_perm_b32 v2, v6, v2, s9
15005; GFX90A-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
15006; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15007; GFX90A-NEXT:    buffer_wbinvl1
15008; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
15009; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
15010; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
15011; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
15012; GFX90A-NEXT:    s_cbranch_execnz .LBB53_1
15013; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
15014; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
15015; GFX90A-NEXT:    s_setpc_b64 s[30:31]
15016;
15017; GFX908-LABEL: flat_agent_atomic_fsub_noret_v2bf16:
15018; GFX908:       ; %bb.0:
15019; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15020; GFX908-NEXT:    flat_load_dword v3, v[0:1]
15021; GFX908-NEXT:    s_mov_b64 s[6:7], 0
15022; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
15023; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
15024; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
15025; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
15026; GFX908-NEXT:  .LBB53_1: ; %atomicrmw.start
15027; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
15028; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15029; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
15030; GFX908-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
15031; GFX908-NEXT:    v_sub_f32_e32 v2, v2, v4
15032; GFX908-NEXT:    v_sub_f32_e32 v6, v6, v5
15033; GFX908-NEXT:    v_bfe_u32 v7, v2, 16, 1
15034; GFX908-NEXT:    v_bfe_u32 v9, v6, 16, 1
15035; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v2
15036; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v6
15037; GFX908-NEXT:    v_add3_u32 v7, v7, v2, s8
15038; GFX908-NEXT:    v_add3_u32 v9, v9, v6, s8
15039; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
15040; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
15041; GFX908-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
15042; GFX908-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
15043; GFX908-NEXT:    v_perm_b32 v2, v6, v2, s9
15044; GFX908-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
15045; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15046; GFX908-NEXT:    buffer_wbinvl1
15047; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
15048; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
15049; GFX908-NEXT:    v_mov_b32_e32 v3, v2
15050; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
15051; GFX908-NEXT:    s_cbranch_execnz .LBB53_1
15052; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
15053; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
15054; GFX908-NEXT:    s_setpc_b64 s[30:31]
15055;
15056; GFX8-LABEL: flat_agent_atomic_fsub_noret_v2bf16:
15057; GFX8:       ; %bb.0:
15058; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15059; GFX8-NEXT:    flat_load_dword v3, v[0:1]
15060; GFX8-NEXT:    s_mov_b64 s[6:7], 0
15061; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
15062; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
15063; GFX8-NEXT:  .LBB53_1: ; %atomicrmw.start
15064; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
15065; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15066; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
15067; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
15068; GFX8-NEXT:    v_sub_f32_e32 v2, v2, v4
15069; GFX8-NEXT:    v_sub_f32_e32 v6, v6, v5
15070; GFX8-NEXT:    v_bfe_u32 v7, v2, 16, 1
15071; GFX8-NEXT:    v_bfe_u32 v9, v6, 16, 1
15072; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v2
15073; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v6
15074; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
15075; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
15076; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v6
15077; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
15078; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v2
15079; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
15080; GFX8-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
15081; GFX8-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
15082; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
15083; GFX8-NEXT:    v_alignbit_b32 v2, v6, v2, 16
15084; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
15085; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15086; GFX8-NEXT:    buffer_wbinvl1
15087; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
15088; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
15089; GFX8-NEXT:    v_mov_b32_e32 v3, v2
15090; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
15091; GFX8-NEXT:    s_cbranch_execnz .LBB53_1
15092; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
15093; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
15094; GFX8-NEXT:    s_setpc_b64 s[30:31]
15095;
15096; GFX7-LABEL: flat_agent_atomic_fsub_noret_v2bf16:
15097; GFX7:       ; %bb.0:
15098; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15099; GFX7-NEXT:    flat_load_dword v5, v[0:1]
15100; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
15101; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
15102; GFX7-NEXT:    s_mov_b64 s[4:5], 0
15103; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
15104; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
15105; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15106; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
15107; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
15108; GFX7-NEXT:  .LBB53_1: ; %atomicrmw.start
15109; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
15110; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
15111; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
15112; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v4
15113; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v5
15114; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
15115; GFX7-NEXT:    v_sub_f32_e32 v7, v7, v3
15116; GFX7-NEXT:    v_sub_f32_e32 v6, v6, v2
15117; GFX7-NEXT:    v_alignbit_b32 v5, v4, v5, 16
15118; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v7
15119; GFX7-NEXT:    v_alignbit_b32 v4, v4, v6, 16
15120; GFX7-NEXT:    flat_atomic_cmpswap v6, v[0:1], v[4:5] glc
15121; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15122; GFX7-NEXT:    buffer_wbinvl1
15123; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
15124; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v6
15125; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
15126; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
15127; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
15128; GFX7-NEXT:    s_cbranch_execnz .LBB53_1
15129; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
15130; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
15131; GFX7-NEXT:    s_setpc_b64 s[30:31]
15132  %unused = atomicrmw fsub ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst
15133  ret void
15134}
15135
; Agent-scope, seq_cst `atomicrmw fsub` of a <2 x bfloat> value through a flat
; pointer, at a constant positive displacement from %ptr; the old value returned
; by the atomic is not used.
;
; What the generated checks below pin down:
;  - every target expands the operation into a compare-and-swap retry loop
;    (flat_atomic_cmpswap inside the %atomicrmw.start block);
;  - the gfx12, gfx940, gfx11, gfx90a and gfx908 check blocks carry the
;    displacement in the instruction's offset:2044 field, while the gfx10, gfx8
;    and gfx7 check blocks add 0x7fc to the address registers before the
;    initial load;
;  - the gfx7 check block has a different shape from the others: it reads the
;    value from two registers (v2 and v3) and packs the result with
;    v_lshrrev/v_alignbit, without the 0x7fff add sequence the other blocks show.
;
; NOTE(review): the "offset12b" part of the name presumably refers to a 12-bit
; immediate offset field (2044 is within the signed 12-bit range) - confirm
; against the naming used by the sibling tests.
;
; The check lines are generated by utils/update_llc_test_checks.py; regenerate
; them with that script instead of editing them by hand.
15136define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x bfloat> %val) #0 {
15137; GFX12-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos:
15138; GFX12:       ; %bb.0:
15139; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
15140; GFX12-NEXT:    s_wait_expcnt 0x0
15141; GFX12-NEXT:    s_wait_samplecnt 0x0
15142; GFX12-NEXT:    s_wait_bvhcnt 0x0
15143; GFX12-NEXT:    s_wait_kmcnt 0x0
15144; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
15145; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
15146; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
15147; GFX12-NEXT:    s_mov_b32 s1, 0
15148; GFX12-NEXT:  .LBB54_1: ; %atomicrmw.start
15149; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
15150; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
15151; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
15152; GFX12-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
15153; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
15154; GFX12-NEXT:    v_sub_f32_e32 v2, v2, v4
15155; GFX12-NEXT:    v_sub_f32_e32 v6, v6, v5
15156; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
15157; GFX12-NEXT:    v_bfe_u32 v7, v2, 16, 1
15158; GFX12-NEXT:    v_bfe_u32 v8, v6, 16, 1
15159; GFX12-NEXT:    v_or_b32_e32 v9, 0x400000, v2
15160; GFX12-NEXT:    v_or_b32_e32 v10, 0x400000, v6
15161; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
15162; GFX12-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
15163; GFX12-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
15164; GFX12-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
15165; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
15166; GFX12-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
15167; GFX12-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s0
15168; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
15169; GFX12-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
15170; GFX12-NEXT:    s_wait_storecnt 0x0
15171; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
15172; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
15173; GFX12-NEXT:    global_inv scope:SCOPE_DEV
15174; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
15175; GFX12-NEXT:    v_mov_b32_e32 v3, v2
15176; GFX12-NEXT:    s_wait_alu 0xfffe
15177; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
15178; GFX12-NEXT:    s_wait_alu 0xfffe
15179; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
15180; GFX12-NEXT:    s_cbranch_execnz .LBB54_1
15181; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
15182; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
15183; GFX12-NEXT:    s_wait_alu 0xfffe
15184; GFX12-NEXT:    s_setpc_b64 s[30:31]
15185;
15186; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos:
15187; GFX940:       ; %bb.0:
15188; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15189; GFX940-NEXT:    flat_load_dword v3, v[0:1] offset:2044
15190; GFX940-NEXT:    s_mov_b64 s[2:3], 0
15191; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
15192; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
15193; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
15194; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
15195; GFX940-NEXT:  .LBB54_1: ; %atomicrmw.start
15196; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
15197; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15198; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
15199; GFX940-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
15200; GFX940-NEXT:    v_sub_f32_e32 v2, v2, v4
15201; GFX940-NEXT:    v_sub_f32_e32 v6, v6, v5
15202; GFX940-NEXT:    v_bfe_u32 v7, v2, 16, 1
15203; GFX940-NEXT:    v_bfe_u32 v9, v6, 16, 1
15204; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v2
15205; GFX940-NEXT:    v_or_b32_e32 v10, 0x400000, v6
15206; GFX940-NEXT:    v_add3_u32 v7, v7, v2, s4
15207; GFX940-NEXT:    v_add3_u32 v9, v9, v6, s4
15208; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
15209; GFX940-NEXT:    v_cmp_u_f32_e64 s[0:1], v2, v2
15210; GFX940-NEXT:    s_nop 0
15211; GFX940-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
15212; GFX940-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[0:1]
15213; GFX940-NEXT:    v_perm_b32 v2, v6, v2, s5
15214; GFX940-NEXT:    buffer_wbl2 sc1
15215; GFX940-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0
15216; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15217; GFX940-NEXT:    buffer_inv sc1
15218; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
15219; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
15220; GFX940-NEXT:    v_mov_b32_e32 v3, v2
15221; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
15222; GFX940-NEXT:    s_cbranch_execnz .LBB54_1
15223; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
15224; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
15225; GFX940-NEXT:    s_setpc_b64 s[30:31]
15226;
15227; GFX11-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos:
15228; GFX11:       ; %bb.0:
15229; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15230; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
15231; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
15232; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
15233; GFX11-NEXT:    s_mov_b32 s1, 0
15234; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
15235; GFX11-NEXT:    .p2align 6
15236; GFX11-NEXT:  .LBB54_1: ; %atomicrmw.start
15237; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
15238; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15239; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
15240; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
15241; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
15242; GFX11-NEXT:    v_sub_f32_e32 v2, v2, v4
15243; GFX11-NEXT:    v_sub_f32_e32 v6, v6, v5
15244; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
15245; GFX11-NEXT:    v_bfe_u32 v7, v2, 16, 1
15246; GFX11-NEXT:    v_bfe_u32 v8, v6, 16, 1
15247; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v2
15248; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v6
15249; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
15250; GFX11-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
15251; GFX11-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
15252; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
15253; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
15254; GFX11-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
15255; GFX11-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s0
15256; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
15257; GFX11-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
15258; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
15259; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
15260; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15261; GFX11-NEXT:    buffer_gl1_inv
15262; GFX11-NEXT:    buffer_gl0_inv
15263; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
15264; GFX11-NEXT:    v_mov_b32_e32 v3, v2
15265; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
15266; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
15267; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
15268; GFX11-NEXT:    s_cbranch_execnz .LBB54_1
15269; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
15270; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
15271; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
15272; GFX11-NEXT:    s_setpc_b64 s[30:31]
15273;
15274; GFX10-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos:
15275; GFX10:       ; %bb.0:
15276; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15277; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
15278; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
15279; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
15280; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
15281; GFX10-NEXT:    s_mov_b32 s5, 0
15282; GFX10-NEXT:    flat_load_dword v3, v[0:1]
15283; GFX10-NEXT:  .LBB54_1: ; %atomicrmw.start
15284; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
15285; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15286; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
15287; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
15288; GFX10-NEXT:    v_sub_f32_e32 v2, v2, v4
15289; GFX10-NEXT:    v_sub_f32_e32 v6, v6, v5
15290; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
15291; GFX10-NEXT:    v_bfe_u32 v8, v6, 16, 1
15292; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v2
15293; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v6
15294; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
15295; GFX10-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
15296; GFX10-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
15297; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v2, v2
15298; GFX10-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
15299; GFX10-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s4
15300; GFX10-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
15301; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
15302; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
15303; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15304; GFX10-NEXT:    buffer_gl1_inv
15305; GFX10-NEXT:    buffer_gl0_inv
15306; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
15307; GFX10-NEXT:    v_mov_b32_e32 v3, v2
15308; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
15309; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
15310; GFX10-NEXT:    s_cbranch_execnz .LBB54_1
15311; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
15312; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
15313; GFX10-NEXT:    s_setpc_b64 s[30:31]
15314;
15315; GFX90A-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos:
15316; GFX90A:       ; %bb.0:
15317; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15318; GFX90A-NEXT:    flat_load_dword v3, v[0:1] offset:2044
15319; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
15320; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
15321; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
15322; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
15323; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
15324; GFX90A-NEXT:  .LBB54_1: ; %atomicrmw.start
15325; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
15326; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15327; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
15328; GFX90A-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
15329; GFX90A-NEXT:    v_sub_f32_e32 v2, v2, v4
15330; GFX90A-NEXT:    v_sub_f32_e32 v6, v6, v5
15331; GFX90A-NEXT:    v_bfe_u32 v7, v2, 16, 1
15332; GFX90A-NEXT:    v_bfe_u32 v9, v6, 16, 1
15333; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v2
15334; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v6
15335; GFX90A-NEXT:    v_add3_u32 v7, v7, v2, s8
15336; GFX90A-NEXT:    v_add3_u32 v9, v9, v6, s8
15337; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
15338; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
15339; GFX90A-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
15340; GFX90A-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
15341; GFX90A-NEXT:    v_perm_b32 v2, v6, v2, s9
15342; GFX90A-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
15343; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15344; GFX90A-NEXT:    buffer_wbinvl1
15345; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
15346; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
15347; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
15348; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
15349; GFX90A-NEXT:    s_cbranch_execnz .LBB54_1
15350; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
15351; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
15352; GFX90A-NEXT:    s_setpc_b64 s[30:31]
15353;
15354; GFX908-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos:
15355; GFX908:       ; %bb.0:
15356; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15357; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2044
15358; GFX908-NEXT:    s_mov_b64 s[6:7], 0
15359; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
15360; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
15361; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
15362; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
15363; GFX908-NEXT:  .LBB54_1: ; %atomicrmw.start
15364; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
15365; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15366; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
15367; GFX908-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
15368; GFX908-NEXT:    v_sub_f32_e32 v2, v2, v4
15369; GFX908-NEXT:    v_sub_f32_e32 v6, v6, v5
15370; GFX908-NEXT:    v_bfe_u32 v7, v2, 16, 1
15371; GFX908-NEXT:    v_bfe_u32 v9, v6, 16, 1
15372; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v2
15373; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v6
15374; GFX908-NEXT:    v_add3_u32 v7, v7, v2, s8
15375; GFX908-NEXT:    v_add3_u32 v9, v9, v6, s8
15376; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
15377; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
15378; GFX908-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
15379; GFX908-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
15380; GFX908-NEXT:    v_perm_b32 v2, v6, v2, s9
15381; GFX908-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
15382; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15383; GFX908-NEXT:    buffer_wbinvl1
15384; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
15385; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
15386; GFX908-NEXT:    v_mov_b32_e32 v3, v2
15387; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
15388; GFX908-NEXT:    s_cbranch_execnz .LBB54_1
15389; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
15390; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
15391; GFX908-NEXT:    s_setpc_b64 s[30:31]
15392;
15393; GFX8-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos:
15394; GFX8:       ; %bb.0:
15395; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15396; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
15397; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
15398; GFX8-NEXT:    flat_load_dword v3, v[0:1]
15399; GFX8-NEXT:    s_mov_b64 s[6:7], 0
15400; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
15401; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
15402; GFX8-NEXT:  .LBB54_1: ; %atomicrmw.start
15403; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
15404; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15405; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
15406; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
15407; GFX8-NEXT:    v_sub_f32_e32 v2, v2, v4
15408; GFX8-NEXT:    v_sub_f32_e32 v6, v6, v5
15409; GFX8-NEXT:    v_bfe_u32 v7, v2, 16, 1
15410; GFX8-NEXT:    v_bfe_u32 v9, v6, 16, 1
15411; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v2
15412; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v6
15413; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
15414; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
15415; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v6
15416; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
15417; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v2
15418; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
15419; GFX8-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
15420; GFX8-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
15421; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
15422; GFX8-NEXT:    v_alignbit_b32 v2, v6, v2, 16
15423; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
15424; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15425; GFX8-NEXT:    buffer_wbinvl1
15426; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
15427; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
15428; GFX8-NEXT:    v_mov_b32_e32 v3, v2
15429; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
15430; GFX8-NEXT:    s_cbranch_execnz .LBB54_1
15431; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
15432; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
15433; GFX8-NEXT:    s_setpc_b64 s[30:31]
15434;
15435; GFX7-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos:
15436; GFX7:       ; %bb.0:
15437; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15438; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
15439; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
15440; GFX7-NEXT:    flat_load_dword v5, v[0:1]
15441; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
15442; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
15443; GFX7-NEXT:    s_mov_b64 s[4:5], 0
15444; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
15445; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
15446; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15447; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
15448; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
15449; GFX7-NEXT:  .LBB54_1: ; %atomicrmw.start
15450; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
15451; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
15452; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
15453; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v4
15454; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v5
15455; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
15456; GFX7-NEXT:    v_sub_f32_e32 v7, v7, v3
15457; GFX7-NEXT:    v_sub_f32_e32 v6, v6, v2
15458; GFX7-NEXT:    v_alignbit_b32 v5, v4, v5, 16
15459; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v7
15460; GFX7-NEXT:    v_alignbit_b32 v4, v4, v6, 16
15461; GFX7-NEXT:    flat_atomic_cmpswap v6, v[0:1], v[4:5] glc
15462; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15463; GFX7-NEXT:    buffer_wbinvl1
15464; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
15465; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v6
15466; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
15467; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
15468; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
15469; GFX7-NEXT:    s_cbranch_execnz .LBB54_1
15470; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
15471; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
15472; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; Index 511 over 4-byte <2 x bfloat> elements is a 2044-byte (0x7fc)
  ; displacement, matching the offset:2044 / 0x7fc operands in the checks above.
15473  %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511
  ; The old value is bound to %unused and never read ("noret" form).
15474  %unused = atomicrmw fsub ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst
15475  ret void
15476}
15477
15478define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x bfloat> %val) #0 {
15479; GFX12-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg:
15480; GFX12:       ; %bb.0:
15481; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
15482; GFX12-NEXT:    s_wait_expcnt 0x0
15483; GFX12-NEXT:    s_wait_samplecnt 0x0
15484; GFX12-NEXT:    s_wait_bvhcnt 0x0
15485; GFX12-NEXT:    s_wait_kmcnt 0x0
15486; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:-2048
15487; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
15488; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
15489; GFX12-NEXT:    s_mov_b32 s1, 0
15490; GFX12-NEXT:  .LBB55_1: ; %atomicrmw.start
15491; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
15492; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
15493; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
15494; GFX12-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
15495; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
15496; GFX12-NEXT:    v_sub_f32_e32 v2, v2, v4
15497; GFX12-NEXT:    v_sub_f32_e32 v6, v6, v5
15498; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
15499; GFX12-NEXT:    v_bfe_u32 v7, v2, 16, 1
15500; GFX12-NEXT:    v_bfe_u32 v8, v6, 16, 1
15501; GFX12-NEXT:    v_or_b32_e32 v9, 0x400000, v2
15502; GFX12-NEXT:    v_or_b32_e32 v10, 0x400000, v6
15503; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
15504; GFX12-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
15505; GFX12-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
15506; GFX12-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
15507; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
15508; GFX12-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
15509; GFX12-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s0
15510; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
15511; GFX12-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
15512; GFX12-NEXT:    s_wait_storecnt 0x0
15513; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
15514; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
15515; GFX12-NEXT:    global_inv scope:SCOPE_DEV
15516; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
15517; GFX12-NEXT:    v_mov_b32_e32 v3, v2
15518; GFX12-NEXT:    s_wait_alu 0xfffe
15519; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
15520; GFX12-NEXT:    s_wait_alu 0xfffe
15521; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
15522; GFX12-NEXT:    s_cbranch_execnz .LBB55_1
15523; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
15524; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
15525; GFX12-NEXT:    s_wait_alu 0xfffe
15526; GFX12-NEXT:    s_setpc_b64 s[30:31]
15527;
15528; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg:
15529; GFX940:       ; %bb.0:
15530; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15531; GFX940-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
15532; GFX940-NEXT:    s_movk_i32 s0, 0xf800
15533; GFX940-NEXT:    s_nop 0
15534; GFX940-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
15535; GFX940-NEXT:    flat_load_dword v3, v[4:5]
15536; GFX940-NEXT:    s_mov_b32 s1, -1
15537; GFX940-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
15538; GFX940-NEXT:    s_mov_b64 s[2:3], 0
15539; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
15540; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
15541; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
15542; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
15543; GFX940-NEXT:  .LBB55_1: ; %atomicrmw.start
15544; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
15545; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15546; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
15547; GFX940-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
15548; GFX940-NEXT:    v_sub_f32_e32 v2, v2, v4
15549; GFX940-NEXT:    v_sub_f32_e32 v6, v6, v5
15550; GFX940-NEXT:    v_bfe_u32 v7, v2, 16, 1
15551; GFX940-NEXT:    v_bfe_u32 v9, v6, 16, 1
15552; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v2
15553; GFX940-NEXT:    v_or_b32_e32 v10, 0x400000, v6
15554; GFX940-NEXT:    v_add3_u32 v7, v7, v2, s4
15555; GFX940-NEXT:    v_add3_u32 v9, v9, v6, s4
15556; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
15557; GFX940-NEXT:    v_cmp_u_f32_e64 s[0:1], v2, v2
15558; GFX940-NEXT:    s_nop 0
15559; GFX940-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
15560; GFX940-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[0:1]
15561; GFX940-NEXT:    v_perm_b32 v2, v6, v2, s5
15562; GFX940-NEXT:    buffer_wbl2 sc1
15563; GFX940-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
15564; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15565; GFX940-NEXT:    buffer_inv sc1
15566; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
15567; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
15568; GFX940-NEXT:    v_mov_b32_e32 v3, v2
15569; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
15570; GFX940-NEXT:    s_cbranch_execnz .LBB55_1
15571; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
15572; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
15573; GFX940-NEXT:    s_setpc_b64 s[30:31]
15574;
15575; GFX11-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg:
15576; GFX11:       ; %bb.0:
15577; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15578; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
15579; GFX11-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
15580; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
15581; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
15582; GFX11-NEXT:    flat_load_b32 v3, v[3:4]
15583; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
15584; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
15585; GFX11-NEXT:    s_mov_b32 s1, 0
15586; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
15587; GFX11-NEXT:    .p2align 6
15588; GFX11-NEXT:  .LBB55_1: ; %atomicrmw.start
15589; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
15590; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15591; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
15592; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
15593; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
15594; GFX11-NEXT:    v_sub_f32_e32 v2, v2, v4
15595; GFX11-NEXT:    v_sub_f32_e32 v6, v6, v5
15596; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
15597; GFX11-NEXT:    v_bfe_u32 v7, v2, 16, 1
15598; GFX11-NEXT:    v_bfe_u32 v8, v6, 16, 1
15599; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v2
15600; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v6
15601; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
15602; GFX11-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
15603; GFX11-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
15604; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
15605; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
15606; GFX11-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
15607; GFX11-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s0
15608; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
15609; GFX11-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
15610; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
15611; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
15612; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15613; GFX11-NEXT:    buffer_gl1_inv
15614; GFX11-NEXT:    buffer_gl0_inv
15615; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
15616; GFX11-NEXT:    v_mov_b32_e32 v3, v2
15617; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
15618; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
15619; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
15620; GFX11-NEXT:    s_cbranch_execnz .LBB55_1
15621; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
15622; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
15623; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
15624; GFX11-NEXT:    s_setpc_b64 s[30:31]
15625;
15626; GFX10-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg:
15627; GFX10:       ; %bb.0:
15628; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15629; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
15630; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
15631; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
15632; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
15633; GFX10-NEXT:    s_mov_b32 s5, 0
15634; GFX10-NEXT:    flat_load_dword v3, v[0:1]
15635; GFX10-NEXT:  .LBB55_1: ; %atomicrmw.start
15636; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
15637; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15638; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
15639; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
15640; GFX10-NEXT:    v_sub_f32_e32 v2, v2, v4
15641; GFX10-NEXT:    v_sub_f32_e32 v6, v6, v5
15642; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
15643; GFX10-NEXT:    v_bfe_u32 v8, v6, 16, 1
15644; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v2
15645; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v6
15646; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
15647; GFX10-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
15648; GFX10-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
15649; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v2, v2
15650; GFX10-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
15651; GFX10-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s4
15652; GFX10-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
15653; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
15654; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
15655; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15656; GFX10-NEXT:    buffer_gl1_inv
15657; GFX10-NEXT:    buffer_gl0_inv
15658; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
15659; GFX10-NEXT:    v_mov_b32_e32 v3, v2
15660; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
15661; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
15662; GFX10-NEXT:    s_cbranch_execnz .LBB55_1
15663; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
15664; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
15665; GFX10-NEXT:    s_setpc_b64 s[30:31]
15666;
15667; GFX90A-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg:
15668; GFX90A:       ; %bb.0:
15669; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15670; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
15671; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
15672; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
15673; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
15674; GFX90A-NEXT:    flat_load_dword v1, v[0:1]
15675; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
15676; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
15677; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
15678; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
15679; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
15680; GFX90A-NEXT:  .LBB55_1: ; %atomicrmw.start
15681; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
15682; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15683; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
15684; GFX90A-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
15685; GFX90A-NEXT:    v_sub_f32_e32 v0, v0, v3
15686; GFX90A-NEXT:    v_sub_f32_e32 v6, v6, v2
15687; GFX90A-NEXT:    v_bfe_u32 v7, v0, 16, 1
15688; GFX90A-NEXT:    v_bfe_u32 v9, v6, 16, 1
15689; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v0
15690; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v6
15691; GFX90A-NEXT:    v_add3_u32 v7, v7, v0, s8
15692; GFX90A-NEXT:    v_add3_u32 v9, v9, v6, s8
15693; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
15694; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
15695; GFX90A-NEXT:    v_cndmask_b32_e64 v0, v7, v8, s[4:5]
15696; GFX90A-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
15697; GFX90A-NEXT:    v_perm_b32 v0, v6, v0, s9
15698; GFX90A-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
15699; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15700; GFX90A-NEXT:    buffer_wbinvl1
15701; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
15702; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
15703; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
15704; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
15705; GFX90A-NEXT:    s_cbranch_execnz .LBB55_1
15706; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
15707; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
15708; GFX90A-NEXT:    s_setpc_b64 s[30:31]
15709;
15710; GFX908-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg:
15711; GFX908:       ; %bb.0:
15712; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15713; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
15714; GFX908-NEXT:    v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
15715; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
15716; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
15717; GFX908-NEXT:    flat_load_dword v1, v[0:1]
15718; GFX908-NEXT:    s_mov_b64 s[6:7], 0
15719; GFX908-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
15720; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
15721; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
15722; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
15723; GFX908-NEXT:  .LBB55_1: ; %atomicrmw.start
15724; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
15725; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15726; GFX908-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
15727; GFX908-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
15728; GFX908-NEXT:    v_sub_f32_e32 v0, v0, v5
15729; GFX908-NEXT:    v_sub_f32_e32 v6, v6, v2
15730; GFX908-NEXT:    v_bfe_u32 v7, v0, 16, 1
15731; GFX908-NEXT:    v_bfe_u32 v9, v6, 16, 1
15732; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v0
15733; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v6
15734; GFX908-NEXT:    v_add3_u32 v7, v7, v0, s8
15735; GFX908-NEXT:    v_add3_u32 v9, v9, v6, s8
15736; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
15737; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
15738; GFX908-NEXT:    v_cndmask_b32_e64 v0, v7, v8, s[4:5]
15739; GFX908-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
15740; GFX908-NEXT:    v_perm_b32 v0, v6, v0, s9
15741; GFX908-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
15742; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15743; GFX908-NEXT:    buffer_wbinvl1
15744; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
15745; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
15746; GFX908-NEXT:    v_mov_b32_e32 v1, v0
15747; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
15748; GFX908-NEXT:    s_cbranch_execnz .LBB55_1
15749; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
15750; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
15751; GFX908-NEXT:    s_setpc_b64 s[30:31]
15752;
15753; GFX8-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg:
15754; GFX8:       ; %bb.0:
15755; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15756; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
15757; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
15758; GFX8-NEXT:    flat_load_dword v3, v[0:1]
15759; GFX8-NEXT:    s_mov_b64 s[6:7], 0
15760; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
15761; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
15762; GFX8-NEXT:  .LBB55_1: ; %atomicrmw.start
15763; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
15764; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15765; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
15766; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
15767; GFX8-NEXT:    v_sub_f32_e32 v2, v2, v4
15768; GFX8-NEXT:    v_sub_f32_e32 v6, v6, v5
15769; GFX8-NEXT:    v_bfe_u32 v7, v2, 16, 1
15770; GFX8-NEXT:    v_bfe_u32 v9, v6, 16, 1
15771; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v2
15772; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v6
15773; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
15774; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
15775; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v6
15776; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
15777; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v2
15778; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
15779; GFX8-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
15780; GFX8-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
15781; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
15782; GFX8-NEXT:    v_alignbit_b32 v2, v6, v2, 16
15783; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
15784; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15785; GFX8-NEXT:    buffer_wbinvl1
15786; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
15787; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
15788; GFX8-NEXT:    v_mov_b32_e32 v3, v2
15789; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
15790; GFX8-NEXT:    s_cbranch_execnz .LBB55_1
15791; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
15792; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
15793; GFX8-NEXT:    s_setpc_b64 s[30:31]
15794;
15795; GFX7-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg:
15796; GFX7:       ; %bb.0:
15797; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15798; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0xfffff800, v0
15799; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
15800; GFX7-NEXT:    flat_load_dword v5, v[0:1]
15801; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
15802; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
15803; GFX7-NEXT:    s_mov_b64 s[4:5], 0
15804; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
15805; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
15806; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15807; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
15808; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
15809; GFX7-NEXT:  .LBB55_1: ; %atomicrmw.start
15810; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
15811; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
15812; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
15813; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v4
15814; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v5
15815; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
15816; GFX7-NEXT:    v_sub_f32_e32 v7, v7, v3
15817; GFX7-NEXT:    v_sub_f32_e32 v6, v6, v2
15818; GFX7-NEXT:    v_alignbit_b32 v5, v4, v5, 16
15819; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v7
15820; GFX7-NEXT:    v_alignbit_b32 v4, v4, v6, 16
15821; GFX7-NEXT:    flat_atomic_cmpswap v6, v[0:1], v[4:5] glc
15822; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15823; GFX7-NEXT:    buffer_wbinvl1
15824; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
15825; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v6
15826; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
15827; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
15828; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
15829; GFX7-NEXT:    s_cbranch_execnz .LBB55_1
15830; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
15831; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
15832; GFX7-NEXT:    s_setpc_b64 s[30:31]
15833  %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512
15834  %unused = atomicrmw fsub ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst
15835  ret void
15836}
15837
; Returning variant of the <2 x bfloat> atomicrmw fsub test with a positive
; offset: the address is %ptr advanced by 511 <2 x bfloat> elements (2044
; bytes), the operation is seq_cst with no syncscope, and the value produced by
; the atomicrmw is returned.
;
; The check lines below expect a compare-and-swap loop (%atomicrmw.start to
; %atomicrmw.end) built around flat_atomic_cmpswap. The 2044-byte displacement
; appears as an instruction offset for the gfx1200, gfx940, gfx1100, gfx90a and
; gfx908 runs, and as an explicit 64-bit add of 0x7fc for the gfx1010, tonga and
; hawaii runs.
;
; These assertions are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing by hand.
15838define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, <2 x bfloat> %val) #0 {
15839; GFX12-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos:
15840; GFX12:       ; %bb.0:
15841; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
15842; GFX12-NEXT:    s_wait_expcnt 0x0
15843; GFX12-NEXT:    s_wait_samplecnt 0x0
15844; GFX12-NEXT:    s_wait_bvhcnt 0x0
15845; GFX12-NEXT:    s_wait_kmcnt 0x0
15846; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
15847; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
15848; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
15849; GFX12-NEXT:    s_mov_b32 s1, 0
15850; GFX12-NEXT:  .LBB56_1: ; %atomicrmw.start
15851; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
15852; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
15853; GFX12-NEXT:    v_mov_b32_e32 v6, v3
15854; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
15855; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
15856; GFX12-NEXT:    v_sub_f32_e32 v5, v5, v2
15857; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
15858; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
15859; GFX12-NEXT:    v_bfe_u32 v8, v5, 16, 1
15860; GFX12-NEXT:    v_sub_f32_e32 v3, v3, v4
15861; GFX12-NEXT:    v_or_b32_e32 v10, 0x400000, v5
15862; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
15863; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
15864; GFX12-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
15865; GFX12-NEXT:    v_bfe_u32 v7, v3, 16, 1
15866; GFX12-NEXT:    v_or_b32_e32 v9, 0x400000, v3
15867; GFX12-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
15868; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
15869; GFX12-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
15870; GFX12-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
15871; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
15872; GFX12-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
15873; GFX12-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
15874; GFX12-NEXT:    global_wb scope:SCOPE_SYS
15875; GFX12-NEXT:    s_wait_storecnt 0x0
15876; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
15877; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
15878; GFX12-NEXT:    global_inv scope:SCOPE_SYS
15879; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
15880; GFX12-NEXT:    s_wait_alu 0xfffe
15881; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
15882; GFX12-NEXT:    s_wait_alu 0xfffe
15883; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
15884; GFX12-NEXT:    s_cbranch_execnz .LBB56_1
15885; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
15886; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
15887; GFX12-NEXT:    v_mov_b32_e32 v0, v3
15888; GFX12-NEXT:    s_wait_alu 0xfffe
15889; GFX12-NEXT:    s_setpc_b64 s[30:31]
15890;
15891; GFX940-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos:
15892; GFX940:       ; %bb.0:
15893; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15894; GFX940-NEXT:    flat_load_dword v3, v[0:1] offset:2044
15895; GFX940-NEXT:    s_mov_b64 s[2:3], 0
15896; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
15897; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
15898; GFX940-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
15899; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
15900; GFX940-NEXT:  .LBB56_1: ; %atomicrmw.start
15901; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
15902; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15903; GFX940-NEXT:    v_mov_b32_e32 v7, v3
15904; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
15905; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
15906; GFX940-NEXT:    v_sub_f32_e32 v3, v3, v4
15907; GFX940-NEXT:    v_sub_f32_e32 v5, v5, v2
15908; GFX940-NEXT:    v_bfe_u32 v6, v3, 16, 1
15909; GFX940-NEXT:    v_bfe_u32 v9, v5, 16, 1
15910; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v3
15911; GFX940-NEXT:    v_or_b32_e32 v10, 0x400000, v5
15912; GFX940-NEXT:    v_add3_u32 v6, v6, v3, s4
15913; GFX940-NEXT:    v_add3_u32 v9, v9, v5, s4
15914; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
15915; GFX940-NEXT:    v_cmp_u_f32_e64 s[0:1], v3, v3
15916; GFX940-NEXT:    s_nop 0
15917; GFX940-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
15918; GFX940-NEXT:    v_cndmask_b32_e64 v3, v6, v8, s[0:1]
15919; GFX940-NEXT:    v_perm_b32 v6, v5, v3, s5
15920; GFX940-NEXT:    buffer_wbl2 sc0 sc1
15921; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 sc1
15922; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15923; GFX940-NEXT:    buffer_inv sc0 sc1
15924; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
15925; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
15926; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
15927; GFX940-NEXT:    s_cbranch_execnz .LBB56_1
15928; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
15929; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
15930; GFX940-NEXT:    v_mov_b32_e32 v0, v3
15931; GFX940-NEXT:    s_setpc_b64 s[30:31]
15932;
15933; GFX11-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos:
15934; GFX11:       ; %bb.0:
15935; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15936; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
15937; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
15938; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
15939; GFX11-NEXT:    s_mov_b32 s1, 0
15940; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
15941; GFX11-NEXT:    .p2align 6
15942; GFX11-NEXT:  .LBB56_1: ; %atomicrmw.start
15943; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
15944; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15945; GFX11-NEXT:    v_mov_b32_e32 v6, v3
15946; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
15947; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
15948; GFX11-NEXT:    v_sub_f32_e32 v5, v5, v2
15949; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
15950; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
15951; GFX11-NEXT:    v_bfe_u32 v8, v5, 16, 1
15952; GFX11-NEXT:    v_sub_f32_e32 v3, v3, v4
15953; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v5
15954; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
15955; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
15956; GFX11-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
15957; GFX11-NEXT:    v_bfe_u32 v7, v3, 16, 1
15958; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v3
15959; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
15960; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
15961; GFX11-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
15962; GFX11-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
15963; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
15964; GFX11-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
15965; GFX11-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
15966; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
15967; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
15968; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15969; GFX11-NEXT:    buffer_gl1_inv
15970; GFX11-NEXT:    buffer_gl0_inv
15971; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
15972; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
15973; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
15974; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
15975; GFX11-NEXT:    s_cbranch_execnz .LBB56_1
15976; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
15977; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
15978; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
15979; GFX11-NEXT:    v_mov_b32_e32 v0, v3
15980; GFX11-NEXT:    s_setpc_b64 s[30:31]
15981;
15982; GFX10-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos:
15983; GFX10:       ; %bb.0:
15984; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15985; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fc, v0
15986; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
15987; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
15988; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
15989; GFX10-NEXT:    s_mov_b32 s5, 0
15990; GFX10-NEXT:    flat_load_dword v0, v[3:4]
15991; GFX10-NEXT:  .LBB56_1: ; %atomicrmw.start
15992; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
15993; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15994; GFX10-NEXT:    v_mov_b32_e32 v6, v0
15995; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
15996; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
15997; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v1
15998; GFX10-NEXT:    v_sub_f32_e32 v5, v5, v2
15999; GFX10-NEXT:    v_bfe_u32 v7, v0, 16, 1
16000; GFX10-NEXT:    v_bfe_u32 v8, v5, 16, 1
16001; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v0
16002; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v5
16003; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
16004; GFX10-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
16005; GFX10-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
16006; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
16007; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
16008; GFX10-NEXT:    v_cndmask_b32_e64 v0, v7, v9, s4
16009; GFX10-NEXT:    v_perm_b32 v5, v5, v0, 0x7060302
16010; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
16011; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
16012; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16013; GFX10-NEXT:    buffer_gl1_inv
16014; GFX10-NEXT:    buffer_gl0_inv
16015; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
16016; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
16017; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
16018; GFX10-NEXT:    s_cbranch_execnz .LBB56_1
16019; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
16020; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
16021; GFX10-NEXT:    s_setpc_b64 s[30:31]
16022;
16023; GFX90A-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos:
16024; GFX90A:       ; %bb.0:
16025; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16026; GFX90A-NEXT:    flat_load_dword v3, v[0:1] offset:2044
16027; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
16028; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
16029; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
16030; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
16031; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
16032; GFX90A-NEXT:  .LBB56_1: ; %atomicrmw.start
16033; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
16034; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16035; GFX90A-NEXT:    v_mov_b32_e32 v7, v3
16036; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
16037; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
16038; GFX90A-NEXT:    v_sub_f32_e32 v3, v3, v4
16039; GFX90A-NEXT:    v_sub_f32_e32 v5, v5, v2
16040; GFX90A-NEXT:    v_bfe_u32 v6, v3, 16, 1
16041; GFX90A-NEXT:    v_bfe_u32 v9, v5, 16, 1
16042; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v3
16043; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v5
16044; GFX90A-NEXT:    v_add3_u32 v6, v6, v3, s8
16045; GFX90A-NEXT:    v_add3_u32 v9, v9, v5, s8
16046; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
16047; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
16048; GFX90A-NEXT:    v_cndmask_b32_e64 v3, v6, v8, s[4:5]
16049; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
16050; GFX90A-NEXT:    v_perm_b32 v6, v5, v3, s9
16051; GFX90A-NEXT:    buffer_wbl2
16052; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc
16053; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16054; GFX90A-NEXT:    buffer_invl2
16055; GFX90A-NEXT:    buffer_wbinvl1
16056; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
16057; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
16058; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
16059; GFX90A-NEXT:    s_cbranch_execnz .LBB56_1
16060; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
16061; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
16062; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
16063; GFX90A-NEXT:    s_setpc_b64 s[30:31]
16064;
16065; GFX908-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos:
16066; GFX908:       ; %bb.0:
16067; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16068; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2044
16069; GFX908-NEXT:    s_mov_b64 s[6:7], 0
16070; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
16071; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
16072; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
16073; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
16074; GFX908-NEXT:  .LBB56_1: ; %atomicrmw.start
16075; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
16076; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16077; GFX908-NEXT:    v_mov_b32_e32 v6, v3
16078; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
16079; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
16080; GFX908-NEXT:    v_sub_f32_e32 v3, v3, v4
16081; GFX908-NEXT:    v_sub_f32_e32 v5, v5, v2
16082; GFX908-NEXT:    v_bfe_u32 v7, v3, 16, 1
16083; GFX908-NEXT:    v_bfe_u32 v9, v5, 16, 1
16084; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v3
16085; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v5
16086; GFX908-NEXT:    v_add3_u32 v7, v7, v3, s8
16087; GFX908-NEXT:    v_add3_u32 v9, v9, v5, s8
16088; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
16089; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
16090; GFX908-NEXT:    v_cndmask_b32_e64 v3, v7, v8, s[4:5]
16091; GFX908-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
16092; GFX908-NEXT:    v_perm_b32 v5, v5, v3, s9
16093; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc
16094; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16095; GFX908-NEXT:    buffer_wbinvl1
16096; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
16097; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
16098; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
16099; GFX908-NEXT:    s_cbranch_execnz .LBB56_1
16100; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
16101; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
16102; GFX908-NEXT:    v_mov_b32_e32 v0, v3
16103; GFX908-NEXT:    s_setpc_b64 s[30:31]
16104;
16105; GFX8-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos:
16106; GFX8:       ; %bb.0:
16107; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16108; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
16109; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
16110; GFX8-NEXT:    flat_load_dword v0, v[3:4]
16111; GFX8-NEXT:    s_mov_b64 s[6:7], 0
16112; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
16113; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
16114; GFX8-NEXT:  .LBB56_1: ; %atomicrmw.start
16115; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
16116; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16117; GFX8-NEXT:    v_mov_b32_e32 v6, v0
16118; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
16119; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
16120; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
16121; GFX8-NEXT:    v_sub_f32_e32 v5, v5, v2
16122; GFX8-NEXT:    v_bfe_u32 v7, v0, 16, 1
16123; GFX8-NEXT:    v_bfe_u32 v9, v5, 16, 1
16124; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v0
16125; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v5
16126; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
16127; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
16128; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v5
16129; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
16130; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v0
16131; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
16132; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
16133; GFX8-NEXT:    v_cndmask_b32_e64 v0, v7, v8, s[4:5]
16134; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
16135; GFX8-NEXT:    v_alignbit_b32 v5, v5, v0, 16
16136; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
16137; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16138; GFX8-NEXT:    buffer_wbinvl1
16139; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
16140; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
16141; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
16142; GFX8-NEXT:    s_cbranch_execnz .LBB56_1
16143; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
16144; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
16145; GFX8-NEXT:    s_setpc_b64 s[30:31]
16146;
16147; GFX7-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos:
16148; GFX7:       ; %bb.0:
16149; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16150; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fc, v0
16151; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
16152; GFX7-NEXT:    flat_load_dword v0, v[4:5]
16153; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v2
16154; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
16155; GFX7-NEXT:    s_mov_b64 s[4:5], 0
16156; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
16157; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
16158; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16159; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
16160; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
16161; GFX7-NEXT:  .LBB56_1: ; %atomicrmw.start
16162; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
16163; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
16164; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
16165; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
16166; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
16167; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
16168; GFX7-NEXT:    v_sub_f32_e32 v7, v7, v3
16169; GFX7-NEXT:    v_sub_f32_e32 v6, v6, v2
16170; GFX7-NEXT:    v_alignbit_b32 v1, v1, v0, 16
16171; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v7
16172; GFX7-NEXT:    v_alignbit_b32 v0, v0, v6, 16
16173; GFX7-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
16174; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16175; GFX7-NEXT:    buffer_wbinvl1
16176; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
16177; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
16178; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
16179; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
16180; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
16181; GFX7-NEXT:    s_cbranch_execnz .LBB56_1
16182; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
16183; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
16184; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; Index 511 in units of <2 x bfloat> (4 bytes each) is byte offset 2044 (0x7fc).
16185  %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511
  ; No syncscope is given here, unlike the syncscope("agent") tests elsewhere
  ; in this file; the atomicrmw result is what the function returns.
16186  %result = atomicrmw fsub ptr %gep, <2 x bfloat> %val seq_cst
16187  ret <2 x bfloat> %result
16188}
16189
16190define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x bfloat> %val) #0 {
16191; GFX12-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos:
16192; GFX12:       ; %bb.0:
16193; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
16194; GFX12-NEXT:    s_wait_expcnt 0x0
16195; GFX12-NEXT:    s_wait_samplecnt 0x0
16196; GFX12-NEXT:    s_wait_bvhcnt 0x0
16197; GFX12-NEXT:    s_wait_kmcnt 0x0
16198; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
16199; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
16200; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
16201; GFX12-NEXT:    s_mov_b32 s1, 0
16202; GFX12-NEXT:  .LBB57_1: ; %atomicrmw.start
16203; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
16204; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
16205; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
16206; GFX12-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
16207; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
16208; GFX12-NEXT:    v_sub_f32_e32 v2, v2, v4
16209; GFX12-NEXT:    v_sub_f32_e32 v6, v6, v5
16210; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
16211; GFX12-NEXT:    v_bfe_u32 v7, v2, 16, 1
16212; GFX12-NEXT:    v_bfe_u32 v8, v6, 16, 1
16213; GFX12-NEXT:    v_or_b32_e32 v9, 0x400000, v2
16214; GFX12-NEXT:    v_or_b32_e32 v10, 0x400000, v6
16215; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
16216; GFX12-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
16217; GFX12-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
16218; GFX12-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
16219; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
16220; GFX12-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
16221; GFX12-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s0
16222; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
16223; GFX12-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
16224; GFX12-NEXT:    global_wb scope:SCOPE_SYS
16225; GFX12-NEXT:    s_wait_storecnt 0x0
16226; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
16227; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
16228; GFX12-NEXT:    global_inv scope:SCOPE_SYS
16229; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
16230; GFX12-NEXT:    v_mov_b32_e32 v3, v2
16231; GFX12-NEXT:    s_wait_alu 0xfffe
16232; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
16233; GFX12-NEXT:    s_wait_alu 0xfffe
16234; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
16235; GFX12-NEXT:    s_cbranch_execnz .LBB57_1
16236; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
16237; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
16238; GFX12-NEXT:    s_wait_alu 0xfffe
16239; GFX12-NEXT:    s_setpc_b64 s[30:31]
16240;
16241; GFX940-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos:
16242; GFX940:       ; %bb.0:
16243; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16244; GFX940-NEXT:    flat_load_dword v3, v[0:1] offset:2044
16245; GFX940-NEXT:    s_mov_b64 s[2:3], 0
16246; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
16247; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
16248; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
16249; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
16250; GFX940-NEXT:  .LBB57_1: ; %atomicrmw.start
16251; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
16252; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16253; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
16254; GFX940-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
16255; GFX940-NEXT:    v_sub_f32_e32 v2, v2, v4
16256; GFX940-NEXT:    v_sub_f32_e32 v6, v6, v5
16257; GFX940-NEXT:    v_bfe_u32 v7, v2, 16, 1
16258; GFX940-NEXT:    v_bfe_u32 v9, v6, 16, 1
16259; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v2
16260; GFX940-NEXT:    v_or_b32_e32 v10, 0x400000, v6
16261; GFX940-NEXT:    v_add3_u32 v7, v7, v2, s4
16262; GFX940-NEXT:    v_add3_u32 v9, v9, v6, s4
16263; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
16264; GFX940-NEXT:    v_cmp_u_f32_e64 s[0:1], v2, v2
16265; GFX940-NEXT:    s_nop 0
16266; GFX940-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
16267; GFX940-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[0:1]
16268; GFX940-NEXT:    v_perm_b32 v2, v6, v2, s5
16269; GFX940-NEXT:    buffer_wbl2 sc0 sc1
16270; GFX940-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1
16271; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16272; GFX940-NEXT:    buffer_inv sc0 sc1
16273; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
16274; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
16275; GFX940-NEXT:    v_mov_b32_e32 v3, v2
16276; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
16277; GFX940-NEXT:    s_cbranch_execnz .LBB57_1
16278; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
16279; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
16280; GFX940-NEXT:    s_setpc_b64 s[30:31]
16281;
16282; GFX11-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos:
16283; GFX11:       ; %bb.0:
16284; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16285; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
16286; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
16287; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
16288; GFX11-NEXT:    s_mov_b32 s1, 0
16289; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
16290; GFX11-NEXT:    .p2align 6
16291; GFX11-NEXT:  .LBB57_1: ; %atomicrmw.start
16292; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
16293; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16294; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
16295; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
16296; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
16297; GFX11-NEXT:    v_sub_f32_e32 v2, v2, v4
16298; GFX11-NEXT:    v_sub_f32_e32 v6, v6, v5
16299; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
16300; GFX11-NEXT:    v_bfe_u32 v7, v2, 16, 1
16301; GFX11-NEXT:    v_bfe_u32 v8, v6, 16, 1
16302; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v2
16303; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v6
16304; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
16305; GFX11-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
16306; GFX11-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
16307; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
16308; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
16309; GFX11-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
16310; GFX11-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s0
16311; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
16312; GFX11-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
16313; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
16314; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
16315; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16316; GFX11-NEXT:    buffer_gl1_inv
16317; GFX11-NEXT:    buffer_gl0_inv
16318; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
16319; GFX11-NEXT:    v_mov_b32_e32 v3, v2
16320; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
16321; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
16322; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
16323; GFX11-NEXT:    s_cbranch_execnz .LBB57_1
16324; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
16325; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
16326; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
16327; GFX11-NEXT:    s_setpc_b64 s[30:31]
16328;
16329; GFX10-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos:
16330; GFX10:       ; %bb.0:
16331; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16332; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
16333; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
16334; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
16335; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
16336; GFX10-NEXT:    s_mov_b32 s5, 0
16337; GFX10-NEXT:    flat_load_dword v3, v[0:1]
16338; GFX10-NEXT:  .LBB57_1: ; %atomicrmw.start
16339; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
16340; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16341; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
16342; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
16343; GFX10-NEXT:    v_sub_f32_e32 v2, v2, v4
16344; GFX10-NEXT:    v_sub_f32_e32 v6, v6, v5
16345; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
16346; GFX10-NEXT:    v_bfe_u32 v8, v6, 16, 1
16347; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v2
16348; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v6
16349; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
16350; GFX10-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
16351; GFX10-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
16352; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v2, v2
16353; GFX10-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
16354; GFX10-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s4
16355; GFX10-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
16356; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
16357; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16358; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16359; GFX10-NEXT:    buffer_gl1_inv
16360; GFX10-NEXT:    buffer_gl0_inv
16361; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
16362; GFX10-NEXT:    v_mov_b32_e32 v3, v2
16363; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
16364; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
16365; GFX10-NEXT:    s_cbranch_execnz .LBB57_1
16366; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
16367; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
16368; GFX10-NEXT:    s_setpc_b64 s[30:31]
16369;
16370; GFX90A-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos:
16371; GFX90A:       ; %bb.0:
16372; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16373; GFX90A-NEXT:    flat_load_dword v3, v[0:1] offset:2044
16374; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
16375; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
16376; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
16377; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
16378; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
16379; GFX90A-NEXT:  .LBB57_1: ; %atomicrmw.start
16380; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
16381; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16382; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
16383; GFX90A-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
16384; GFX90A-NEXT:    v_sub_f32_e32 v2, v2, v4
16385; GFX90A-NEXT:    v_sub_f32_e32 v6, v6, v5
16386; GFX90A-NEXT:    v_bfe_u32 v7, v2, 16, 1
16387; GFX90A-NEXT:    v_bfe_u32 v9, v6, 16, 1
16388; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v2
16389; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v6
16390; GFX90A-NEXT:    v_add3_u32 v7, v7, v2, s8
16391; GFX90A-NEXT:    v_add3_u32 v9, v9, v6, s8
16392; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
16393; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
16394; GFX90A-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
16395; GFX90A-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
16396; GFX90A-NEXT:    v_perm_b32 v2, v6, v2, s9
16397; GFX90A-NEXT:    buffer_wbl2
16398; GFX90A-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
16399; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16400; GFX90A-NEXT:    buffer_invl2
16401; GFX90A-NEXT:    buffer_wbinvl1
16402; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
16403; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
16404; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
16405; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
16406; GFX90A-NEXT:    s_cbranch_execnz .LBB57_1
16407; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
16408; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
16409; GFX90A-NEXT:    s_setpc_b64 s[30:31]
16410;
16411; GFX908-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos:
16412; GFX908:       ; %bb.0:
16413; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16414; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2044
16415; GFX908-NEXT:    s_mov_b64 s[6:7], 0
16416; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
16417; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
16418; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
16419; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
16420; GFX908-NEXT:  .LBB57_1: ; %atomicrmw.start
16421; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
16422; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16423; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
16424; GFX908-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
16425; GFX908-NEXT:    v_sub_f32_e32 v2, v2, v4
16426; GFX908-NEXT:    v_sub_f32_e32 v6, v6, v5
16427; GFX908-NEXT:    v_bfe_u32 v7, v2, 16, 1
16428; GFX908-NEXT:    v_bfe_u32 v9, v6, 16, 1
16429; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v2
16430; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v6
16431; GFX908-NEXT:    v_add3_u32 v7, v7, v2, s8
16432; GFX908-NEXT:    v_add3_u32 v9, v9, v6, s8
16433; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
16434; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
16435; GFX908-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
16436; GFX908-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
16437; GFX908-NEXT:    v_perm_b32 v2, v6, v2, s9
16438; GFX908-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
16439; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16440; GFX908-NEXT:    buffer_wbinvl1
16441; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
16442; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
16443; GFX908-NEXT:    v_mov_b32_e32 v3, v2
16444; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
16445; GFX908-NEXT:    s_cbranch_execnz .LBB57_1
16446; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
16447; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
16448; GFX908-NEXT:    s_setpc_b64 s[30:31]
16449;
16450; GFX8-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos:
16451; GFX8:       ; %bb.0:
16452; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16453; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
16454; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
16455; GFX8-NEXT:    flat_load_dword v3, v[0:1]
16456; GFX8-NEXT:    s_mov_b64 s[6:7], 0
16457; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
16458; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
16459; GFX8-NEXT:  .LBB57_1: ; %atomicrmw.start
16460; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
16461; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16462; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
16463; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
16464; GFX8-NEXT:    v_sub_f32_e32 v2, v2, v4
16465; GFX8-NEXT:    v_sub_f32_e32 v6, v6, v5
16466; GFX8-NEXT:    v_bfe_u32 v7, v2, 16, 1
16467; GFX8-NEXT:    v_bfe_u32 v9, v6, 16, 1
16468; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v2
16469; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v6
16470; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
16471; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
16472; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v6
16473; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
16474; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v2
16475; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
16476; GFX8-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
16477; GFX8-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
16478; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
16479; GFX8-NEXT:    v_alignbit_b32 v2, v6, v2, 16
16480; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16481; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16482; GFX8-NEXT:    buffer_wbinvl1
16483; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
16484; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
16485; GFX8-NEXT:    v_mov_b32_e32 v3, v2
16486; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
16487; GFX8-NEXT:    s_cbranch_execnz .LBB57_1
16488; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
16489; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
16490; GFX8-NEXT:    s_setpc_b64 s[30:31]
16491;
16492; GFX7-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos:
16493; GFX7:       ; %bb.0:
16494; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16495; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
16496; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
16497; GFX7-NEXT:    flat_load_dword v5, v[0:1]
16498; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
16499; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
16500; GFX7-NEXT:    s_mov_b64 s[4:5], 0
16501; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
16502; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
16503; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16504; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
16505; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
16506; GFX7-NEXT:  .LBB57_1: ; %atomicrmw.start
16507; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
16508; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
16509; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
16510; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v4
16511; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v5
16512; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
16513; GFX7-NEXT:    v_sub_f32_e32 v7, v7, v3
16514; GFX7-NEXT:    v_sub_f32_e32 v6, v6, v2
16515; GFX7-NEXT:    v_alignbit_b32 v5, v4, v5, 16
16516; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v7
16517; GFX7-NEXT:    v_alignbit_b32 v4, v4, v6, 16
16518; GFX7-NEXT:    flat_atomic_cmpswap v6, v[0:1], v[4:5] glc
16519; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16520; GFX7-NEXT:    buffer_wbinvl1
16521; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
16522; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v6
16523; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
16524; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
16525; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
16526; GFX7-NEXT:    s_cbranch_execnz .LBB57_1
16527; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
16528; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
16529; GFX7-NEXT:    s_setpc_b64 s[30:31]
16530  %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511
16531  %unused = atomicrmw fsub ptr %gep, <2 x bfloat> %val seq_cst
16532  ret void
16533}
16534
; Attribute group #0: nounwind only. Test functions attach it on their define
; line (e.g. @flat_agent_atomic_fsub_ret_f32 at the top of this file).
16535attributes #0 = { nounwind }
; Attribute group #1: nounwind plus "denormal-fp-math-f32" set to
; "preserve-sign,preserve-sign". Per the LLVM LangRef, the value is an
; "<output>,<input>" pair of denormal-handling modes applied to f32 operations.
; NOTE(review): no function referencing #1 is visible in this part of the
; file -- confirm it is still used before relying on or removing it.
16536attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
16537