xref: /llvm-project/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll (revision bfd9bc274586b0261e16e22ac50d50586a0152e2)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
3; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx940 < %s | FileCheck -check-prefix=GFX940 %s
4; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
5; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
6; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
7; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
8; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s
9; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s
10
11; --------------------------------------------------------------------
12; float
13; --------------------------------------------------------------------
14
; Returning f32 `atomicrmw fadd` through an unqualified `ptr` with no address
; offset, at syncscope("agent") with seq_cst ordering. The instruction carries
; both !amdgpu.no.fine.grained.memory and !amdgpu.ignore.denormal.mode.
;
; What the autogenerated checks below expect, by check prefix:
;   * GFX12, GFX940, GFX11 expect a single returning flat_atomic_add_f32.
;   * GFX10, GFX908, GFX8, GFX7 expect a flat_load_dword followed by a
;     v_add_f32 / flat_atomic_cmpswap retry loop (%atomicrmw.start).
;   * GFX90A expects a runtime dispatch: the high dword of the pointer is
;     compared against src_shared_base and src_private_base, selecting
;     ds_add_rtn_f32 (%atomicrmw.shared), a buffer load / v_add_f32 / buffer
;     store sequence (%atomicrmw.private), or global_atomic_add_f32
;     (%atomicrmw.global).
;
; The check lines are generated by utils/update_llc_test_checks.py (see the
; NOTE at the top of the file); regenerate them rather than editing by hand.
15define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 {
16; GFX12-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
17; GFX12:       ; %bb.0:
18; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
19; GFX12-NEXT:    s_wait_expcnt 0x0
20; GFX12-NEXT:    s_wait_samplecnt 0x0
21; GFX12-NEXT:    s_wait_bvhcnt 0x0
22; GFX12-NEXT:    s_wait_kmcnt 0x0
23; GFX12-NEXT:    s_wait_storecnt 0x0
24; GFX12-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
25; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
26; GFX12-NEXT:    global_inv scope:SCOPE_DEV
27; GFX12-NEXT:    s_setpc_b64 s[30:31]
28;
29; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
30; GFX940:       ; %bb.0:
31; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32; GFX940-NEXT:    buffer_wbl2 sc1
33; GFX940-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 sc0
34; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
35; GFX940-NEXT:    buffer_inv sc1
36; GFX940-NEXT:    s_setpc_b64 s[30:31]
37;
38; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
39; GFX11:       ; %bb.0:
40; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
42; GFX11-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 glc
43; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
44; GFX11-NEXT:    buffer_gl1_inv
45; GFX11-NEXT:    buffer_gl0_inv
46; GFX11-NEXT:    s_setpc_b64 s[30:31]
47;
48; GFX10-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
49; GFX10:       ; %bb.0:
50; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
51; GFX10-NEXT:    flat_load_dword v3, v[0:1]
52; GFX10-NEXT:    s_mov_b32 s4, 0
53; GFX10-NEXT:  .LBB0_1: ; %atomicrmw.start
54; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
55; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
56; GFX10-NEXT:    v_mov_b32_e32 v4, v3
57; GFX10-NEXT:    v_add_f32_e32 v3, v4, v2
58; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
59; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
60; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
61; GFX10-NEXT:    buffer_gl1_inv
62; GFX10-NEXT:    buffer_gl0_inv
63; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
64; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
65; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
66; GFX10-NEXT:    s_cbranch_execnz .LBB0_1
67; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
68; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
69; GFX10-NEXT:    v_mov_b32_e32 v0, v3
70; GFX10-NEXT:    s_setpc_b64 s[30:31]
71;
72; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
73; GFX90A:       ; %bb.0:
74; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
75; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
76; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
77; GFX90A-NEXT:    ; implicit-def: $vgpr3
78; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
79; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
80; GFX90A-NEXT:    s_cbranch_execz .LBB0_6
81; GFX90A-NEXT:  ; %bb.1: ; %atomicrmw.check.private
82; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
83; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v1
84; GFX90A-NEXT:    ; implicit-def: $vgpr3
85; GFX90A-NEXT:    s_and_saveexec_b64 s[6:7], vcc
86; GFX90A-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
87; GFX90A-NEXT:    s_cbranch_execz .LBB0_3
88; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.global
89; GFX90A-NEXT:    global_atomic_add_f32 v3, v[0:1], v2, off glc
90; GFX90A-NEXT:    s_waitcnt vmcnt(0)
91; GFX90A-NEXT:    buffer_wbinvl1
92; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
93; GFX90A-NEXT:    ; implicit-def: $vgpr2
94; GFX90A-NEXT:  .LBB0_3: ; %Flow
95; GFX90A-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
96; GFX90A-NEXT:    s_cbranch_execz .LBB0_5
97; GFX90A-NEXT:  ; %bb.4: ; %atomicrmw.private
98; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
99; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
100; GFX90A-NEXT:    buffer_load_dword v3, v0, s[0:3], 0 offen
101; GFX90A-NEXT:    s_waitcnt vmcnt(0)
102; GFX90A-NEXT:    v_add_f32_e32 v1, v3, v2
103; GFX90A-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
104; GFX90A-NEXT:  .LBB0_5: ; %Flow1
105; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
106; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
107; GFX90A-NEXT:    ; implicit-def: $vgpr2
108; GFX90A-NEXT:  .LBB0_6: ; %Flow2
109; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
110; GFX90A-NEXT:    s_cbranch_execz .LBB0_8
111; GFX90A-NEXT:  ; %bb.7: ; %atomicrmw.shared
112; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
113; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
114; GFX90A-NEXT:    ds_add_rtn_f32 v3, v0, v2
115; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
116; GFX90A-NEXT:  .LBB0_8: ; %atomicrmw.phi
117; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
118; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
119; GFX90A-NEXT:    s_waitcnt vmcnt(0)
120; GFX90A-NEXT:    s_setpc_b64 s[30:31]
121;
122; GFX908-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
123; GFX908:       ; %bb.0:
124; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
125; GFX908-NEXT:    flat_load_dword v3, v[0:1]
126; GFX908-NEXT:    s_mov_b64 s[4:5], 0
127; GFX908-NEXT:  .LBB0_1: ; %atomicrmw.start
128; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
129; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
130; GFX908-NEXT:    v_mov_b32_e32 v4, v3
131; GFX908-NEXT:    v_add_f32_e32 v3, v4, v2
132; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
133; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
134; GFX908-NEXT:    buffer_wbinvl1
135; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
136; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
137; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
138; GFX908-NEXT:    s_cbranch_execnz .LBB0_1
139; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
140; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
141; GFX908-NEXT:    v_mov_b32_e32 v0, v3
142; GFX908-NEXT:    s_setpc_b64 s[30:31]
143;
144; GFX8-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
145; GFX8:       ; %bb.0:
146; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
147; GFX8-NEXT:    flat_load_dword v3, v[0:1]
148; GFX8-NEXT:    s_mov_b64 s[4:5], 0
149; GFX8-NEXT:  .LBB0_1: ; %atomicrmw.start
150; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
151; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
152; GFX8-NEXT:    v_mov_b32_e32 v4, v3
153; GFX8-NEXT:    v_add_f32_e32 v3, v4, v2
154; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
155; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
156; GFX8-NEXT:    buffer_wbinvl1
157; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
158; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
159; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
160; GFX8-NEXT:    s_cbranch_execnz .LBB0_1
161; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
162; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
163; GFX8-NEXT:    v_mov_b32_e32 v0, v3
164; GFX8-NEXT:    s_setpc_b64 s[30:31]
165;
166; GFX7-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
167; GFX7:       ; %bb.0:
168; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
169; GFX7-NEXT:    flat_load_dword v3, v[0:1]
170; GFX7-NEXT:    s_mov_b64 s[4:5], 0
171; GFX7-NEXT:  .LBB0_1: ; %atomicrmw.start
172; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
173; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
174; GFX7-NEXT:    v_mov_b32_e32 v4, v3
175; GFX7-NEXT:    v_add_f32_e32 v3, v4, v2
176; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
177; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
178; GFX7-NEXT:    buffer_wbinvl1
179; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
180; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
181; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
182; GFX7-NEXT:    s_cbranch_execnz .LBB0_1
183; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
184; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
185; GFX7-NEXT:    v_mov_b32_e32 v0, v3
186; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; IR under test: the atomic operates directly on %ptr and its result (the
  ; value atomicrmw yields) is returned to the caller.
187  %result = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
188  ret float %result
189}
190
; Returning f32 `atomicrmw fadd` through an unqualified `ptr`, addressed at
; element index 511 of a float GEP (511 * 4 = 2044 = 0x7fc bytes), at
; syncscope("agent") with seq_cst ordering. The instruction carries both
; !amdgpu.no.fine.grained.memory and !amdgpu.ignore.denormal.mode.
;
; What the autogenerated checks below expect, by check prefix:
;   * GFX12, GFX940, GFX11 expect one returning flat_atomic_add_f32 with the
;     byte offset folded into the instruction as `offset:2044`.
;   * GFX908 expects a cmpswap retry loop, also with `offset:2044` folded into
;     both the initial flat_load_dword and the flat_atomic_cmpswap.
;   * GFX10, GFX8, GFX7 expect the address to be materialized with a 64-bit
;     add of 0x7fc into v[3:4], followed by a cmpswap retry loop.
;   * GFX90A expects the address to be materialized into v[4:5], then a runtime
;     dispatch on the high dword against src_shared_base / src_private_base to
;     ds_add_rtn_f32, a buffer load / v_add_f32 / buffer store sequence, or
;     global_atomic_add_f32.
;
; The check lines are generated by utils/update_llc_test_checks.py (see the
; NOTE at the top of the file); regenerate them rather than editing by hand.
191define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 {
192; GFX12-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
193; GFX12:       ; %bb.0:
194; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
195; GFX12-NEXT:    s_wait_expcnt 0x0
196; GFX12-NEXT:    s_wait_samplecnt 0x0
197; GFX12-NEXT:    s_wait_bvhcnt 0x0
198; GFX12-NEXT:    s_wait_kmcnt 0x0
199; GFX12-NEXT:    s_wait_storecnt 0x0
200; GFX12-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
201; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
202; GFX12-NEXT:    global_inv scope:SCOPE_DEV
203; GFX12-NEXT:    s_setpc_b64 s[30:31]
204;
205; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
206; GFX940:       ; %bb.0:
207; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
208; GFX940-NEXT:    buffer_wbl2 sc1
209; GFX940-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 sc0
210; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
211; GFX940-NEXT:    buffer_inv sc1
212; GFX940-NEXT:    s_setpc_b64 s[30:31]
213;
214; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
215; GFX11:       ; %bb.0:
216; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
217; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
218; GFX11-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 glc
219; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
220; GFX11-NEXT:    buffer_gl1_inv
221; GFX11-NEXT:    buffer_gl0_inv
222; GFX11-NEXT:    s_setpc_b64 s[30:31]
223;
224; GFX10-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
225; GFX10:       ; %bb.0:
226; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
227; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fc, v0
228; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
229; GFX10-NEXT:    s_mov_b32 s4, 0
230; GFX10-NEXT:    flat_load_dword v0, v[3:4]
231; GFX10-NEXT:  .LBB1_1: ; %atomicrmw.start
232; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
233; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
234; GFX10-NEXT:    v_mov_b32_e32 v1, v0
235; GFX10-NEXT:    v_add_f32_e32 v0, v1, v2
236; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
237; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
238; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
239; GFX10-NEXT:    buffer_gl1_inv
240; GFX10-NEXT:    buffer_gl0_inv
241; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
242; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
243; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
244; GFX10-NEXT:    s_cbranch_execnz .LBB1_1
245; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
246; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
247; GFX10-NEXT:    s_setpc_b64 s[30:31]
248;
249; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
250; GFX90A:       ; %bb.0:
251; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
252; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fc, v0
253; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
254; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
255; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
256; GFX90A-NEXT:    ; implicit-def: $vgpr0
257; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
258; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
259; GFX90A-NEXT:    s_cbranch_execnz .LBB1_3
260; GFX90A-NEXT:  ; %bb.1: ; %Flow2
261; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
262; GFX90A-NEXT:    s_cbranch_execnz .LBB1_8
263; GFX90A-NEXT:  .LBB1_2: ; %atomicrmw.phi
264; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
265; GFX90A-NEXT:    s_waitcnt vmcnt(0)
266; GFX90A-NEXT:    s_setpc_b64 s[30:31]
267; GFX90A-NEXT:  .LBB1_3: ; %atomicrmw.check.private
268; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
269; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v5
270; GFX90A-NEXT:    ; implicit-def: $vgpr0
271; GFX90A-NEXT:    s_and_saveexec_b64 s[6:7], vcc
272; GFX90A-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
273; GFX90A-NEXT:    s_cbranch_execz .LBB1_5
274; GFX90A-NEXT:  ; %bb.4: ; %atomicrmw.global
275; GFX90A-NEXT:    global_atomic_add_f32 v0, v[4:5], v2, off glc
276; GFX90A-NEXT:    s_waitcnt vmcnt(0)
277; GFX90A-NEXT:    buffer_wbinvl1
278; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
279; GFX90A-NEXT:    ; implicit-def: $vgpr2
280; GFX90A-NEXT:  .LBB1_5: ; %Flow
281; GFX90A-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
282; GFX90A-NEXT:    s_cbranch_execz .LBB1_7
283; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.private
284; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
285; GFX90A-NEXT:    v_cndmask_b32_e32 v1, -1, v4, vcc
286; GFX90A-NEXT:    buffer_load_dword v0, v1, s[0:3], 0 offen
287; GFX90A-NEXT:    s_waitcnt vmcnt(0)
288; GFX90A-NEXT:    v_add_f32_e32 v2, v0, v2
289; GFX90A-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
290; GFX90A-NEXT:  .LBB1_7: ; %Flow1
291; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
292; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
293; GFX90A-NEXT:    ; implicit-def: $vgpr2
294; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
295; GFX90A-NEXT:    s_cbranch_execz .LBB1_2
296; GFX90A-NEXT:  .LBB1_8: ; %atomicrmw.shared
297; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
298; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v4, vcc
299; GFX90A-NEXT:    ds_add_rtn_f32 v0, v0, v2
300; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
301; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
302; GFX90A-NEXT:    s_waitcnt vmcnt(0)
303; GFX90A-NEXT:    s_setpc_b64 s[30:31]
304;
305; GFX908-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
306; GFX908:       ; %bb.0:
307; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
308; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2044
309; GFX908-NEXT:    s_mov_b64 s[4:5], 0
310; GFX908-NEXT:  .LBB1_1: ; %atomicrmw.start
311; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
312; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
313; GFX908-NEXT:    v_mov_b32_e32 v4, v3
314; GFX908-NEXT:    v_add_f32_e32 v3, v4, v2
315; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
316; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
317; GFX908-NEXT:    buffer_wbinvl1
318; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
319; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
320; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
321; GFX908-NEXT:    s_cbranch_execnz .LBB1_1
322; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
323; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
324; GFX908-NEXT:    v_mov_b32_e32 v0, v3
325; GFX908-NEXT:    s_setpc_b64 s[30:31]
326;
327; GFX8-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
328; GFX8:       ; %bb.0:
329; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
330; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
331; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
332; GFX8-NEXT:    flat_load_dword v0, v[3:4]
333; GFX8-NEXT:    s_mov_b64 s[4:5], 0
334; GFX8-NEXT:  .LBB1_1: ; %atomicrmw.start
335; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
336; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
337; GFX8-NEXT:    v_mov_b32_e32 v1, v0
338; GFX8-NEXT:    v_add_f32_e32 v0, v1, v2
339; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
340; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
341; GFX8-NEXT:    buffer_wbinvl1
342; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
343; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
344; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
345; GFX8-NEXT:    s_cbranch_execnz .LBB1_1
346; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
347; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
348; GFX8-NEXT:    s_setpc_b64 s[30:31]
349;
350; GFX7-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
351; GFX7:       ; %bb.0:
352; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
353; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 0x7fc, v0
354; GFX7-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
355; GFX7-NEXT:    flat_load_dword v0, v[3:4]
356; GFX7-NEXT:    s_mov_b64 s[4:5], 0
357; GFX7-NEXT:  .LBB1_1: ; %atomicrmw.start
358; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
359; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
360; GFX7-NEXT:    v_mov_b32_e32 v1, v0
361; GFX7-NEXT:    v_add_f32_e32 v0, v1, v2
362; GFX7-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
363; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
364; GFX7-NEXT:    buffer_wbinvl1
365; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
366; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
367; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
368; GFX7-NEXT:    s_cbranch_execnz .LBB1_1
369; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
370; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
371; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; IR under test: index 511 of a float GEP is byte offset 2044 (0x7fc), which
  ; is the constant that appears in the checks above.
372  %gep = getelementptr float, ptr %ptr, i64 511
373  %result = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
374  ret float %result
375}
376
; Returning f32 `atomicrmw fadd` through an unqualified `ptr`, addressed at
; element index -512 of a float GEP (-512 * 4 = -2048 bytes), at
; syncscope("agent") with seq_cst ordering. The instruction carries both
; !amdgpu.no.fine.grained.memory and !amdgpu.ignore.denormal.mode.
;
; What the autogenerated checks below expect, by check prefix:
;   * GFX12 is the only prefix where the negative offset is folded into the
;     instruction (`offset:-2048` on flat_atomic_add_f32).
;   * GFX940 and GFX11 expect a 64-bit add of 0xfffff800 (low) / -1 (high)
;     to form the address, then one returning flat_atomic_add_f32.
;   * GFX10, GFX908, GFX8, GFX7 expect the same 64-bit add followed by a
;     flat_load_dword and a v_add_f32 / flat_atomic_cmpswap retry loop.
;     NOTE(review): the GFX908 checks show the address being computed twice
;     (into v[3:4] for the cmpswap and into v[0:1] for the initial load); this
;     is recorded compiler output, not something to hand-edit here.
;   * GFX90A expects the address to be materialized into v[4:5], then a runtime
;     dispatch on the high dword against src_shared_base / src_private_base to
;     ds_add_rtn_f32, a buffer load / v_add_f32 / buffer store sequence, or
;     global_atomic_add_f32.
;
; The check lines are generated by utils/update_llc_test_checks.py (see the
; NOTE at the top of the file); regenerate them rather than editing by hand.
377define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 {
378; GFX12-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
379; GFX12:       ; %bb.0:
380; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
381; GFX12-NEXT:    s_wait_expcnt 0x0
382; GFX12-NEXT:    s_wait_samplecnt 0x0
383; GFX12-NEXT:    s_wait_bvhcnt 0x0
384; GFX12-NEXT:    s_wait_kmcnt 0x0
385; GFX12-NEXT:    s_wait_storecnt 0x0
386; GFX12-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
387; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
388; GFX12-NEXT:    global_inv scope:SCOPE_DEV
389; GFX12-NEXT:    s_setpc_b64 s[30:31]
390;
391; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
392; GFX940:       ; %bb.0:
393; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
394; GFX940-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
395; GFX940-NEXT:    s_nop 1
396; GFX940-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
397; GFX940-NEXT:    buffer_wbl2 sc1
398; GFX940-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 sc0
399; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
400; GFX940-NEXT:    buffer_inv sc1
401; GFX940-NEXT:    s_setpc_b64 s[30:31]
402;
403; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
404; GFX11:       ; %bb.0:
405; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
406; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
407; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
408; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
409; GFX11-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 glc
410; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
411; GFX11-NEXT:    buffer_gl1_inv
412; GFX11-NEXT:    buffer_gl0_inv
413; GFX11-NEXT:    s_setpc_b64 s[30:31]
414;
415; GFX10-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
416; GFX10:       ; %bb.0:
417; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
418; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
419; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
420; GFX10-NEXT:    s_mov_b32 s4, 0
421; GFX10-NEXT:    flat_load_dword v0, v[3:4]
422; GFX10-NEXT:  .LBB2_1: ; %atomicrmw.start
423; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
424; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
425; GFX10-NEXT:    v_mov_b32_e32 v1, v0
426; GFX10-NEXT:    v_add_f32_e32 v0, v1, v2
427; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
428; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
429; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
430; GFX10-NEXT:    buffer_gl1_inv
431; GFX10-NEXT:    buffer_gl0_inv
432; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
433; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
434; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
435; GFX10-NEXT:    s_cbranch_execnz .LBB2_1
436; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
437; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
438; GFX10-NEXT:    s_setpc_b64 s[30:31]
439;
440; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
441; GFX90A:       ; %bb.0:
442; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
443; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
444; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
445; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
446; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
447; GFX90A-NEXT:    ; implicit-def: $vgpr0
448; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
449; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
450; GFX90A-NEXT:    s_cbranch_execnz .LBB2_3
451; GFX90A-NEXT:  ; %bb.1: ; %Flow2
452; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
453; GFX90A-NEXT:    s_cbranch_execnz .LBB2_8
454; GFX90A-NEXT:  .LBB2_2: ; %atomicrmw.phi
455; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
456; GFX90A-NEXT:    s_waitcnt vmcnt(0)
457; GFX90A-NEXT:    s_setpc_b64 s[30:31]
458; GFX90A-NEXT:  .LBB2_3: ; %atomicrmw.check.private
459; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
460; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v5
461; GFX90A-NEXT:    ; implicit-def: $vgpr0
462; GFX90A-NEXT:    s_and_saveexec_b64 s[6:7], vcc
463; GFX90A-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
464; GFX90A-NEXT:    s_cbranch_execz .LBB2_5
465; GFX90A-NEXT:  ; %bb.4: ; %atomicrmw.global
466; GFX90A-NEXT:    global_atomic_add_f32 v0, v[4:5], v2, off glc
467; GFX90A-NEXT:    s_waitcnt vmcnt(0)
468; GFX90A-NEXT:    buffer_wbinvl1
469; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
470; GFX90A-NEXT:    ; implicit-def: $vgpr2
471; GFX90A-NEXT:  .LBB2_5: ; %Flow
472; GFX90A-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
473; GFX90A-NEXT:    s_cbranch_execz .LBB2_7
474; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.private
475; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
476; GFX90A-NEXT:    v_cndmask_b32_e32 v1, -1, v4, vcc
477; GFX90A-NEXT:    buffer_load_dword v0, v1, s[0:3], 0 offen
478; GFX90A-NEXT:    s_waitcnt vmcnt(0)
479; GFX90A-NEXT:    v_add_f32_e32 v2, v0, v2
480; GFX90A-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
481; GFX90A-NEXT:  .LBB2_7: ; %Flow1
482; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
483; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
484; GFX90A-NEXT:    ; implicit-def: $vgpr2
485; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
486; GFX90A-NEXT:    s_cbranch_execz .LBB2_2
487; GFX90A-NEXT:  .LBB2_8: ; %atomicrmw.shared
488; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
489; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v4, vcc
490; GFX90A-NEXT:    ds_add_rtn_f32 v0, v0, v2
491; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
492; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
493; GFX90A-NEXT:    s_waitcnt vmcnt(0)
494; GFX90A-NEXT:    s_setpc_b64 s[30:31]
495;
496; GFX908-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
497; GFX908:       ; %bb.0:
498; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
499; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
500; GFX908-NEXT:    v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
501; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
502; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
503; GFX908-NEXT:    flat_load_dword v0, v[0:1]
504; GFX908-NEXT:    s_mov_b64 s[4:5], 0
505; GFX908-NEXT:  .LBB2_1: ; %atomicrmw.start
506; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
507; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
508; GFX908-NEXT:    v_mov_b32_e32 v1, v0
509; GFX908-NEXT:    v_add_f32_e32 v0, v1, v2
510; GFX908-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
511; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
512; GFX908-NEXT:    buffer_wbinvl1
513; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
514; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
515; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
516; GFX908-NEXT:    s_cbranch_execnz .LBB2_1
517; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
518; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
519; GFX908-NEXT:    s_setpc_b64 s[30:31]
520;
521; GFX8-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
522; GFX8:       ; %bb.0:
523; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
524; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
525; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, -1, v1, vcc
526; GFX8-NEXT:    flat_load_dword v0, v[3:4]
527; GFX8-NEXT:    s_mov_b64 s[4:5], 0
528; GFX8-NEXT:  .LBB2_1: ; %atomicrmw.start
529; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
530; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
531; GFX8-NEXT:    v_mov_b32_e32 v1, v0
532; GFX8-NEXT:    v_add_f32_e32 v0, v1, v2
533; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
534; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
535; GFX8-NEXT:    buffer_wbinvl1
536; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
537; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
538; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
539; GFX8-NEXT:    s_cbranch_execnz .LBB2_1
540; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
541; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
542; GFX8-NEXT:    s_setpc_b64 s[30:31]
543;
544; GFX7-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
545; GFX7:       ; %bb.0:
546; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
547; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 0xfffff800, v0
548; GFX7-NEXT:    v_addc_u32_e32 v4, vcc, -1, v1, vcc
549; GFX7-NEXT:    flat_load_dword v0, v[3:4]
550; GFX7-NEXT:    s_mov_b64 s[4:5], 0
551; GFX7-NEXT:  .LBB2_1: ; %atomicrmw.start
552; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
553; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
554; GFX7-NEXT:    v_mov_b32_e32 v1, v0
555; GFX7-NEXT:    v_add_f32_e32 v0, v1, v2
556; GFX7-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
557; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
558; GFX7-NEXT:    buffer_wbinvl1
559; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
560; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
561; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
562; GFX7-NEXT:    s_cbranch_execnz .LBB2_1
563; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
564; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
565; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; IR under test: index -512 of a float GEP is byte offset -2048, which shows
  ; up in the checks above as offset:-2048 or as the 0xfffff800 / -1 add pair.
566  %gep = getelementptr float, ptr %ptr, i64 -512
567  %result = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
568  ret float %result
569}
570
571define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 {
572; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
573; GFX12:       ; %bb.0:
574; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
575; GFX12-NEXT:    s_wait_expcnt 0x0
576; GFX12-NEXT:    s_wait_samplecnt 0x0
577; GFX12-NEXT:    s_wait_bvhcnt 0x0
578; GFX12-NEXT:    s_wait_kmcnt 0x0
579; GFX12-NEXT:    s_wait_storecnt 0x0
580; GFX12-NEXT:    flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV
581; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
582; GFX12-NEXT:    global_inv scope:SCOPE_DEV
583; GFX12-NEXT:    s_setpc_b64 s[30:31]
584;
585; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
586; GFX940:       ; %bb.0:
587; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
588; GFX940-NEXT:    buffer_wbl2 sc1
589; GFX940-NEXT:    flat_atomic_add_f32 v[0:1], v2
590; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
591; GFX940-NEXT:    buffer_inv sc1
592; GFX940-NEXT:    s_setpc_b64 s[30:31]
593;
594; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
595; GFX11:       ; %bb.0:
596; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
597; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
598; GFX11-NEXT:    flat_atomic_add_f32 v[0:1], v2
599; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
600; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
601; GFX11-NEXT:    buffer_gl1_inv
602; GFX11-NEXT:    buffer_gl0_inv
603; GFX11-NEXT:    s_setpc_b64 s[30:31]
604;
605; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
606; GFX10:       ; %bb.0:
607; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
608; GFX10-NEXT:    flat_load_dword v4, v[0:1]
609; GFX10-NEXT:    s_mov_b32 s4, 0
610; GFX10-NEXT:  .LBB3_1: ; %atomicrmw.start
611; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
612; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
613; GFX10-NEXT:    v_add_f32_e32 v3, v4, v2
614; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
615; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
616; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
617; GFX10-NEXT:    buffer_gl1_inv
618; GFX10-NEXT:    buffer_gl0_inv
619; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
620; GFX10-NEXT:    v_mov_b32_e32 v4, v3
621; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
622; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
623; GFX10-NEXT:    s_cbranch_execnz .LBB3_1
624; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
625; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
626; GFX10-NEXT:    s_setpc_b64 s[30:31]
627;
628; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
629; GFX90A:       ; %bb.0:
630; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
631; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
632; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
633; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
634; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
635; GFX90A-NEXT:    s_cbranch_execnz .LBB3_3
636; GFX90A-NEXT:  ; %bb.1: ; %Flow2
637; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
638; GFX90A-NEXT:    s_cbranch_execnz .LBB3_8
639; GFX90A-NEXT:  .LBB3_2: ; %atomicrmw.phi
640; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
641; GFX90A-NEXT:    s_waitcnt vmcnt(0)
642; GFX90A-NEXT:    s_setpc_b64 s[30:31]
643; GFX90A-NEXT:  .LBB3_3: ; %atomicrmw.check.private
644; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
645; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v1
646; GFX90A-NEXT:    s_and_saveexec_b64 s[6:7], vcc
647; GFX90A-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
648; GFX90A-NEXT:    s_cbranch_execz .LBB3_5
649; GFX90A-NEXT:  ; %bb.4: ; %atomicrmw.global
650; GFX90A-NEXT:    global_atomic_add_f32 v[0:1], v2, off
651; GFX90A-NEXT:    s_waitcnt vmcnt(0)
652; GFX90A-NEXT:    buffer_wbinvl1
653; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
654; GFX90A-NEXT:    ; implicit-def: $vgpr2
655; GFX90A-NEXT:  .LBB3_5: ; %Flow
656; GFX90A-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
657; GFX90A-NEXT:    s_cbranch_execz .LBB3_7
658; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.private
659; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
660; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
661; GFX90A-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
662; GFX90A-NEXT:    s_waitcnt vmcnt(0)
663; GFX90A-NEXT:    v_add_f32_e32 v1, v1, v2
664; GFX90A-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
665; GFX90A-NEXT:  .LBB3_7: ; %Flow1
666; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
667; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
668; GFX90A-NEXT:    ; implicit-def: $vgpr2
669; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
670; GFX90A-NEXT:    s_cbranch_execz .LBB3_2
671; GFX90A-NEXT:  .LBB3_8: ; %atomicrmw.shared
672; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
673; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
674; GFX90A-NEXT:    ds_add_f32 v0, v2
675; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
676; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
677; GFX90A-NEXT:    s_waitcnt vmcnt(0)
678; GFX90A-NEXT:    s_setpc_b64 s[30:31]
679;
680; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
681; GFX908:       ; %bb.0:
682; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
683; GFX908-NEXT:    s_mov_b64 s[4:5], src_shared_base
684; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
685; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
686; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
687; GFX908-NEXT:    s_cbranch_execnz .LBB3_3
688; GFX908-NEXT:  ; %bb.1: ; %Flow2
689; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
690; GFX908-NEXT:    s_cbranch_execnz .LBB3_8
691; GFX908-NEXT:  .LBB3_2: ; %atomicrmw.phi
692; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
693; GFX908-NEXT:    s_waitcnt vmcnt(0)
694; GFX908-NEXT:    s_setpc_b64 s[30:31]
695; GFX908-NEXT:  .LBB3_3: ; %atomicrmw.check.private
696; GFX908-NEXT:    s_mov_b64 s[6:7], src_private_base
697; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v1
698; GFX908-NEXT:    s_and_saveexec_b64 s[6:7], vcc
699; GFX908-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
700; GFX908-NEXT:    s_cbranch_execz .LBB3_5
701; GFX908-NEXT:  ; %bb.4: ; %atomicrmw.global
702; GFX908-NEXT:    global_atomic_add_f32 v[0:1], v2, off
703; GFX908-NEXT:    s_waitcnt vmcnt(0)
704; GFX908-NEXT:    buffer_wbinvl1
705; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
706; GFX908-NEXT:    ; implicit-def: $vgpr2
707; GFX908-NEXT:  .LBB3_5: ; %Flow
708; GFX908-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
709; GFX908-NEXT:    s_cbranch_execz .LBB3_7
710; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.private
711; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
712; GFX908-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
713; GFX908-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
714; GFX908-NEXT:    s_waitcnt vmcnt(0)
715; GFX908-NEXT:    v_add_f32_e32 v1, v1, v2
716; GFX908-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
717; GFX908-NEXT:  .LBB3_7: ; %Flow1
718; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
719; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
720; GFX908-NEXT:    ; implicit-def: $vgpr2
721; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
722; GFX908-NEXT:    s_cbranch_execz .LBB3_2
723; GFX908-NEXT:  .LBB3_8: ; %atomicrmw.shared
724; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
725; GFX908-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
726; GFX908-NEXT:    ds_add_f32 v0, v2
727; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
728; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
729; GFX908-NEXT:    s_waitcnt vmcnt(0)
730; GFX908-NEXT:    s_setpc_b64 s[30:31]
731;
732; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
733; GFX8:       ; %bb.0:
734; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
735; GFX8-NEXT:    flat_load_dword v4, v[0:1]
736; GFX8-NEXT:    s_mov_b64 s[4:5], 0
737; GFX8-NEXT:  .LBB3_1: ; %atomicrmw.start
738; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
739; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
740; GFX8-NEXT:    v_add_f32_e32 v3, v4, v2
741; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
742; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
743; GFX8-NEXT:    buffer_wbinvl1
744; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
745; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
746; GFX8-NEXT:    v_mov_b32_e32 v4, v3
747; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
748; GFX8-NEXT:    s_cbranch_execnz .LBB3_1
749; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
750; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
751; GFX8-NEXT:    s_setpc_b64 s[30:31]
752;
753; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
754; GFX7:       ; %bb.0:
755; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
756; GFX7-NEXT:    flat_load_dword v4, v[0:1]
757; GFX7-NEXT:    s_mov_b64 s[4:5], 0
758; GFX7-NEXT:  .LBB3_1: ; %atomicrmw.start
759; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
760; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
761; GFX7-NEXT:    v_add_f32_e32 v3, v4, v2
762; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
763; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
764; GFX7-NEXT:    buffer_wbinvl1
765; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
766; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
767; GFX7-NEXT:    v_mov_b32_e32 v4, v3
768; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
769; GFX7-NEXT:    s_cbranch_execnz .LBB3_1
770; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
771; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
772; GFX7-NEXT:    s_setpc_b64 s[30:31]
773  %unused = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
774  ret void
775}
776
; Agent-scope, seq_cst flat `atomicrmw fadd` on float whose result is unused,
; addressed at %ptr + 511 floats (2044 bytes, 0x7fc), with both
; !amdgpu.no.fine.grained.memory and !amdgpu.ignore.denormal.mode attached.
; The per-target assertions below are autogenerated (see the NOTE on the first
; line of this file); regenerate them instead of editing them by hand.
; They record three lowering shapes:
;   * gfx1200, gfx940, gfx1100: a single flat_atomic_add_f32 with the 2044-byte
;     offset folded into the instruction.
;   * gfx1010, tonga, hawaii: an explicit 64-bit address add of 0x7fc followed
;     by a flat_load_dword + flat_atomic_cmpswap retry loop.
;   * gfx90a, gfx908: an explicit address add, then v1 (upper half of the
;     v[0:1] address pair) is compared against src_shared_base and
;     src_private_base to pick global_atomic_add_f32, a buffer
;     load/add/store sequence, or ds_add_f32.
777define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 {
778; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
779; GFX12:       ; %bb.0:
780; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
781; GFX12-NEXT:    s_wait_expcnt 0x0
782; GFX12-NEXT:    s_wait_samplecnt 0x0
783; GFX12-NEXT:    s_wait_bvhcnt 0x0
784; GFX12-NEXT:    s_wait_kmcnt 0x0
785; GFX12-NEXT:    s_wait_storecnt 0x0
786; GFX12-NEXT:    flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV
787; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
788; GFX12-NEXT:    global_inv scope:SCOPE_DEV
789; GFX12-NEXT:    s_setpc_b64 s[30:31]
790;
791; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
792; GFX940:       ; %bb.0:
793; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
794; GFX940-NEXT:    buffer_wbl2 sc1
795; GFX940-NEXT:    flat_atomic_add_f32 v[0:1], v2 offset:2044
796; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
797; GFX940-NEXT:    buffer_inv sc1
798; GFX940-NEXT:    s_setpc_b64 s[30:31]
799;
800; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
801; GFX11:       ; %bb.0:
802; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
803; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
804; GFX11-NEXT:    flat_atomic_add_f32 v[0:1], v2 offset:2044
805; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
806; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
807; GFX11-NEXT:    buffer_gl1_inv
808; GFX11-NEXT:    buffer_gl0_inv
809; GFX11-NEXT:    s_setpc_b64 s[30:31]
810;
811; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
812; GFX10:       ; %bb.0:
813; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
814; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
815; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
816; GFX10-NEXT:    s_mov_b32 s4, 0
817; GFX10-NEXT:    flat_load_dword v4, v[0:1]
818; GFX10-NEXT:  .LBB4_1: ; %atomicrmw.start
819; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
820; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
821; GFX10-NEXT:    v_add_f32_e32 v3, v4, v2
822; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
823; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
824; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
825; GFX10-NEXT:    buffer_gl1_inv
826; GFX10-NEXT:    buffer_gl0_inv
827; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
828; GFX10-NEXT:    v_mov_b32_e32 v4, v3
829; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
830; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
831; GFX10-NEXT:    s_cbranch_execnz .LBB4_1
832; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
833; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
834; GFX10-NEXT:    s_setpc_b64 s[30:31]
835;
836; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
837; GFX90A:       ; %bb.0:
838; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
839; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7fc, v0
840; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
841; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
842; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
843; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
844; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
845; GFX90A-NEXT:    s_cbranch_execnz .LBB4_3
846; GFX90A-NEXT:  ; %bb.1: ; %Flow2
847; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
848; GFX90A-NEXT:    s_cbranch_execnz .LBB4_8
849; GFX90A-NEXT:  .LBB4_2: ; %atomicrmw.phi
850; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
851; GFX90A-NEXT:    s_waitcnt vmcnt(0)
852; GFX90A-NEXT:    s_setpc_b64 s[30:31]
853; GFX90A-NEXT:  .LBB4_3: ; %atomicrmw.check.private
854; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
855; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v1
856; GFX90A-NEXT:    s_and_saveexec_b64 s[6:7], vcc
857; GFX90A-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
858; GFX90A-NEXT:    s_cbranch_execz .LBB4_5
859; GFX90A-NEXT:  ; %bb.4: ; %atomicrmw.global
860; GFX90A-NEXT:    global_atomic_add_f32 v[0:1], v2, off
861; GFX90A-NEXT:    s_waitcnt vmcnt(0)
862; GFX90A-NEXT:    buffer_wbinvl1
863; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
864; GFX90A-NEXT:    ; implicit-def: $vgpr2
865; GFX90A-NEXT:  .LBB4_5: ; %Flow
866; GFX90A-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
867; GFX90A-NEXT:    s_cbranch_execz .LBB4_7
868; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.private
869; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
870; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
871; GFX90A-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
872; GFX90A-NEXT:    s_waitcnt vmcnt(0)
873; GFX90A-NEXT:    v_add_f32_e32 v1, v1, v2
874; GFX90A-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
875; GFX90A-NEXT:  .LBB4_7: ; %Flow1
876; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
877; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
878; GFX90A-NEXT:    ; implicit-def: $vgpr2
879; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
880; GFX90A-NEXT:    s_cbranch_execz .LBB4_2
881; GFX90A-NEXT:  .LBB4_8: ; %atomicrmw.shared
882; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
883; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
884; GFX90A-NEXT:    ds_add_f32 v0, v2
885; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
886; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
887; GFX90A-NEXT:    s_waitcnt vmcnt(0)
888; GFX90A-NEXT:    s_setpc_b64 s[30:31]
889;
890; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
891; GFX908:       ; %bb.0:
892; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
893; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7fc, v0
894; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
895; GFX908-NEXT:    s_mov_b64 s[4:5], src_shared_base
896; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
897; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
898; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
899; GFX908-NEXT:    s_cbranch_execnz .LBB4_3
900; GFX908-NEXT:  ; %bb.1: ; %Flow2
901; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
902; GFX908-NEXT:    s_cbranch_execnz .LBB4_8
903; GFX908-NEXT:  .LBB4_2: ; %atomicrmw.phi
904; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
905; GFX908-NEXT:    s_waitcnt vmcnt(0)
906; GFX908-NEXT:    s_setpc_b64 s[30:31]
907; GFX908-NEXT:  .LBB4_3: ; %atomicrmw.check.private
908; GFX908-NEXT:    s_mov_b64 s[6:7], src_private_base
909; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v1
910; GFX908-NEXT:    s_and_saveexec_b64 s[6:7], vcc
911; GFX908-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
912; GFX908-NEXT:    s_cbranch_execz .LBB4_5
913; GFX908-NEXT:  ; %bb.4: ; %atomicrmw.global
914; GFX908-NEXT:    global_atomic_add_f32 v[0:1], v2, off
915; GFX908-NEXT:    s_waitcnt vmcnt(0)
916; GFX908-NEXT:    buffer_wbinvl1
917; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
918; GFX908-NEXT:    ; implicit-def: $vgpr2
919; GFX908-NEXT:  .LBB4_5: ; %Flow
920; GFX908-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
921; GFX908-NEXT:    s_cbranch_execz .LBB4_7
922; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.private
923; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
924; GFX908-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
925; GFX908-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
926; GFX908-NEXT:    s_waitcnt vmcnt(0)
927; GFX908-NEXT:    v_add_f32_e32 v1, v1, v2
928; GFX908-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
929; GFX908-NEXT:  .LBB4_7: ; %Flow1
930; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
931; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
932; GFX908-NEXT:    ; implicit-def: $vgpr2
933; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
934; GFX908-NEXT:    s_cbranch_execz .LBB4_2
935; GFX908-NEXT:  .LBB4_8: ; %atomicrmw.shared
936; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
937; GFX908-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
938; GFX908-NEXT:    ds_add_f32 v0, v2
939; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
940; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
941; GFX908-NEXT:    s_waitcnt vmcnt(0)
942; GFX908-NEXT:    s_setpc_b64 s[30:31]
943;
944; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
945; GFX8:       ; %bb.0:
946; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
947; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
948; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
949; GFX8-NEXT:    flat_load_dword v4, v[0:1]
950; GFX8-NEXT:    s_mov_b64 s[4:5], 0
951; GFX8-NEXT:  .LBB4_1: ; %atomicrmw.start
952; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
953; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
954; GFX8-NEXT:    v_add_f32_e32 v3, v4, v2
955; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
956; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
957; GFX8-NEXT:    buffer_wbinvl1
958; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
959; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
960; GFX8-NEXT:    v_mov_b32_e32 v4, v3
961; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
962; GFX8-NEXT:    s_cbranch_execnz .LBB4_1
963; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
964; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
965; GFX8-NEXT:    s_setpc_b64 s[30:31]
966;
967; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
968; GFX7:       ; %bb.0:
969; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
970; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
971; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
972; GFX7-NEXT:    flat_load_dword v4, v[0:1]
973; GFX7-NEXT:    s_mov_b64 s[4:5], 0
974; GFX7-NEXT:  .LBB4_1: ; %atomicrmw.start
975; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
976; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
977; GFX7-NEXT:    v_add_f32_e32 v3, v4, v2
978; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
979; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
980; GFX7-NEXT:    buffer_wbinvl1
981; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
982; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
983; GFX7-NEXT:    v_mov_b32_e32 v4, v3
984; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
985; GFX7-NEXT:    s_cbranch_execnz .LBB4_1
986; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
987; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
988; GFX7-NEXT:    s_setpc_b64 s[30:31]
; Element index 511 of a float pointer is a +2044-byte displacement, the value
; that appears as offset:2044 / 0x7fc in the assertions above.
989  %gep = getelementptr float, ptr %ptr, i64 511
; The old value returned by the atomicrmw is never used and the function
; returns void, which is what the "noret" in the function name refers to.
990  %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
991  ret void
992}
993
; Agent-scope, seq_cst flat `atomicrmw fadd` on float whose result is unused,
; addressed at %ptr - 512 floats (-2048 bytes), with both
; !amdgpu.no.fine.grained.memory and !amdgpu.ignore.denormal.mode attached.
; The per-target assertions below are autogenerated (see the NOTE on the first
; line of this file); regenerate them instead of editing them by hand.
; They record how each target handles the negative displacement:
;   * gfx1200: a single flat_atomic_add_f32 with offset:-2048 folded in.
;   * gfx940, gfx1100: the address is adjusted with an explicit 64-bit add
;     (0xfffff800 into the low dword, -1 plus carry into the high dword) and
;     flat_atomic_add_f32 is issued with no immediate offset.
;     NOTE(review): presumably the negative offset is not encodable on flat
;     instructions for these targets - confirm against the ISA documentation.
;   * gfx1010, tonga, hawaii: the same explicit address add, then a
;     flat_load_dword + flat_atomic_cmpswap retry loop.
;   * gfx90a, gfx908: the same explicit address add, then v1 (upper half of the
;     v[0:1] address pair) is compared against src_shared_base and
;     src_private_base to pick global_atomic_add_f32, a buffer
;     load/add/store sequence, or ds_add_f32.
994define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 {
995; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
996; GFX12:       ; %bb.0:
997; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
998; GFX12-NEXT:    s_wait_expcnt 0x0
999; GFX12-NEXT:    s_wait_samplecnt 0x0
1000; GFX12-NEXT:    s_wait_bvhcnt 0x0
1001; GFX12-NEXT:    s_wait_kmcnt 0x0
1002; GFX12-NEXT:    s_wait_storecnt 0x0
1003; GFX12-NEXT:    flat_atomic_add_f32 v[0:1], v2 offset:-2048 scope:SCOPE_DEV
1004; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
1005; GFX12-NEXT:    global_inv scope:SCOPE_DEV
1006; GFX12-NEXT:    s_setpc_b64 s[30:31]
1007;
1008; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
1009; GFX940:       ; %bb.0:
1010; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1011; GFX940-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
1012; GFX940-NEXT:    s_nop 1
1013; GFX940-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
1014; GFX940-NEXT:    buffer_wbl2 sc1
1015; GFX940-NEXT:    flat_atomic_add_f32 v[0:1], v2
1016; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1017; GFX940-NEXT:    buffer_inv sc1
1018; GFX940-NEXT:    s_setpc_b64 s[30:31]
1019;
1020; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
1021; GFX11:       ; %bb.0:
1022; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1023; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
1024; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
1025; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1026; GFX11-NEXT:    flat_atomic_add_f32 v[0:1], v2
1027; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1028; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1029; GFX11-NEXT:    buffer_gl1_inv
1030; GFX11-NEXT:    buffer_gl0_inv
1031; GFX11-NEXT:    s_setpc_b64 s[30:31]
1032;
1033; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
1034; GFX10:       ; %bb.0:
1035; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1036; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
1037; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
1038; GFX10-NEXT:    s_mov_b32 s4, 0
1039; GFX10-NEXT:    flat_load_dword v4, v[0:1]
1040; GFX10-NEXT:  .LBB5_1: ; %atomicrmw.start
1041; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
1042; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1043; GFX10-NEXT:    v_add_f32_e32 v3, v4, v2
1044; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1045; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1046; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1047; GFX10-NEXT:    buffer_gl1_inv
1048; GFX10-NEXT:    buffer_gl0_inv
1049; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
1050; GFX10-NEXT:    v_mov_b32_e32 v4, v3
1051; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
1052; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
1053; GFX10-NEXT:    s_cbranch_execnz .LBB5_1
1054; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
1055; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1056; GFX10-NEXT:    s_setpc_b64 s[30:31]
1057;
1058; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
1059; GFX90A:       ; %bb.0:
1060; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1061; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
1062; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
1063; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
1064; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
1065; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1066; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
1067; GFX90A-NEXT:    s_cbranch_execnz .LBB5_3
1068; GFX90A-NEXT:  ; %bb.1: ; %Flow2
1069; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
1070; GFX90A-NEXT:    s_cbranch_execnz .LBB5_8
1071; GFX90A-NEXT:  .LBB5_2: ; %atomicrmw.phi
1072; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
1073; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1074; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1075; GFX90A-NEXT:  .LBB5_3: ; %atomicrmw.check.private
1076; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
1077; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v1
1078; GFX90A-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1079; GFX90A-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
1080; GFX90A-NEXT:    s_cbranch_execz .LBB5_5
1081; GFX90A-NEXT:  ; %bb.4: ; %atomicrmw.global
1082; GFX90A-NEXT:    global_atomic_add_f32 v[0:1], v2, off
1083; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1084; GFX90A-NEXT:    buffer_wbinvl1
1085; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
1086; GFX90A-NEXT:    ; implicit-def: $vgpr2
1087; GFX90A-NEXT:  .LBB5_5: ; %Flow
1088; GFX90A-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
1089; GFX90A-NEXT:    s_cbranch_execz .LBB5_7
1090; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.private
1091; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
1092; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
1093; GFX90A-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
1094; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1095; GFX90A-NEXT:    v_add_f32_e32 v1, v1, v2
1096; GFX90A-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
1097; GFX90A-NEXT:  .LBB5_7: ; %Flow1
1098; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
1099; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
1100; GFX90A-NEXT:    ; implicit-def: $vgpr2
1101; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
1102; GFX90A-NEXT:    s_cbranch_execz .LBB5_2
1103; GFX90A-NEXT:  .LBB5_8: ; %atomicrmw.shared
1104; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
1105; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
1106; GFX90A-NEXT:    ds_add_f32 v0, v2
1107; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
1108; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
1109; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1110; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1111;
1112; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
1113; GFX908:       ; %bb.0:
1114; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1115; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
1116; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
1117; GFX908-NEXT:    s_mov_b64 s[4:5], src_shared_base
1118; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
1119; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1120; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
1121; GFX908-NEXT:    s_cbranch_execnz .LBB5_3
1122; GFX908-NEXT:  ; %bb.1: ; %Flow2
1123; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
1124; GFX908-NEXT:    s_cbranch_execnz .LBB5_8
1125; GFX908-NEXT:  .LBB5_2: ; %atomicrmw.phi
1126; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
1127; GFX908-NEXT:    s_waitcnt vmcnt(0)
1128; GFX908-NEXT:    s_setpc_b64 s[30:31]
1129; GFX908-NEXT:  .LBB5_3: ; %atomicrmw.check.private
1130; GFX908-NEXT:    s_mov_b64 s[6:7], src_private_base
1131; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v1
1132; GFX908-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1133; GFX908-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
1134; GFX908-NEXT:    s_cbranch_execz .LBB5_5
1135; GFX908-NEXT:  ; %bb.4: ; %atomicrmw.global
1136; GFX908-NEXT:    global_atomic_add_f32 v[0:1], v2, off
1137; GFX908-NEXT:    s_waitcnt vmcnt(0)
1138; GFX908-NEXT:    buffer_wbinvl1
1139; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
1140; GFX908-NEXT:    ; implicit-def: $vgpr2
1141; GFX908-NEXT:  .LBB5_5: ; %Flow
1142; GFX908-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
1143; GFX908-NEXT:    s_cbranch_execz .LBB5_7
1144; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.private
1145; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
1146; GFX908-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
1147; GFX908-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
1148; GFX908-NEXT:    s_waitcnt vmcnt(0)
1149; GFX908-NEXT:    v_add_f32_e32 v1, v1, v2
1150; GFX908-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
1151; GFX908-NEXT:  .LBB5_7: ; %Flow1
1152; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
1153; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
1154; GFX908-NEXT:    ; implicit-def: $vgpr2
1155; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
1156; GFX908-NEXT:    s_cbranch_execz .LBB5_2
1157; GFX908-NEXT:  .LBB5_8: ; %atomicrmw.shared
1158; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
1159; GFX908-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
1160; GFX908-NEXT:    ds_add_f32 v0, v2
1161; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
1162; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
1163; GFX908-NEXT:    s_waitcnt vmcnt(0)
1164; GFX908-NEXT:    s_setpc_b64 s[30:31]
1165;
1166; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
1167; GFX8:       ; %bb.0:
1168; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1169; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
1170; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
1171; GFX8-NEXT:    flat_load_dword v4, v[0:1]
1172; GFX8-NEXT:    s_mov_b64 s[4:5], 0
1173; GFX8-NEXT:  .LBB5_1: ; %atomicrmw.start
1174; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
1175; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1176; GFX8-NEXT:    v_add_f32_e32 v3, v4, v2
1177; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1178; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1179; GFX8-NEXT:    buffer_wbinvl1
1180; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
1181; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1182; GFX8-NEXT:    v_mov_b32_e32 v4, v3
1183; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1184; GFX8-NEXT:    s_cbranch_execnz .LBB5_1
1185; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
1186; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1187; GFX8-NEXT:    s_setpc_b64 s[30:31]
1188;
1189; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
1190; GFX7:       ; %bb.0:
1191; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1192; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0xfffff800, v0
1193; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
1194; GFX7-NEXT:    flat_load_dword v4, v[0:1]
1195; GFX7-NEXT:    s_mov_b64 s[4:5], 0
1196; GFX7-NEXT:  .LBB5_1: ; %atomicrmw.start
1197; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
1198; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1199; GFX7-NEXT:    v_add_f32_e32 v3, v4, v2
1200; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1201; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1202; GFX7-NEXT:    buffer_wbinvl1
1203; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
1204; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1205; GFX7-NEXT:    v_mov_b32_e32 v4, v3
1206; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1207; GFX7-NEXT:    s_cbranch_execnz .LBB5_1
1208; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
1209; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
1210; GFX7-NEXT:    s_setpc_b64 s[30:31]
; Element index -512 of a float pointer is a -2048-byte displacement, the value
; that appears as offset:-2048 or as the 0xfffff800 / -1 add pair in the
; assertions above.
1211  %gep = getelementptr float, ptr %ptr, i64 -512
; The old value returned by the atomicrmw is never used and the function
; returns void, which is what the "noret" in the function name refers to.
1212  %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
1213  ret void
1214}
1215
1216define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 {
1217; GFX12-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
1218; GFX12:       ; %bb.0:
1219; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1220; GFX12-NEXT:    s_wait_expcnt 0x0
1221; GFX12-NEXT:    s_wait_samplecnt 0x0
1222; GFX12-NEXT:    s_wait_bvhcnt 0x0
1223; GFX12-NEXT:    s_wait_kmcnt 0x0
1224; GFX12-NEXT:    global_wb scope:SCOPE_SYS
1225; GFX12-NEXT:    s_wait_storecnt 0x0
1226; GFX12-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
1227; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1228; GFX12-NEXT:    global_inv scope:SCOPE_SYS
1229; GFX12-NEXT:    s_setpc_b64 s[30:31]
1230;
1231; GFX940-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
1232; GFX940:       ; %bb.0:
1233; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1234; GFX940-NEXT:    buffer_wbl2 sc0 sc1
1235; GFX940-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 sc0 sc1
1236; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1237; GFX940-NEXT:    buffer_inv sc0 sc1
1238; GFX940-NEXT:    s_setpc_b64 s[30:31]
1239;
1240; GFX11-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
1241; GFX11:       ; %bb.0:
1242; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1243; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1244; GFX11-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 glc
1245; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1246; GFX11-NEXT:    buffer_gl1_inv
1247; GFX11-NEXT:    buffer_gl0_inv
1248; GFX11-NEXT:    s_setpc_b64 s[30:31]
1249;
1250; GFX10-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
1251; GFX10:       ; %bb.0:
1252; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1253; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fc, v0
1254; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
1255; GFX10-NEXT:    s_mov_b32 s4, 0
1256; GFX10-NEXT:    flat_load_dword v0, v[3:4]
1257; GFX10-NEXT:  .LBB6_1: ; %atomicrmw.start
1258; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
1259; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1260; GFX10-NEXT:    v_mov_b32_e32 v1, v0
1261; GFX10-NEXT:    v_add_f32_e32 v0, v1, v2
1262; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1263; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
1264; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1265; GFX10-NEXT:    buffer_gl1_inv
1266; GFX10-NEXT:    buffer_gl0_inv
1267; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
1268; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
1269; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
1270; GFX10-NEXT:    s_cbranch_execnz .LBB6_1
1271; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
1272; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1273; GFX10-NEXT:    s_setpc_b64 s[30:31]
1274;
1275; GFX90A-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
1276; GFX90A:       ; %bb.0:
1277; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1278; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fc, v0
1279; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
1280; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
1281; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
1282; GFX90A-NEXT:    ; implicit-def: $vgpr0
1283; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1284; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
1285; GFX90A-NEXT:    s_cbranch_execnz .LBB6_3
1286; GFX90A-NEXT:  ; %bb.1: ; %Flow2
1287; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
1288; GFX90A-NEXT:    s_cbranch_execnz .LBB6_8
1289; GFX90A-NEXT:  .LBB6_2: ; %atomicrmw.phi
1290; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
1291; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1292; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1293; GFX90A-NEXT:  .LBB6_3: ; %atomicrmw.check.private
1294; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
1295; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v5
1296; GFX90A-NEXT:    ; implicit-def: $vgpr0
1297; GFX90A-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1298; GFX90A-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
1299; GFX90A-NEXT:    s_cbranch_execz .LBB6_5
1300; GFX90A-NEXT:  ; %bb.4: ; %atomicrmw.global
1301; GFX90A-NEXT:    buffer_wbl2
1302; GFX90A-NEXT:    global_atomic_add_f32 v0, v[4:5], v2, off glc
1303; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1304; GFX90A-NEXT:    buffer_invl2
1305; GFX90A-NEXT:    buffer_wbinvl1
1306; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
1307; GFX90A-NEXT:    ; implicit-def: $vgpr2
1308; GFX90A-NEXT:  .LBB6_5: ; %Flow
1309; GFX90A-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
1310; GFX90A-NEXT:    s_cbranch_execz .LBB6_7
1311; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.private
1312; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
1313; GFX90A-NEXT:    v_cndmask_b32_e32 v1, -1, v4, vcc
1314; GFX90A-NEXT:    buffer_load_dword v0, v1, s[0:3], 0 offen
1315; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1316; GFX90A-NEXT:    v_add_f32_e32 v2, v0, v2
1317; GFX90A-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
1318; GFX90A-NEXT:  .LBB6_7: ; %Flow1
1319; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
1320; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
1321; GFX90A-NEXT:    ; implicit-def: $vgpr2
1322; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
1323; GFX90A-NEXT:    s_cbranch_execz .LBB6_2
1324; GFX90A-NEXT:  .LBB6_8: ; %atomicrmw.shared
1325; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
1326; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v4, vcc
1327; GFX90A-NEXT:    ds_add_rtn_f32 v0, v0, v2
1328; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
1329; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
1330; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1331; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1332;
1333; GFX908-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
1334; GFX908:       ; %bb.0:
1335; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1336; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2044
1337; GFX908-NEXT:    s_mov_b64 s[4:5], 0
1338; GFX908-NEXT:  .LBB6_1: ; %atomicrmw.start
1339; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
1340; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1341; GFX908-NEXT:    v_mov_b32_e32 v4, v3
1342; GFX908-NEXT:    v_add_f32_e32 v3, v4, v2
1343; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
1344; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1345; GFX908-NEXT:    buffer_wbinvl1
1346; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
1347; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1348; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1349; GFX908-NEXT:    s_cbranch_execnz .LBB6_1
1350; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
1351; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
1352; GFX908-NEXT:    v_mov_b32_e32 v0, v3
1353; GFX908-NEXT:    s_setpc_b64 s[30:31]
1354;
1355; GFX8-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
1356; GFX8:       ; %bb.0:
1357; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1358; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
1359; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
1360; GFX8-NEXT:    flat_load_dword v0, v[3:4]
1361; GFX8-NEXT:    s_mov_b64 s[4:5], 0
1362; GFX8-NEXT:  .LBB6_1: ; %atomicrmw.start
1363; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
1364; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1365; GFX8-NEXT:    v_mov_b32_e32 v1, v0
1366; GFX8-NEXT:    v_add_f32_e32 v0, v1, v2
1367; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
1368; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1369; GFX8-NEXT:    buffer_wbinvl1
1370; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
1371; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1372; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1373; GFX8-NEXT:    s_cbranch_execnz .LBB6_1
1374; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
1375; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1376; GFX8-NEXT:    s_setpc_b64 s[30:31]
1377;
1378; GFX7-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
1379; GFX7:       ; %bb.0:
1380; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1381; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 0x7fc, v0
1382; GFX7-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
1383; GFX7-NEXT:    flat_load_dword v0, v[3:4]
1384; GFX7-NEXT:    s_mov_b64 s[4:5], 0
1385; GFX7-NEXT:  .LBB6_1: ; %atomicrmw.start
1386; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
1387; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1388; GFX7-NEXT:    v_mov_b32_e32 v1, v0
1389; GFX7-NEXT:    v_add_f32_e32 v0, v1, v2
1390; GFX7-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
1391; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1392; GFX7-NEXT:    buffer_wbinvl1
1393; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
1394; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1395; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1396; GFX7-NEXT:    s_cbranch_execnz .LBB6_1
1397; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
1398; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
1399; GFX7-NEXT:    s_setpc_b64 s[30:31]
1400  %gep = getelementptr float, ptr %ptr, i64 511
1401  %result = atomicrmw fadd ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
1402  ret float %result
1403}
1404
; System-scope (no syncscope operand), seq_cst atomicrmw fadd of a float whose
; result is unused, at %ptr + 511 floats (2044 = 0x7fc bytes), carrying both
; !amdgpu.no.fine.grained.memory and !amdgpu.ignore.denormal.mode.
; Expected selection per -mcpu, as recorded by the autogenerated checks below
;   gfx1200, gfx940, gfx1100 - one flat_atomic_add_f32 with offset:2044
;   gfx1010, tonga, hawaii   - explicit 0x7fc address add, then a
;                              flat_atomic_cmpswap retry loop
;   gfx90a, gfx908           - explicit 0x7fc address add, then branches on
;                              src_shared_base / src_private_base into
;                              global_atomic_add_f32, a private
;                              buffer_load / v_add_f32 / buffer_store sequence,
;                              or ds_add_f32
; The check lines are produced by utils/update_llc_test_checks.py; regenerate
; them instead of editing by hand.
1405define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 {
1406; GFX12-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
1407; GFX12:       ; %bb.0:
1408; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1409; GFX12-NEXT:    s_wait_expcnt 0x0
1410; GFX12-NEXT:    s_wait_samplecnt 0x0
1411; GFX12-NEXT:    s_wait_bvhcnt 0x0
1412; GFX12-NEXT:    s_wait_kmcnt 0x0
1413; GFX12-NEXT:    global_wb scope:SCOPE_SYS
1414; GFX12-NEXT:    s_wait_storecnt 0x0
1415; GFX12-NEXT:    flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_SYS
1416; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
1417; GFX12-NEXT:    global_inv scope:SCOPE_SYS
1418; GFX12-NEXT:    s_setpc_b64 s[30:31]
1419;
1420; GFX940-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
1421; GFX940:       ; %bb.0:
1422; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1423; GFX940-NEXT:    buffer_wbl2 sc0 sc1
1424; GFX940-NEXT:    flat_atomic_add_f32 v[0:1], v2 offset:2044 sc1
1425; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1426; GFX940-NEXT:    buffer_inv sc0 sc1
1427; GFX940-NEXT:    s_setpc_b64 s[30:31]
1428;
1429; GFX11-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
1430; GFX11:       ; %bb.0:
1431; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1432; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1433; GFX11-NEXT:    flat_atomic_add_f32 v[0:1], v2 offset:2044
1434; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1435; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1436; GFX11-NEXT:    buffer_gl1_inv
1437; GFX11-NEXT:    buffer_gl0_inv
1438; GFX11-NEXT:    s_setpc_b64 s[30:31]
1439;
1440; GFX10-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
1441; GFX10:       ; %bb.0:
1442; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1443; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
1444; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1445; GFX10-NEXT:    s_mov_b32 s4, 0
1446; GFX10-NEXT:    flat_load_dword v4, v[0:1]
1447; GFX10-NEXT:  .LBB7_1: ; %atomicrmw.start
1448; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
1449; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1450; GFX10-NEXT:    v_add_f32_e32 v3, v4, v2
1451; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1452; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1453; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1454; GFX10-NEXT:    buffer_gl1_inv
1455; GFX10-NEXT:    buffer_gl0_inv
1456; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
1457; GFX10-NEXT:    v_mov_b32_e32 v4, v3
1458; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
1459; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
1460; GFX10-NEXT:    s_cbranch_execnz .LBB7_1
1461; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
1462; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1463; GFX10-NEXT:    s_setpc_b64 s[30:31]
1464;
1465; GFX90A-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
1466; GFX90A:       ; %bb.0:
1467; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1468; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7fc, v0
1469; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1470; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
1471; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
1472; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1473; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
1474; GFX90A-NEXT:    s_cbranch_execnz .LBB7_3
1475; GFX90A-NEXT:  ; %bb.1: ; %Flow2
1476; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
1477; GFX90A-NEXT:    s_cbranch_execnz .LBB7_8
1478; GFX90A-NEXT:  .LBB7_2: ; %atomicrmw.phi
1479; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
1480; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1481; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1482; GFX90A-NEXT:  .LBB7_3: ; %atomicrmw.check.private
1483; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
1484; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v1
1485; GFX90A-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1486; GFX90A-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
1487; GFX90A-NEXT:    s_cbranch_execz .LBB7_5
1488; GFX90A-NEXT:  ; %bb.4: ; %atomicrmw.global
1489; GFX90A-NEXT:    buffer_wbl2
1490; GFX90A-NEXT:    global_atomic_add_f32 v[0:1], v2, off
1491; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1492; GFX90A-NEXT:    buffer_invl2
1493; GFX90A-NEXT:    buffer_wbinvl1
1494; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
1495; GFX90A-NEXT:    ; implicit-def: $vgpr2
1496; GFX90A-NEXT:  .LBB7_5: ; %Flow
1497; GFX90A-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
1498; GFX90A-NEXT:    s_cbranch_execz .LBB7_7
1499; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.private
1500; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
1501; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
1502; GFX90A-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
1503; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1504; GFX90A-NEXT:    v_add_f32_e32 v1, v1, v2
1505; GFX90A-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
1506; GFX90A-NEXT:  .LBB7_7: ; %Flow1
1507; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
1508; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
1509; GFX90A-NEXT:    ; implicit-def: $vgpr2
1510; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
1511; GFX90A-NEXT:    s_cbranch_execz .LBB7_2
1512; GFX90A-NEXT:  .LBB7_8: ; %atomicrmw.shared
1513; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
1514; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
1515; GFX90A-NEXT:    ds_add_f32 v0, v2
1516; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
1517; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
1518; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1519; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1520;
1521; GFX908-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
1522; GFX908:       ; %bb.0:
1523; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1524; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7fc, v0
1525; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1526; GFX908-NEXT:    s_mov_b64 s[4:5], src_shared_base
1527; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
1528; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1529; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
1530; GFX908-NEXT:    s_cbranch_execnz .LBB7_3
1531; GFX908-NEXT:  ; %bb.1: ; %Flow2
1532; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
1533; GFX908-NEXT:    s_cbranch_execnz .LBB7_8
1534; GFX908-NEXT:  .LBB7_2: ; %atomicrmw.phi
1535; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
1536; GFX908-NEXT:    s_waitcnt vmcnt(0)
1537; GFX908-NEXT:    s_setpc_b64 s[30:31]
1538; GFX908-NEXT:  .LBB7_3: ; %atomicrmw.check.private
1539; GFX908-NEXT:    s_mov_b64 s[6:7], src_private_base
1540; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v1
1541; GFX908-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1542; GFX908-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
1543; GFX908-NEXT:    s_cbranch_execz .LBB7_5
1544; GFX908-NEXT:  ; %bb.4: ; %atomicrmw.global
1545; GFX908-NEXT:    global_atomic_add_f32 v[0:1], v2, off
1546; GFX908-NEXT:    s_waitcnt vmcnt(0)
1547; GFX908-NEXT:    buffer_wbinvl1
1548; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
1549; GFX908-NEXT:    ; implicit-def: $vgpr2
1550; GFX908-NEXT:  .LBB7_5: ; %Flow
1551; GFX908-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
1552; GFX908-NEXT:    s_cbranch_execz .LBB7_7
1553; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.private
1554; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
1555; GFX908-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
1556; GFX908-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
1557; GFX908-NEXT:    s_waitcnt vmcnt(0)
1558; GFX908-NEXT:    v_add_f32_e32 v1, v1, v2
1559; GFX908-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
1560; GFX908-NEXT:  .LBB7_7: ; %Flow1
1561; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
1562; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
1563; GFX908-NEXT:    ; implicit-def: $vgpr2
1564; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
1565; GFX908-NEXT:    s_cbranch_execz .LBB7_2
1566; GFX908-NEXT:  .LBB7_8: ; %atomicrmw.shared
1567; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
1568; GFX908-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
1569; GFX908-NEXT:    ds_add_f32 v0, v2
1570; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
1571; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
1572; GFX908-NEXT:    s_waitcnt vmcnt(0)
1573; GFX908-NEXT:    s_setpc_b64 s[30:31]
1574;
1575; GFX8-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
1576; GFX8:       ; %bb.0:
1577; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1578; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
1579; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1580; GFX8-NEXT:    flat_load_dword v4, v[0:1]
1581; GFX8-NEXT:    s_mov_b64 s[4:5], 0
1582; GFX8-NEXT:  .LBB7_1: ; %atomicrmw.start
1583; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
1584; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1585; GFX8-NEXT:    v_add_f32_e32 v3, v4, v2
1586; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1587; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1588; GFX8-NEXT:    buffer_wbinvl1
1589; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
1590; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1591; GFX8-NEXT:    v_mov_b32_e32 v4, v3
1592; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1593; GFX8-NEXT:    s_cbranch_execnz .LBB7_1
1594; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
1595; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1596; GFX8-NEXT:    s_setpc_b64 s[30:31]
1597;
1598; GFX7-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
1599; GFX7:       ; %bb.0:
1600; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1601; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
1602; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1603; GFX7-NEXT:    flat_load_dword v4, v[0:1]
1604; GFX7-NEXT:    s_mov_b64 s[4:5], 0
1605; GFX7-NEXT:  .LBB7_1: ; %atomicrmw.start
1606; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
1607; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1608; GFX7-NEXT:    v_add_f32_e32 v3, v4, v2
1609; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1610; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1611; GFX7-NEXT:    buffer_wbinvl1
1612; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
1613; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1614; GFX7-NEXT:    v_mov_b32_e32 v4, v3
1615; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1616; GFX7-NEXT:    s_cbranch_execnz .LBB7_1
1617; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
1618; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
1619; GFX7-NEXT:    s_setpc_b64 s[30:31]
; Element index 511 of a 4-byte float gives the 2044 (0x7fc) byte offset seen
; in the expected output above.
1620  %gep = getelementptr float, ptr %ptr, i64 511
1621  %unused = atomicrmw fadd ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
1622  ret void
1623}
1624
; Agent-scope, seq_cst atomicrmw fadd of a float whose result is unused, at
; %ptr + 511 floats (2044 = 0x7fc bytes). This function carries no !amdgpu.*
; metadata on the atomicrmw and no #0 attribute group on the definition.
; Expected selection per -mcpu, as recorded by the autogenerated checks below
;   gfx1200, gfx940         - one flat_atomic_add_f32 with offset:2044
;   gfx1100, gfx90a, gfx908 - flat_atomic_cmpswap retry loop addressing with
;                             offset:2044 (flat_atomic_cmpswap_b32 on gfx1100)
;   gfx1010, tonga, hawaii  - explicit 0x7fc address add, then a
;                             flat_atomic_cmpswap retry loop
; The check lines are produced by utils/update_llc_test_checks.py; regenerate
; them instead of editing by hand.
1625define void @flat_agent_atomic_fadd_noret_f32_maybe_remote(ptr %ptr, float %val) {
1626; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32_maybe_remote:
1627; GFX12:       ; %bb.0:
1628; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1629; GFX12-NEXT:    s_wait_expcnt 0x0
1630; GFX12-NEXT:    s_wait_samplecnt 0x0
1631; GFX12-NEXT:    s_wait_bvhcnt 0x0
1632; GFX12-NEXT:    s_wait_kmcnt 0x0
1633; GFX12-NEXT:    s_wait_storecnt 0x0
1634; GFX12-NEXT:    flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV
1635; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
1636; GFX12-NEXT:    global_inv scope:SCOPE_DEV
1637; GFX12-NEXT:    s_setpc_b64 s[30:31]
1638;
1639; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32_maybe_remote:
1640; GFX940:       ; %bb.0:
1641; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1642; GFX940-NEXT:    buffer_wbl2 sc1
1643; GFX940-NEXT:    flat_atomic_add_f32 v[0:1], v2 offset:2044
1644; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1645; GFX940-NEXT:    buffer_inv sc1
1646; GFX940-NEXT:    s_setpc_b64 s[30:31]
1647;
1648; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32_maybe_remote:
1649; GFX11:       ; %bb.0:
1650; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1651; GFX11-NEXT:    flat_load_b32 v4, v[0:1] offset:2044
1652; GFX11-NEXT:    s_mov_b32 s0, 0
1653; GFX11-NEXT:  .LBB8_1: ; %atomicrmw.start
1654; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
1655; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1656; GFX11-NEXT:    v_add_f32_e32 v3, v4, v2
1657; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1658; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
1659; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1660; GFX11-NEXT:    buffer_gl1_inv
1661; GFX11-NEXT:    buffer_gl0_inv
1662; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
1663; GFX11-NEXT:    v_mov_b32_e32 v4, v3
1664; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
1665; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1666; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
1667; GFX11-NEXT:    s_cbranch_execnz .LBB8_1
1668; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
1669; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1670; GFX11-NEXT:    s_setpc_b64 s[30:31]
1671;
1672; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32_maybe_remote:
1673; GFX10:       ; %bb.0:
1674; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1675; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
1676; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1677; GFX10-NEXT:    s_mov_b32 s4, 0
1678; GFX10-NEXT:    flat_load_dword v4, v[0:1]
1679; GFX10-NEXT:  .LBB8_1: ; %atomicrmw.start
1680; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
1681; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1682; GFX10-NEXT:    v_add_f32_e32 v3, v4, v2
1683; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1684; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1685; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1686; GFX10-NEXT:    buffer_gl1_inv
1687; GFX10-NEXT:    buffer_gl0_inv
1688; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
1689; GFX10-NEXT:    v_mov_b32_e32 v4, v3
1690; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
1691; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
1692; GFX10-NEXT:    s_cbranch_execnz .LBB8_1
1693; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
1694; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1695; GFX10-NEXT:    s_setpc_b64 s[30:31]
1696;
1697; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32_maybe_remote:
1698; GFX90A:       ; %bb.0:
1699; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1700; GFX90A-NEXT:    flat_load_dword v5, v[0:1] offset:2044
1701; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
1702; GFX90A-NEXT:  .LBB8_1: ; %atomicrmw.start
1703; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
1704; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1705; GFX90A-NEXT:    v_add_f32_e32 v4, v5, v2
1706; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
1707; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1708; GFX90A-NEXT:    buffer_wbinvl1
1709; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
1710; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1711; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
1712; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1713; GFX90A-NEXT:    s_cbranch_execnz .LBB8_1
1714; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
1715; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
1716; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1717;
1718; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32_maybe_remote:
1719; GFX908:       ; %bb.0:
1720; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1721; GFX908-NEXT:    flat_load_dword v4, v[0:1] offset:2044
1722; GFX908-NEXT:    s_mov_b64 s[4:5], 0
1723; GFX908-NEXT:  .LBB8_1: ; %atomicrmw.start
1724; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
1725; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1726; GFX908-NEXT:    v_add_f32_e32 v3, v4, v2
1727; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
1728; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1729; GFX908-NEXT:    buffer_wbinvl1
1730; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
1731; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1732; GFX908-NEXT:    v_mov_b32_e32 v4, v3
1733; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1734; GFX908-NEXT:    s_cbranch_execnz .LBB8_1
1735; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
1736; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
1737; GFX908-NEXT:    s_setpc_b64 s[30:31]
1738;
1739; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32_maybe_remote:
1740; GFX8:       ; %bb.0:
1741; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1742; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
1743; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1744; GFX8-NEXT:    flat_load_dword v4, v[0:1]
1745; GFX8-NEXT:    s_mov_b64 s[4:5], 0
1746; GFX8-NEXT:  .LBB8_1: ; %atomicrmw.start
1747; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
1748; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1749; GFX8-NEXT:    v_add_f32_e32 v3, v4, v2
1750; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1751; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1752; GFX8-NEXT:    buffer_wbinvl1
1753; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
1754; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1755; GFX8-NEXT:    v_mov_b32_e32 v4, v3
1756; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1757; GFX8-NEXT:    s_cbranch_execnz .LBB8_1
1758; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
1759; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1760; GFX8-NEXT:    s_setpc_b64 s[30:31]
1761;
1762; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32_maybe_remote:
1763; GFX7:       ; %bb.0:
1764; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1765; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
1766; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1767; GFX7-NEXT:    flat_load_dword v4, v[0:1]
1768; GFX7-NEXT:    s_mov_b64 s[4:5], 0
1769; GFX7-NEXT:  .LBB8_1: ; %atomicrmw.start
1770; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
1771; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1772; GFX7-NEXT:    v_add_f32_e32 v3, v4, v2
1773; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1774; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1775; GFX7-NEXT:    buffer_wbinvl1
1776; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
1777; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1778; GFX7-NEXT:    v_mov_b32_e32 v4, v3
1779; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1780; GFX7-NEXT:    s_cbranch_execnz .LBB8_1
1781; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
1782; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
1783; GFX7-NEXT:    s_setpc_b64 s[30:31]
; Element index 511 of a 4-byte float gives the 2044 (0x7fc) byte offset seen
; in the expected output above.
1784  %gep = getelementptr float, ptr %ptr, i64 511
1785  %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst
1786  ret void
1787}
1788
; Agent-scope, seq_cst atomicrmw fadd of a float whose result is unused, at
; %ptr + 511 floats (2044 = 0x7fc bytes), carrying only
; !amdgpu.no.fine.grained.memory (no !amdgpu.ignore.denormal.mode).
; Expected selection per -mcpu, as recorded by the autogenerated checks below
;   gfx1200, gfx940, gfx1100 - one flat_atomic_add_f32 with offset:2044
;   gfx90a, gfx908           - flat_atomic_cmpswap retry loop addressing with
;                              offset:2044
;   gfx1010, tonga, hawaii   - explicit 0x7fc address add, then a
;                              flat_atomic_cmpswap retry loop
; The check lines are produced by utils/update_llc_test_checks.py; regenerate
; them instead of editing by hand.
1789define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #0 {
1790; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory:
1791; GFX12:       ; %bb.0:
1792; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1793; GFX12-NEXT:    s_wait_expcnt 0x0
1794; GFX12-NEXT:    s_wait_samplecnt 0x0
1795; GFX12-NEXT:    s_wait_bvhcnt 0x0
1796; GFX12-NEXT:    s_wait_kmcnt 0x0
1797; GFX12-NEXT:    s_wait_storecnt 0x0
1798; GFX12-NEXT:    flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV
1799; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
1800; GFX12-NEXT:    global_inv scope:SCOPE_DEV
1801; GFX12-NEXT:    s_setpc_b64 s[30:31]
1802;
1803; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory:
1804; GFX940:       ; %bb.0:
1805; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1806; GFX940-NEXT:    buffer_wbl2 sc1
1807; GFX940-NEXT:    flat_atomic_add_f32 v[0:1], v2 offset:2044
1808; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1809; GFX940-NEXT:    buffer_inv sc1
1810; GFX940-NEXT:    s_setpc_b64 s[30:31]
1811;
1812; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory:
1813; GFX11:       ; %bb.0:
1814; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1815; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1816; GFX11-NEXT:    flat_atomic_add_f32 v[0:1], v2 offset:2044
1817; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1818; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1819; GFX11-NEXT:    buffer_gl1_inv
1820; GFX11-NEXT:    buffer_gl0_inv
1821; GFX11-NEXT:    s_setpc_b64 s[30:31]
1822;
1823; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory:
1824; GFX10:       ; %bb.0:
1825; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1826; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
1827; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1828; GFX10-NEXT:    s_mov_b32 s4, 0
1829; GFX10-NEXT:    flat_load_dword v4, v[0:1]
1830; GFX10-NEXT:  .LBB9_1: ; %atomicrmw.start
1831; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
1832; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1833; GFX10-NEXT:    v_add_f32_e32 v3, v4, v2
1834; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1835; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1836; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1837; GFX10-NEXT:    buffer_gl1_inv
1838; GFX10-NEXT:    buffer_gl0_inv
1839; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
1840; GFX10-NEXT:    v_mov_b32_e32 v4, v3
1841; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
1842; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
1843; GFX10-NEXT:    s_cbranch_execnz .LBB9_1
1844; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
1845; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1846; GFX10-NEXT:    s_setpc_b64 s[30:31]
1847;
1848; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory:
1849; GFX90A:       ; %bb.0:
1850; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1851; GFX90A-NEXT:    flat_load_dword v5, v[0:1] offset:2044
1852; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
1853; GFX90A-NEXT:  .LBB9_1: ; %atomicrmw.start
1854; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
1855; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1856; GFX90A-NEXT:    v_add_f32_e32 v4, v5, v2
1857; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
1858; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1859; GFX90A-NEXT:    buffer_wbinvl1
1860; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
1861; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1862; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
1863; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1864; GFX90A-NEXT:    s_cbranch_execnz .LBB9_1
1865; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
1866; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
1867; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1868;
1869; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory:
1870; GFX908:       ; %bb.0:
1871; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1872; GFX908-NEXT:    flat_load_dword v4, v[0:1] offset:2044
1873; GFX908-NEXT:    s_mov_b64 s[4:5], 0
1874; GFX908-NEXT:  .LBB9_1: ; %atomicrmw.start
1875; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
1876; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1877; GFX908-NEXT:    v_add_f32_e32 v3, v4, v2
1878; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
1879; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1880; GFX908-NEXT:    buffer_wbinvl1
1881; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
1882; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1883; GFX908-NEXT:    v_mov_b32_e32 v4, v3
1884; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1885; GFX908-NEXT:    s_cbranch_execnz .LBB9_1
1886; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
1887; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
1888; GFX908-NEXT:    s_setpc_b64 s[30:31]
1889;
1890; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory:
1891; GFX8:       ; %bb.0:
1892; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1893; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
1894; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1895; GFX8-NEXT:    flat_load_dword v4, v[0:1]
1896; GFX8-NEXT:    s_mov_b64 s[4:5], 0
1897; GFX8-NEXT:  .LBB9_1: ; %atomicrmw.start
1898; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
1899; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1900; GFX8-NEXT:    v_add_f32_e32 v3, v4, v2
1901; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1902; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1903; GFX8-NEXT:    buffer_wbinvl1
1904; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
1905; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1906; GFX8-NEXT:    v_mov_b32_e32 v4, v3
1907; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1908; GFX8-NEXT:    s_cbranch_execnz .LBB9_1
1909; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
1910; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1911; GFX8-NEXT:    s_setpc_b64 s[30:31]
1912;
1913; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory:
1914; GFX7:       ; %bb.0:
1915; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1916; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
1917; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1918; GFX7-NEXT:    flat_load_dword v4, v[0:1]
1919; GFX7-NEXT:    s_mov_b64 s[4:5], 0
1920; GFX7-NEXT:  .LBB9_1: ; %atomicrmw.start
1921; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
1922; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1923; GFX7-NEXT:    v_add_f32_e32 v3, v4, v2
1924; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1925; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1926; GFX7-NEXT:    buffer_wbinvl1
1927; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
1928; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1929; GFX7-NEXT:    v_mov_b32_e32 v4, v3
1930; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1931; GFX7-NEXT:    s_cbranch_execnz .LBB9_1
1932; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
1933; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
1934; GFX7-NEXT:    s_setpc_b64 s[30:31]
; Element index 511 of a 4-byte float gives the 2044 (0x7fc) byte offset seen
; in the expected output above.
1935  %gep = getelementptr float, ptr %ptr, i64 511
1936  %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
1937  ret void
1938}
1939
; No-return f32 atomicrmw fadd through a flat pointer, syncscope("agent"),
; seq_cst, on element 511 of %ptr. The atomicrmw carries both
; !amdgpu.no.fine.grained.memory and !amdgpu.ignore.denormal.mode.
; Lowering recorded by the checks below:
;   - gfx12, gfx940 and gfx11 use a single flat_atomic_add_f32 with offset 2044.
;   - gfx90a and gfx908 branch on comparisons of v1 with the src_shared_base and
;     src_private_base values, then use ds_add_f32, a buffer load/add/store
;     sequence, or global_atomic_add_f32.
;   - gfx10, gfx8 and gfx7 add 0x7fc to the address and run a
;     flat_atomic_cmpswap retry loop.
; The check lines are autogenerated (see the header of this file); regenerate
; them with utils/update_llc_test_checks.py instead of editing them by hand.
1940define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 {
1941; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
1942; GFX12:       ; %bb.0:
1943; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1944; GFX12-NEXT:    s_wait_expcnt 0x0
1945; GFX12-NEXT:    s_wait_samplecnt 0x0
1946; GFX12-NEXT:    s_wait_bvhcnt 0x0
1947; GFX12-NEXT:    s_wait_kmcnt 0x0
1948; GFX12-NEXT:    s_wait_storecnt 0x0
1949; GFX12-NEXT:    flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV
1950; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
1951; GFX12-NEXT:    global_inv scope:SCOPE_DEV
1952; GFX12-NEXT:    s_setpc_b64 s[30:31]
1953;
1954; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
1955; GFX940:       ; %bb.0:
1956; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1957; GFX940-NEXT:    buffer_wbl2 sc1
1958; GFX940-NEXT:    flat_atomic_add_f32 v[0:1], v2 offset:2044
1959; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1960; GFX940-NEXT:    buffer_inv sc1
1961; GFX940-NEXT:    s_setpc_b64 s[30:31]
1962;
1963; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
1964; GFX11:       ; %bb.0:
1965; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1966; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1967; GFX11-NEXT:    flat_atomic_add_f32 v[0:1], v2 offset:2044
1968; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1969; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1970; GFX11-NEXT:    buffer_gl1_inv
1971; GFX11-NEXT:    buffer_gl0_inv
1972; GFX11-NEXT:    s_setpc_b64 s[30:31]
1973;
1974; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
1975; GFX10:       ; %bb.0:
1976; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1977; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
1978; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1979; GFX10-NEXT:    s_mov_b32 s4, 0
1980; GFX10-NEXT:    flat_load_dword v4, v[0:1]
1981; GFX10-NEXT:  .LBB10_1: ; %atomicrmw.start
1982; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
1983; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1984; GFX10-NEXT:    v_add_f32_e32 v3, v4, v2
1985; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1986; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1987; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1988; GFX10-NEXT:    buffer_gl1_inv
1989; GFX10-NEXT:    buffer_gl0_inv
1990; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
1991; GFX10-NEXT:    v_mov_b32_e32 v4, v3
1992; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
1993; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
1994; GFX10-NEXT:    s_cbranch_execnz .LBB10_1
1995; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
1996; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1997; GFX10-NEXT:    s_setpc_b64 s[30:31]
1998;
1999; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
2000; GFX90A:       ; %bb.0:
2001; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2002; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7fc, v0
2003; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2004; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
2005; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
2006; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2007; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
2008; GFX90A-NEXT:    s_cbranch_execnz .LBB10_3
2009; GFX90A-NEXT:  ; %bb.1: ; %Flow2
2010; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
2011; GFX90A-NEXT:    s_cbranch_execnz .LBB10_8
2012; GFX90A-NEXT:  .LBB10_2: ; %atomicrmw.phi
2013; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
2014; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2015; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2016; GFX90A-NEXT:  .LBB10_3: ; %atomicrmw.check.private
2017; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
2018; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v1
2019; GFX90A-NEXT:    s_and_saveexec_b64 s[6:7], vcc
2020; GFX90A-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
2021; GFX90A-NEXT:    s_cbranch_execz .LBB10_5
2022; GFX90A-NEXT:  ; %bb.4: ; %atomicrmw.global
2023; GFX90A-NEXT:    global_atomic_add_f32 v[0:1], v2, off
2024; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2025; GFX90A-NEXT:    buffer_wbinvl1
2026; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
2027; GFX90A-NEXT:    ; implicit-def: $vgpr2
2028; GFX90A-NEXT:  .LBB10_5: ; %Flow
2029; GFX90A-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
2030; GFX90A-NEXT:    s_cbranch_execz .LBB10_7
2031; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.private
2032; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
2033; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
2034; GFX90A-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
2035; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2036; GFX90A-NEXT:    v_add_f32_e32 v1, v1, v2
2037; GFX90A-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
2038; GFX90A-NEXT:  .LBB10_7: ; %Flow1
2039; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
2040; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
2041; GFX90A-NEXT:    ; implicit-def: $vgpr2
2042; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
2043; GFX90A-NEXT:    s_cbranch_execz .LBB10_2
2044; GFX90A-NEXT:  .LBB10_8: ; %atomicrmw.shared
2045; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
2046; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
2047; GFX90A-NEXT:    ds_add_f32 v0, v2
2048; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
2049; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
2050; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2051; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2052;
2053; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
2054; GFX908:       ; %bb.0:
2055; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2056; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7fc, v0
2057; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2058; GFX908-NEXT:    s_mov_b64 s[4:5], src_shared_base
2059; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
2060; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2061; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
2062; GFX908-NEXT:    s_cbranch_execnz .LBB10_3
2063; GFX908-NEXT:  ; %bb.1: ; %Flow2
2064; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
2065; GFX908-NEXT:    s_cbranch_execnz .LBB10_8
2066; GFX908-NEXT:  .LBB10_2: ; %atomicrmw.phi
2067; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
2068; GFX908-NEXT:    s_waitcnt vmcnt(0)
2069; GFX908-NEXT:    s_setpc_b64 s[30:31]
2070; GFX908-NEXT:  .LBB10_3: ; %atomicrmw.check.private
2071; GFX908-NEXT:    s_mov_b64 s[6:7], src_private_base
2072; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v1
2073; GFX908-NEXT:    s_and_saveexec_b64 s[6:7], vcc
2074; GFX908-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
2075; GFX908-NEXT:    s_cbranch_execz .LBB10_5
2076; GFX908-NEXT:  ; %bb.4: ; %atomicrmw.global
2077; GFX908-NEXT:    global_atomic_add_f32 v[0:1], v2, off
2078; GFX908-NEXT:    s_waitcnt vmcnt(0)
2079; GFX908-NEXT:    buffer_wbinvl1
2080; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
2081; GFX908-NEXT:    ; implicit-def: $vgpr2
2082; GFX908-NEXT:  .LBB10_5: ; %Flow
2083; GFX908-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
2084; GFX908-NEXT:    s_cbranch_execz .LBB10_7
2085; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.private
2086; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
2087; GFX908-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
2088; GFX908-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
2089; GFX908-NEXT:    s_waitcnt vmcnt(0)
2090; GFX908-NEXT:    v_add_f32_e32 v1, v1, v2
2091; GFX908-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
2092; GFX908-NEXT:  .LBB10_7: ; %Flow1
2093; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
2094; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
2095; GFX908-NEXT:    ; implicit-def: $vgpr2
2096; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
2097; GFX908-NEXT:    s_cbranch_execz .LBB10_2
2098; GFX908-NEXT:  .LBB10_8: ; %atomicrmw.shared
2099; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
2100; GFX908-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
2101; GFX908-NEXT:    ds_add_f32 v0, v2
2102; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
2103; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
2104; GFX908-NEXT:    s_waitcnt vmcnt(0)
2105; GFX908-NEXT:    s_setpc_b64 s[30:31]
2106;
2107; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
2108; GFX8:       ; %bb.0:
2109; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2110; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
2111; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2112; GFX8-NEXT:    flat_load_dword v4, v[0:1]
2113; GFX8-NEXT:    s_mov_b64 s[4:5], 0
2114; GFX8-NEXT:  .LBB10_1: ; %atomicrmw.start
2115; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
2116; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2117; GFX8-NEXT:    v_add_f32_e32 v3, v4, v2
2118; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
2119; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2120; GFX8-NEXT:    buffer_wbinvl1
2121; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
2122; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2123; GFX8-NEXT:    v_mov_b32_e32 v4, v3
2124; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2125; GFX8-NEXT:    s_cbranch_execnz .LBB10_1
2126; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
2127; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2128; GFX8-NEXT:    s_setpc_b64 s[30:31]
2129;
2130; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
2131; GFX7:       ; %bb.0:
2132; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2133; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
2134; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2135; GFX7-NEXT:    flat_load_dword v4, v[0:1]
2136; GFX7-NEXT:    s_mov_b64 s[4:5], 0
2137; GFX7-NEXT:  .LBB10_1: ; %atomicrmw.start
2138; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
2139; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2140; GFX7-NEXT:    v_add_f32_e32 v3, v4, v2
2141; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
2142; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2143; GFX7-NEXT:    buffer_wbinvl1
2144; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
2145; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2146; GFX7-NEXT:    v_mov_b32_e32 v4, v3
2147; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2148; GFX7-NEXT:    s_cbranch_execnz .LBB10_1
2149; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
2150; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
2151; GFX7-NEXT:    s_setpc_b64 s[30:31]
; Element 511 of a float array is byte offset 511 * 4 = 2044 (0x7fc), which is
; the offset / address increment that appears in the checks above.
2152  %gep = getelementptr float, ptr %ptr, i64 511
2153  %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
2154  ret void
2155}
2156
; No-return f32 atomicrmw fadd through a flat pointer, syncscope("agent"),
; seq_cst, on element 511 of %ptr. Only !amdgpu.ignore.denormal.mode is
; attached; there is no !amdgpu.no.fine.grained.memory on this atomicrmw.
; Lowering recorded by the checks below:
;   - gfx12 and gfx940 use a single flat_atomic_add_f32 with offset 2044.
;   - every other tested target runs a flat_atomic_cmpswap retry loop.
;     gfx11, gfx90a and gfx908 keep offset 2044 on the memory instructions;
;     gfx10, gfx8 and gfx7 add 0x7fc to the address registers first.
; The check lines are autogenerated (see the header of this file); regenerate
; them with utils/update_llc_test_checks.py instead of editing them by hand.
2157define void @flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 {
2158; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode:
2159; GFX12:       ; %bb.0:
2160; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2161; GFX12-NEXT:    s_wait_expcnt 0x0
2162; GFX12-NEXT:    s_wait_samplecnt 0x0
2163; GFX12-NEXT:    s_wait_bvhcnt 0x0
2164; GFX12-NEXT:    s_wait_kmcnt 0x0
2165; GFX12-NEXT:    s_wait_storecnt 0x0
2166; GFX12-NEXT:    flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV
2167; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
2168; GFX12-NEXT:    global_inv scope:SCOPE_DEV
2169; GFX12-NEXT:    s_setpc_b64 s[30:31]
2170;
2171; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode:
2172; GFX940:       ; %bb.0:
2173; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2174; GFX940-NEXT:    buffer_wbl2 sc1
2175; GFX940-NEXT:    flat_atomic_add_f32 v[0:1], v2 offset:2044
2176; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2177; GFX940-NEXT:    buffer_inv sc1
2178; GFX940-NEXT:    s_setpc_b64 s[30:31]
2179;
2180; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode:
2181; GFX11:       ; %bb.0:
2182; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2183; GFX11-NEXT:    flat_load_b32 v4, v[0:1] offset:2044
2184; GFX11-NEXT:    s_mov_b32 s0, 0
2185; GFX11-NEXT:  .LBB11_1: ; %atomicrmw.start
2186; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
2187; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2188; GFX11-NEXT:    v_add_f32_e32 v3, v4, v2
2189; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2190; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
2191; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2192; GFX11-NEXT:    buffer_gl1_inv
2193; GFX11-NEXT:    buffer_gl0_inv
2194; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
2195; GFX11-NEXT:    v_mov_b32_e32 v4, v3
2196; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
2197; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2198; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
2199; GFX11-NEXT:    s_cbranch_execnz .LBB11_1
2200; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
2201; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
2202; GFX11-NEXT:    s_setpc_b64 s[30:31]
2203;
2204; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode:
2205; GFX10:       ; %bb.0:
2206; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2207; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
2208; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2209; GFX10-NEXT:    s_mov_b32 s4, 0
2210; GFX10-NEXT:    flat_load_dword v4, v[0:1]
2211; GFX10-NEXT:  .LBB11_1: ; %atomicrmw.start
2212; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
2213; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2214; GFX10-NEXT:    v_add_f32_e32 v3, v4, v2
2215; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2216; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
2217; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2218; GFX10-NEXT:    buffer_gl1_inv
2219; GFX10-NEXT:    buffer_gl0_inv
2220; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
2221; GFX10-NEXT:    v_mov_b32_e32 v4, v3
2222; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
2223; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
2224; GFX10-NEXT:    s_cbranch_execnz .LBB11_1
2225; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
2226; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2227; GFX10-NEXT:    s_setpc_b64 s[30:31]
2228;
2229; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode:
2230; GFX90A:       ; %bb.0:
2231; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2232; GFX90A-NEXT:    flat_load_dword v5, v[0:1] offset:2044
2233; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
2234; GFX90A-NEXT:  .LBB11_1: ; %atomicrmw.start
2235; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
2236; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2237; GFX90A-NEXT:    v_add_f32_e32 v4, v5, v2
2238; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
2239; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2240; GFX90A-NEXT:    buffer_wbinvl1
2241; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
2242; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2243; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
2244; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2245; GFX90A-NEXT:    s_cbranch_execnz .LBB11_1
2246; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
2247; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
2248; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2249;
2250; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode:
2251; GFX908:       ; %bb.0:
2252; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2253; GFX908-NEXT:    flat_load_dword v4, v[0:1] offset:2044
2254; GFX908-NEXT:    s_mov_b64 s[4:5], 0
2255; GFX908-NEXT:  .LBB11_1: ; %atomicrmw.start
2256; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
2257; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2258; GFX908-NEXT:    v_add_f32_e32 v3, v4, v2
2259; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
2260; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2261; GFX908-NEXT:    buffer_wbinvl1
2262; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
2263; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2264; GFX908-NEXT:    v_mov_b32_e32 v4, v3
2265; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2266; GFX908-NEXT:    s_cbranch_execnz .LBB11_1
2267; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
2268; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
2269; GFX908-NEXT:    s_setpc_b64 s[30:31]
2270;
2271; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode:
2272; GFX8:       ; %bb.0:
2273; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2274; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
2275; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2276; GFX8-NEXT:    flat_load_dword v4, v[0:1]
2277; GFX8-NEXT:    s_mov_b64 s[4:5], 0
2278; GFX8-NEXT:  .LBB11_1: ; %atomicrmw.start
2279; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
2280; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2281; GFX8-NEXT:    v_add_f32_e32 v3, v4, v2
2282; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
2283; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2284; GFX8-NEXT:    buffer_wbinvl1
2285; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
2286; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2287; GFX8-NEXT:    v_mov_b32_e32 v4, v3
2288; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2289; GFX8-NEXT:    s_cbranch_execnz .LBB11_1
2290; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
2291; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2292; GFX8-NEXT:    s_setpc_b64 s[30:31]
2293;
2294; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode:
2295; GFX7:       ; %bb.0:
2296; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2297; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
2298; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2299; GFX7-NEXT:    flat_load_dword v4, v[0:1]
2300; GFX7-NEXT:    s_mov_b64 s[4:5], 0
2301; GFX7-NEXT:  .LBB11_1: ; %atomicrmw.start
2302; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
2303; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2304; GFX7-NEXT:    v_add_f32_e32 v3, v4, v2
2305; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
2306; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2307; GFX7-NEXT:    buffer_wbinvl1
2308; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
2309; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2310; GFX7-NEXT:    v_mov_b32_e32 v4, v3
2311; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2312; GFX7-NEXT:    s_cbranch_execnz .LBB11_1
2313; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
2314; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
2315; GFX7-NEXT:    s_setpc_b64 s[30:31]
; Element 511 of a float array is byte offset 511 * 4 = 2044 (0x7fc), which is
; the offset / address increment that appears in the checks above.
2316  %gep = getelementptr float, ptr %ptr, i64 511
2317  %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.ignore.denormal.mode !0
2318  ret void
2319}
2320
2321; --------------------------------------------------------------------
2322; float with ftz/daz
2323; --------------------------------------------------------------------
2324
; Value-returning f32 atomicrmw fadd directly on the flat pointer %ptr (no
; offset), syncscope("agent"), seq_cst, with !amdgpu.no.fine.grained.memory.
; The function uses attribute group #1, unlike the #0 functions before the
; "float with ftz/daz" banner; the definition of #1 is outside this part of
; the file.
; Lowering recorded by the checks below:
;   - gfx12, gfx940 and gfx11 use a single returning flat_atomic_add_f32
;     (th:TH_ATOMIC_RETURN, sc0 and glc respectively) writing v0.
;   - gfx90a branches on comparisons of v1 with the src_shared_base and
;     src_private_base values, then uses ds_add_rtn_f32, a buffer
;     load/add/store sequence, or global_atomic_add_f32 with glc; the result is
;     collected in v3 and copied to v0.
;   - gfx10, gfx908, gfx8 and gfx7 run a flat_atomic_cmpswap retry loop and copy
;     the final v3 to v0.
; The check lines are autogenerated (see the header of this file); regenerate
; them with utils/update_llc_test_checks.py instead of editing them by hand.
2325define float @flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #1 {
2326; GFX12-LABEL: flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory:
2327; GFX12:       ; %bb.0:
2328; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2329; GFX12-NEXT:    s_wait_expcnt 0x0
2330; GFX12-NEXT:    s_wait_samplecnt 0x0
2331; GFX12-NEXT:    s_wait_bvhcnt 0x0
2332; GFX12-NEXT:    s_wait_kmcnt 0x0
2333; GFX12-NEXT:    s_wait_storecnt 0x0
2334; GFX12-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
2335; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2336; GFX12-NEXT:    global_inv scope:SCOPE_DEV
2337; GFX12-NEXT:    s_setpc_b64 s[30:31]
2338;
2339; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory:
2340; GFX940:       ; %bb.0:
2341; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2342; GFX940-NEXT:    buffer_wbl2 sc1
2343; GFX940-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 sc0
2344; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2345; GFX940-NEXT:    buffer_inv sc1
2346; GFX940-NEXT:    s_setpc_b64 s[30:31]
2347;
2348; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory:
2349; GFX11:       ; %bb.0:
2350; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2351; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2352; GFX11-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 glc
2353; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2354; GFX11-NEXT:    buffer_gl1_inv
2355; GFX11-NEXT:    buffer_gl0_inv
2356; GFX11-NEXT:    s_setpc_b64 s[30:31]
2357;
2358; GFX10-LABEL: flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory:
2359; GFX10:       ; %bb.0:
2360; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2361; GFX10-NEXT:    flat_load_dword v3, v[0:1]
2362; GFX10-NEXT:    s_mov_b32 s4, 0
2363; GFX10-NEXT:  .LBB12_1: ; %atomicrmw.start
2364; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
2365; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2366; GFX10-NEXT:    v_mov_b32_e32 v4, v3
2367; GFX10-NEXT:    v_add_f32_e32 v3, v4, v2
2368; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2369; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
2370; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2371; GFX10-NEXT:    buffer_gl1_inv
2372; GFX10-NEXT:    buffer_gl0_inv
2373; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
2374; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
2375; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
2376; GFX10-NEXT:    s_cbranch_execnz .LBB12_1
2377; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
2378; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2379; GFX10-NEXT:    v_mov_b32_e32 v0, v3
2380; GFX10-NEXT:    s_setpc_b64 s[30:31]
2381;
2382; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory:
2383; GFX90A:       ; %bb.0:
2384; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2385; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
2386; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
2387; GFX90A-NEXT:    ; implicit-def: $vgpr3
2388; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2389; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
2390; GFX90A-NEXT:    s_cbranch_execz .LBB12_6
2391; GFX90A-NEXT:  ; %bb.1: ; %atomicrmw.check.private
2392; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
2393; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v1
2394; GFX90A-NEXT:    ; implicit-def: $vgpr3
2395; GFX90A-NEXT:    s_and_saveexec_b64 s[6:7], vcc
2396; GFX90A-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
2397; GFX90A-NEXT:    s_cbranch_execz .LBB12_3
2398; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.global
2399; GFX90A-NEXT:    global_atomic_add_f32 v3, v[0:1], v2, off glc
2400; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2401; GFX90A-NEXT:    buffer_wbinvl1
2402; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
2403; GFX90A-NEXT:    ; implicit-def: $vgpr2
2404; GFX90A-NEXT:  .LBB12_3: ; %Flow
2405; GFX90A-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
2406; GFX90A-NEXT:    s_cbranch_execz .LBB12_5
2407; GFX90A-NEXT:  ; %bb.4: ; %atomicrmw.private
2408; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
2409; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
2410; GFX90A-NEXT:    buffer_load_dword v3, v0, s[0:3], 0 offen
2411; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2412; GFX90A-NEXT:    v_add_f32_e32 v1, v3, v2
2413; GFX90A-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
2414; GFX90A-NEXT:  .LBB12_5: ; %Flow1
2415; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
2416; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
2417; GFX90A-NEXT:    ; implicit-def: $vgpr2
2418; GFX90A-NEXT:  .LBB12_6: ; %Flow2
2419; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
2420; GFX90A-NEXT:    s_cbranch_execz .LBB12_8
2421; GFX90A-NEXT:  ; %bb.7: ; %atomicrmw.shared
2422; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
2423; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
2424; GFX90A-NEXT:    ds_add_rtn_f32 v3, v0, v2
2425; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
2426; GFX90A-NEXT:  .LBB12_8: ; %atomicrmw.phi
2427; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
2428; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
2429; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2430; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2431;
2432; GFX908-LABEL: flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory:
2433; GFX908:       ; %bb.0:
2434; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2435; GFX908-NEXT:    flat_load_dword v3, v[0:1]
2436; GFX908-NEXT:    s_mov_b64 s[4:5], 0
2437; GFX908-NEXT:  .LBB12_1: ; %atomicrmw.start
2438; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
2439; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2440; GFX908-NEXT:    v_mov_b32_e32 v4, v3
2441; GFX908-NEXT:    v_add_f32_e32 v3, v4, v2
2442; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
2443; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2444; GFX908-NEXT:    buffer_wbinvl1
2445; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
2446; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2447; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2448; GFX908-NEXT:    s_cbranch_execnz .LBB12_1
2449; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
2450; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
2451; GFX908-NEXT:    v_mov_b32_e32 v0, v3
2452; GFX908-NEXT:    s_setpc_b64 s[30:31]
2453;
2454; GFX8-LABEL: flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory:
2455; GFX8:       ; %bb.0:
2456; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2457; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2458; GFX8-NEXT:    s_mov_b64 s[4:5], 0
2459; GFX8-NEXT:  .LBB12_1: ; %atomicrmw.start
2460; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
2461; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2462; GFX8-NEXT:    v_mov_b32_e32 v4, v3
2463; GFX8-NEXT:    v_add_f32_e32 v3, v4, v2
2464; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
2465; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2466; GFX8-NEXT:    buffer_wbinvl1
2467; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
2468; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2469; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2470; GFX8-NEXT:    s_cbranch_execnz .LBB12_1
2471; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
2472; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2473; GFX8-NEXT:    v_mov_b32_e32 v0, v3
2474; GFX8-NEXT:    s_setpc_b64 s[30:31]
2475;
2476; GFX7-LABEL: flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory:
2477; GFX7:       ; %bb.0:
2478; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2479; GFX7-NEXT:    flat_load_dword v3, v[0:1]
2480; GFX7-NEXT:    s_mov_b64 s[4:5], 0
2481; GFX7-NEXT:  .LBB12_1: ; %atomicrmw.start
2482; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
2483; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2484; GFX7-NEXT:    v_mov_b32_e32 v4, v3
2485; GFX7-NEXT:    v_add_f32_e32 v3, v4, v2
2486; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
2487; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2488; GFX7-NEXT:    buffer_wbinvl1
2489; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
2490; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2491; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2492; GFX7-NEXT:    s_cbranch_execnz .LBB12_1
2493; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
2494; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
2495; GFX7-NEXT:    v_mov_b32_e32 v0, v3
2496; GFX7-NEXT:    s_setpc_b64 s[30:31]
; The value produced by the atomicrmw is the function's return value.
2497  %result = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
2498  ret float %result
2499}
2500
2501define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #1 {
2502; GFX12-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
2503; GFX12:       ; %bb.0:
2504; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2505; GFX12-NEXT:    s_wait_expcnt 0x0
2506; GFX12-NEXT:    s_wait_samplecnt 0x0
2507; GFX12-NEXT:    s_wait_bvhcnt 0x0
2508; GFX12-NEXT:    s_wait_kmcnt 0x0
2509; GFX12-NEXT:    s_wait_storecnt 0x0
2510; GFX12-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
2511; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2512; GFX12-NEXT:    global_inv scope:SCOPE_DEV
2513; GFX12-NEXT:    s_setpc_b64 s[30:31]
2514;
2515; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
2516; GFX940:       ; %bb.0:
2517; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2518; GFX940-NEXT:    buffer_wbl2 sc1
2519; GFX940-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 sc0
2520; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2521; GFX940-NEXT:    buffer_inv sc1
2522; GFX940-NEXT:    s_setpc_b64 s[30:31]
2523;
2524; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
2525; GFX11:       ; %bb.0:
2526; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2527; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2528; GFX11-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 glc
2529; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2530; GFX11-NEXT:    buffer_gl1_inv
2531; GFX11-NEXT:    buffer_gl0_inv
2532; GFX11-NEXT:    s_setpc_b64 s[30:31]
2533;
2534; GFX10-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
2535; GFX10:       ; %bb.0:
2536; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2537; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fc, v0
2538; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
2539; GFX10-NEXT:    s_mov_b32 s4, 0
2540; GFX10-NEXT:    flat_load_dword v0, v[3:4]
2541; GFX10-NEXT:  .LBB13_1: ; %atomicrmw.start
2542; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
2543; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2544; GFX10-NEXT:    v_mov_b32_e32 v1, v0
2545; GFX10-NEXT:    v_add_f32_e32 v0, v1, v2
2546; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2547; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
2548; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2549; GFX10-NEXT:    buffer_gl1_inv
2550; GFX10-NEXT:    buffer_gl0_inv
2551; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
2552; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
2553; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
2554; GFX10-NEXT:    s_cbranch_execnz .LBB13_1
2555; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
2556; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2557; GFX10-NEXT:    s_setpc_b64 s[30:31]
2558;
2559; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
2560; GFX90A:       ; %bb.0:
2561; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2562; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fc, v0
2563; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
2564; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
2565; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
2566; GFX90A-NEXT:    ; implicit-def: $vgpr0
2567; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2568; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
2569; GFX90A-NEXT:    s_cbranch_execnz .LBB13_3
2570; GFX90A-NEXT:  ; %bb.1: ; %Flow2
2571; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
2572; GFX90A-NEXT:    s_cbranch_execnz .LBB13_8
2573; GFX90A-NEXT:  .LBB13_2: ; %atomicrmw.phi
2574; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
2575; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2576; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2577; GFX90A-NEXT:  .LBB13_3: ; %atomicrmw.check.private
2578; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
2579; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v5
2580; GFX90A-NEXT:    ; implicit-def: $vgpr0
2581; GFX90A-NEXT:    s_and_saveexec_b64 s[6:7], vcc
2582; GFX90A-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
2583; GFX90A-NEXT:    s_cbranch_execz .LBB13_5
2584; GFX90A-NEXT:  ; %bb.4: ; %atomicrmw.global
2585; GFX90A-NEXT:    global_atomic_add_f32 v0, v[4:5], v2, off glc
2586; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2587; GFX90A-NEXT:    buffer_wbinvl1
2588; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
2589; GFX90A-NEXT:    ; implicit-def: $vgpr2
2590; GFX90A-NEXT:  .LBB13_5: ; %Flow
2591; GFX90A-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
2592; GFX90A-NEXT:    s_cbranch_execz .LBB13_7
2593; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.private
2594; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
2595; GFX90A-NEXT:    v_cndmask_b32_e32 v1, -1, v4, vcc
2596; GFX90A-NEXT:    buffer_load_dword v0, v1, s[0:3], 0 offen
2597; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2598; GFX90A-NEXT:    v_add_f32_e32 v2, v0, v2
2599; GFX90A-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
2600; GFX90A-NEXT:  .LBB13_7: ; %Flow1
2601; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
2602; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
2603; GFX90A-NEXT:    ; implicit-def: $vgpr2
2604; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
2605; GFX90A-NEXT:    s_cbranch_execz .LBB13_2
2606; GFX90A-NEXT:  .LBB13_8: ; %atomicrmw.shared
2607; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
2608; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v4, vcc
2609; GFX90A-NEXT:    ds_add_rtn_f32 v0, v0, v2
2610; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
2611; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
2612; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2613; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2614;
2615; GFX908-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
2616; GFX908:       ; %bb.0:
2617; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2618; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2044
2619; GFX908-NEXT:    s_mov_b64 s[4:5], 0
2620; GFX908-NEXT:  .LBB13_1: ; %atomicrmw.start
2621; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
2622; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2623; GFX908-NEXT:    v_mov_b32_e32 v4, v3
2624; GFX908-NEXT:    v_add_f32_e32 v3, v4, v2
2625; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
2626; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2627; GFX908-NEXT:    buffer_wbinvl1
2628; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
2629; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2630; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2631; GFX908-NEXT:    s_cbranch_execnz .LBB13_1
2632; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
2633; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
2634; GFX908-NEXT:    v_mov_b32_e32 v0, v3
2635; GFX908-NEXT:    s_setpc_b64 s[30:31]
2636;
2637; GFX8-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
2638; GFX8:       ; %bb.0:
2639; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2640; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
2641; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
2642; GFX8-NEXT:    flat_load_dword v0, v[3:4]
2643; GFX8-NEXT:    s_mov_b64 s[4:5], 0
2644; GFX8-NEXT:  .LBB13_1: ; %atomicrmw.start
2645; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
2646; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2647; GFX8-NEXT:    v_mov_b32_e32 v1, v0
2648; GFX8-NEXT:    v_add_f32_e32 v0, v1, v2
2649; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
2650; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2651; GFX8-NEXT:    buffer_wbinvl1
2652; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
2653; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2654; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2655; GFX8-NEXT:    s_cbranch_execnz .LBB13_1
2656; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
2657; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2658; GFX8-NEXT:    s_setpc_b64 s[30:31]
2659;
2660; GFX7-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
2661; GFX7:       ; %bb.0:
2662; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2663; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 0x7fc, v0
2664; GFX7-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
2665; GFX7-NEXT:    flat_load_dword v0, v[3:4]
2666; GFX7-NEXT:    s_mov_b64 s[4:5], 0
2667; GFX7-NEXT:  .LBB13_1: ; %atomicrmw.start
2668; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
2669; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2670; GFX7-NEXT:    v_mov_b32_e32 v1, v0
2671; GFX7-NEXT:    v_add_f32_e32 v0, v1, v2
2672; GFX7-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
2673; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2674; GFX7-NEXT:    buffer_wbinvl1
2675; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
2676; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2677; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2678; GFX7-NEXT:    s_cbranch_execnz .LBB13_1
2679; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
2680; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
2681; GFX7-NEXT:    s_setpc_b64 s[30:31]
2682  %gep = getelementptr float, ptr %ptr, i64 511
2683  %result = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
2684  ret float %result
2685}
2686
; Returning, agent-scope, seq_cst `atomicrmw fadd` of a float through a flat
; pointer at a negative offset: the GEP steps back 512 floats (-2048 bytes) and
; the value produced by the atomicrmw is returned. The atomicrmw is tagged with
; !amdgpu.no.fine.grained.memory.
; (review) attribute group #1 and metadata node !0 are defined outside this
; view; going by the function name, #1 presumably selects the ftz denormal
; mode -- confirm against the definitions at the end of the file.
;
; What the expectations below pin down, per -mcpu used by the file's llc
; invocations:
;   gfx1200                        - offset:-2048 is folded into a returning
;                                    flat_atomic_add_f32.
;   gfx940, gfx1100                - the address is adjusted with a 64-bit add
;                                    (0xfffff800 low, -1 high), then a returning
;                                    flat_atomic_add_f32 is issued.
;   gfx1010, gfx908, tonga, hawaii - flat_load_dword followed by a
;                                    flat_atomic_cmpswap retry loop.
;   gfx90a                         - compares the high address dword against
;                                    src_shared_base / src_private_base and uses
;                                    ds_add_rtn_f32, a buffer load/add/store
;                                    sequence, or global_atomic_add_f32.
; (review) the gfx908 expectation materialises the adjusted address twice
; (v[3:4] and v[0:1]); presumably this mirrors llc output, since the assertions
; are autogenerated, rather than being a mistake in the test.
; The assertion lines are produced by utils/update_llc_test_checks.py;
; regenerate them instead of editing by hand.
2687define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #1 {
2688; GFX12-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
2689; GFX12:       ; %bb.0:
2690; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2691; GFX12-NEXT:    s_wait_expcnt 0x0
2692; GFX12-NEXT:    s_wait_samplecnt 0x0
2693; GFX12-NEXT:    s_wait_bvhcnt 0x0
2694; GFX12-NEXT:    s_wait_kmcnt 0x0
2695; GFX12-NEXT:    s_wait_storecnt 0x0
2696; GFX12-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
2697; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2698; GFX12-NEXT:    global_inv scope:SCOPE_DEV
2699; GFX12-NEXT:    s_setpc_b64 s[30:31]
2700;
2701; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
2702; GFX940:       ; %bb.0:
2703; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2704; GFX940-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
2705; GFX940-NEXT:    s_nop 1
2706; GFX940-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
2707; GFX940-NEXT:    buffer_wbl2 sc1
2708; GFX940-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 sc0
2709; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2710; GFX940-NEXT:    buffer_inv sc1
2711; GFX940-NEXT:    s_setpc_b64 s[30:31]
2712;
2713; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
2714; GFX11:       ; %bb.0:
2715; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2716; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
2717; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
2718; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2719; GFX11-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 glc
2720; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2721; GFX11-NEXT:    buffer_gl1_inv
2722; GFX11-NEXT:    buffer_gl0_inv
2723; GFX11-NEXT:    s_setpc_b64 s[30:31]
2724;
2725; GFX10-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
2726; GFX10:       ; %bb.0:
2727; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2728; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
2729; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
2730; GFX10-NEXT:    s_mov_b32 s4, 0
2731; GFX10-NEXT:    flat_load_dword v0, v[3:4]
2732; GFX10-NEXT:  .LBB14_1: ; %atomicrmw.start
2733; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
2734; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2735; GFX10-NEXT:    v_mov_b32_e32 v1, v0
2736; GFX10-NEXT:    v_add_f32_e32 v0, v1, v2
2737; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2738; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
2739; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2740; GFX10-NEXT:    buffer_gl1_inv
2741; GFX10-NEXT:    buffer_gl0_inv
2742; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
2743; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
2744; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
2745; GFX10-NEXT:    s_cbranch_execnz .LBB14_1
2746; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
2747; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2748; GFX10-NEXT:    s_setpc_b64 s[30:31]
2749;
2750; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
2751; GFX90A:       ; %bb.0:
2752; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2753; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
2754; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
2755; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
2756; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
2757; GFX90A-NEXT:    ; implicit-def: $vgpr0
2758; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2759; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
2760; GFX90A-NEXT:    s_cbranch_execnz .LBB14_3
2761; GFX90A-NEXT:  ; %bb.1: ; %Flow2
2762; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
2763; GFX90A-NEXT:    s_cbranch_execnz .LBB14_8
2764; GFX90A-NEXT:  .LBB14_2: ; %atomicrmw.phi
2765; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
2766; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2767; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2768; GFX90A-NEXT:  .LBB14_3: ; %atomicrmw.check.private
2769; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
2770; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v5
2771; GFX90A-NEXT:    ; implicit-def: $vgpr0
2772; GFX90A-NEXT:    s_and_saveexec_b64 s[6:7], vcc
2773; GFX90A-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
2774; GFX90A-NEXT:    s_cbranch_execz .LBB14_5
2775; GFX90A-NEXT:  ; %bb.4: ; %atomicrmw.global
2776; GFX90A-NEXT:    global_atomic_add_f32 v0, v[4:5], v2, off glc
2777; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2778; GFX90A-NEXT:    buffer_wbinvl1
2779; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
2780; GFX90A-NEXT:    ; implicit-def: $vgpr2
2781; GFX90A-NEXT:  .LBB14_5: ; %Flow
2782; GFX90A-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
2783; GFX90A-NEXT:    s_cbranch_execz .LBB14_7
2784; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.private
2785; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
2786; GFX90A-NEXT:    v_cndmask_b32_e32 v1, -1, v4, vcc
2787; GFX90A-NEXT:    buffer_load_dword v0, v1, s[0:3], 0 offen
2788; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2789; GFX90A-NEXT:    v_add_f32_e32 v2, v0, v2
2790; GFX90A-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
2791; GFX90A-NEXT:  .LBB14_7: ; %Flow1
2792; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
2793; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
2794; GFX90A-NEXT:    ; implicit-def: $vgpr2
2795; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
2796; GFX90A-NEXT:    s_cbranch_execz .LBB14_2
2797; GFX90A-NEXT:  .LBB14_8: ; %atomicrmw.shared
2798; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
2799; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v4, vcc
2800; GFX90A-NEXT:    ds_add_rtn_f32 v0, v0, v2
2801; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
2802; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
2803; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2804; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2805;
2806; GFX908-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
2807; GFX908:       ; %bb.0:
2808; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2809; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
2810; GFX908-NEXT:    v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
2811; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
2812; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
2813; GFX908-NEXT:    flat_load_dword v0, v[0:1]
2814; GFX908-NEXT:    s_mov_b64 s[4:5], 0
2815; GFX908-NEXT:  .LBB14_1: ; %atomicrmw.start
2816; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
2817; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2818; GFX908-NEXT:    v_mov_b32_e32 v1, v0
2819; GFX908-NEXT:    v_add_f32_e32 v0, v1, v2
2820; GFX908-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
2821; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2822; GFX908-NEXT:    buffer_wbinvl1
2823; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
2824; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2825; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2826; GFX908-NEXT:    s_cbranch_execnz .LBB14_1
2827; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
2828; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
2829; GFX908-NEXT:    s_setpc_b64 s[30:31]
2830;
2831; GFX8-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
2832; GFX8:       ; %bb.0:
2833; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2834; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
2835; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, -1, v1, vcc
2836; GFX8-NEXT:    flat_load_dword v0, v[3:4]
2837; GFX8-NEXT:    s_mov_b64 s[4:5], 0
2838; GFX8-NEXT:  .LBB14_1: ; %atomicrmw.start
2839; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
2840; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2841; GFX8-NEXT:    v_mov_b32_e32 v1, v0
2842; GFX8-NEXT:    v_add_f32_e32 v0, v1, v2
2843; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
2844; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2845; GFX8-NEXT:    buffer_wbinvl1
2846; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
2847; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2848; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2849; GFX8-NEXT:    s_cbranch_execnz .LBB14_1
2850; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
2851; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2852; GFX8-NEXT:    s_setpc_b64 s[30:31]
2853;
2854; GFX7-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
2855; GFX7:       ; %bb.0:
2856; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2857; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 0xfffff800, v0
2858; GFX7-NEXT:    v_addc_u32_e32 v4, vcc, -1, v1, vcc
2859; GFX7-NEXT:    flat_load_dword v0, v[3:4]
2860; GFX7-NEXT:    s_mov_b64 s[4:5], 0
2861; GFX7-NEXT:  .LBB14_1: ; %atomicrmw.start
2862; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
2863; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2864; GFX7-NEXT:    v_mov_b32_e32 v1, v0
2865; GFX7-NEXT:    v_add_f32_e32 v0, v1, v2
2866; GFX7-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
2867; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2868; GFX7-NEXT:    buffer_wbinvl1
2869; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
2870; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2871; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2872; GFX7-NEXT:    s_cbranch_execnz .LBB14_1
2873; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
2874; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
2875; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; -512 float elements = -2048 bytes; this is the offset the expectations
  ; above show either as an instruction immediate (offset:-2048) or as the
  ; 0xfffff800 / -1 pair added to the 64-bit address.
2876  %gep = getelementptr float, ptr %ptr, i64 -512
2877  %result = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
2878  ret float %result
2879}
2880
; No-return, agent-scope, seq_cst `atomicrmw fadd` of a float through a flat
; pointer with no offset: the atomicrmw result is left unused and the function
; returns void. The atomicrmw is tagged with !amdgpu.no.fine.grained.memory.
; (review) attribute group #1 and metadata node !0 are defined outside this
; view; going by the function name, #1 presumably selects the ftz denormal
; mode -- confirm against the definitions at the end of the file.
;
; What the expectations below pin down, per -mcpu used by the file's llc
; invocations:
;   gfx1200, gfx940, gfx1100 - a single flat_atomic_add_f32 with no destination
;                              register.
;   gfx1010, tonga, hawaii   - flat_load_dword followed by a
;                              flat_atomic_cmpswap retry loop.
;   gfx90a, gfx908           - compares the high address dword against
;                              src_shared_base / src_private_base and uses
;                              ds_add_f32, a buffer load/add/store sequence, or
;                              global_atomic_add_f32 without a destination.
; The assertion lines are produced by utils/update_llc_test_checks.py;
; regenerate them instead of editing by hand.
2881define void @flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #1 {
2882; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory:
2883; GFX12:       ; %bb.0:
2884; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2885; GFX12-NEXT:    s_wait_expcnt 0x0
2886; GFX12-NEXT:    s_wait_samplecnt 0x0
2887; GFX12-NEXT:    s_wait_bvhcnt 0x0
2888; GFX12-NEXT:    s_wait_kmcnt 0x0
2889; GFX12-NEXT:    s_wait_storecnt 0x0
2890; GFX12-NEXT:    flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV
2891; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
2892; GFX12-NEXT:    global_inv scope:SCOPE_DEV
2893; GFX12-NEXT:    s_setpc_b64 s[30:31]
2894;
2895; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory:
2896; GFX940:       ; %bb.0:
2897; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2898; GFX940-NEXT:    buffer_wbl2 sc1
2899; GFX940-NEXT:    flat_atomic_add_f32 v[0:1], v2
2900; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2901; GFX940-NEXT:    buffer_inv sc1
2902; GFX940-NEXT:    s_setpc_b64 s[30:31]
2903;
2904; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory:
2905; GFX11:       ; %bb.0:
2906; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2907; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2908; GFX11-NEXT:    flat_atomic_add_f32 v[0:1], v2
2909; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2910; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2911; GFX11-NEXT:    buffer_gl1_inv
2912; GFX11-NEXT:    buffer_gl0_inv
2913; GFX11-NEXT:    s_setpc_b64 s[30:31]
2914;
2915; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory:
2916; GFX10:       ; %bb.0:
2917; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2918; GFX10-NEXT:    flat_load_dword v4, v[0:1]
2919; GFX10-NEXT:    s_mov_b32 s4, 0
2920; GFX10-NEXT:  .LBB15_1: ; %atomicrmw.start
2921; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
2922; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2923; GFX10-NEXT:    v_add_f32_e32 v3, v4, v2
2924; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2925; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
2926; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2927; GFX10-NEXT:    buffer_gl1_inv
2928; GFX10-NEXT:    buffer_gl0_inv
2929; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
2930; GFX10-NEXT:    v_mov_b32_e32 v4, v3
2931; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
2932; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
2933; GFX10-NEXT:    s_cbranch_execnz .LBB15_1
2934; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
2935; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2936; GFX10-NEXT:    s_setpc_b64 s[30:31]
2937;
2938; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory:
2939; GFX90A:       ; %bb.0:
2940; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2941; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
2942; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
2943; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2944; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
2945; GFX90A-NEXT:    s_cbranch_execnz .LBB15_3
2946; GFX90A-NEXT:  ; %bb.1: ; %Flow2
2947; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
2948; GFX90A-NEXT:    s_cbranch_execnz .LBB15_8
2949; GFX90A-NEXT:  .LBB15_2: ; %atomicrmw.phi
2950; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
2951; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2952; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2953; GFX90A-NEXT:  .LBB15_3: ; %atomicrmw.check.private
2954; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
2955; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v1
2956; GFX90A-NEXT:    s_and_saveexec_b64 s[6:7], vcc
2957; GFX90A-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
2958; GFX90A-NEXT:    s_cbranch_execz .LBB15_5
2959; GFX90A-NEXT:  ; %bb.4: ; %atomicrmw.global
2960; GFX90A-NEXT:    global_atomic_add_f32 v[0:1], v2, off
2961; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2962; GFX90A-NEXT:    buffer_wbinvl1
2963; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
2964; GFX90A-NEXT:    ; implicit-def: $vgpr2
2965; GFX90A-NEXT:  .LBB15_5: ; %Flow
2966; GFX90A-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
2967; GFX90A-NEXT:    s_cbranch_execz .LBB15_7
2968; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.private
2969; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
2970; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
2971; GFX90A-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
2972; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2973; GFX90A-NEXT:    v_add_f32_e32 v1, v1, v2
2974; GFX90A-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
2975; GFX90A-NEXT:  .LBB15_7: ; %Flow1
2976; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
2977; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
2978; GFX90A-NEXT:    ; implicit-def: $vgpr2
2979; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
2980; GFX90A-NEXT:    s_cbranch_execz .LBB15_2
2981; GFX90A-NEXT:  .LBB15_8: ; %atomicrmw.shared
2982; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
2983; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
2984; GFX90A-NEXT:    ds_add_f32 v0, v2
2985; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
2986; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
2987; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2988; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2989;
2990; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory:
2991; GFX908:       ; %bb.0:
2992; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2993; GFX908-NEXT:    s_mov_b64 s[4:5], src_shared_base
2994; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
2995; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2996; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
2997; GFX908-NEXT:    s_cbranch_execnz .LBB15_3
2998; GFX908-NEXT:  ; %bb.1: ; %Flow2
2999; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3000; GFX908-NEXT:    s_cbranch_execnz .LBB15_8
3001; GFX908-NEXT:  .LBB15_2: ; %atomicrmw.phi
3002; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
3003; GFX908-NEXT:    s_waitcnt vmcnt(0)
3004; GFX908-NEXT:    s_setpc_b64 s[30:31]
3005; GFX908-NEXT:  .LBB15_3: ; %atomicrmw.check.private
3006; GFX908-NEXT:    s_mov_b64 s[6:7], src_private_base
3007; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v1
3008; GFX908-NEXT:    s_and_saveexec_b64 s[6:7], vcc
3009; GFX908-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
3010; GFX908-NEXT:    s_cbranch_execz .LBB15_5
3011; GFX908-NEXT:  ; %bb.4: ; %atomicrmw.global
3012; GFX908-NEXT:    global_atomic_add_f32 v[0:1], v2, off
3013; GFX908-NEXT:    s_waitcnt vmcnt(0)
3014; GFX908-NEXT:    buffer_wbinvl1
3015; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
3016; GFX908-NEXT:    ; implicit-def: $vgpr2
3017; GFX908-NEXT:  .LBB15_5: ; %Flow
3018; GFX908-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
3019; GFX908-NEXT:    s_cbranch_execz .LBB15_7
3020; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.private
3021; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
3022; GFX908-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
3023; GFX908-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
3024; GFX908-NEXT:    s_waitcnt vmcnt(0)
3025; GFX908-NEXT:    v_add_f32_e32 v1, v1, v2
3026; GFX908-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
3027; GFX908-NEXT:  .LBB15_7: ; %Flow1
3028; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
3029; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
3030; GFX908-NEXT:    ; implicit-def: $vgpr2
3031; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3032; GFX908-NEXT:    s_cbranch_execz .LBB15_2
3033; GFX908-NEXT:  .LBB15_8: ; %atomicrmw.shared
3034; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
3035; GFX908-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
3036; GFX908-NEXT:    ds_add_f32 v0, v2
3037; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
3038; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
3039; GFX908-NEXT:    s_waitcnt vmcnt(0)
3040; GFX908-NEXT:    s_setpc_b64 s[30:31]
3041;
3042; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory:
3043; GFX8:       ; %bb.0:
3044; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3045; GFX8-NEXT:    flat_load_dword v4, v[0:1]
3046; GFX8-NEXT:    s_mov_b64 s[4:5], 0
3047; GFX8-NEXT:  .LBB15_1: ; %atomicrmw.start
3048; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
3049; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3050; GFX8-NEXT:    v_add_f32_e32 v3, v4, v2
3051; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
3052; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3053; GFX8-NEXT:    buffer_wbinvl1
3054; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
3055; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3056; GFX8-NEXT:    v_mov_b32_e32 v4, v3
3057; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3058; GFX8-NEXT:    s_cbranch_execnz .LBB15_1
3059; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
3060; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
3061; GFX8-NEXT:    s_setpc_b64 s[30:31]
3062;
3063; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory:
3064; GFX7:       ; %bb.0:
3065; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3066; GFX7-NEXT:    flat_load_dword v4, v[0:1]
3067; GFX7-NEXT:    s_mov_b64 s[4:5], 0
3068; GFX7-NEXT:  .LBB15_1: ; %atomicrmw.start
3069; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
3070; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3071; GFX7-NEXT:    v_add_f32_e32 v3, v4, v2
3072; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
3073; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3074; GFX7-NEXT:    buffer_wbinvl1
3075; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
3076; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3077; GFX7-NEXT:    v_mov_b32_e32 v4, v3
3078; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3079; GFX7-NEXT:    s_cbranch_execnz .LBB15_1
3080; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
3081; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
3082; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; %unused is never read: this is the no-return variant, and on the targets
  ; that emit a native fadd the expectations above show the instruction forms
  ; without a destination register.
3083  %unused = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
3084  ret void
3085}
3086
3087define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #1 {
3088; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
3089; GFX12:       ; %bb.0:
3090; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
3091; GFX12-NEXT:    s_wait_expcnt 0x0
3092; GFX12-NEXT:    s_wait_samplecnt 0x0
3093; GFX12-NEXT:    s_wait_bvhcnt 0x0
3094; GFX12-NEXT:    s_wait_kmcnt 0x0
3095; GFX12-NEXT:    s_wait_storecnt 0x0
3096; GFX12-NEXT:    flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV
3097; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
3098; GFX12-NEXT:    global_inv scope:SCOPE_DEV
3099; GFX12-NEXT:    s_setpc_b64 s[30:31]
3100;
3101; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
3102; GFX940:       ; %bb.0:
3103; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3104; GFX940-NEXT:    buffer_wbl2 sc1
3105; GFX940-NEXT:    flat_atomic_add_f32 v[0:1], v2 offset:2044
3106; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3107; GFX940-NEXT:    buffer_inv sc1
3108; GFX940-NEXT:    s_setpc_b64 s[30:31]
3109;
3110; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
3111; GFX11:       ; %bb.0:
3112; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3113; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3114; GFX11-NEXT:    flat_atomic_add_f32 v[0:1], v2 offset:2044
3115; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3116; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3117; GFX11-NEXT:    buffer_gl1_inv
3118; GFX11-NEXT:    buffer_gl0_inv
3119; GFX11-NEXT:    s_setpc_b64 s[30:31]
3120;
3121; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
3122; GFX10:       ; %bb.0:
3123; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3124; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
3125; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
3126; GFX10-NEXT:    s_mov_b32 s4, 0
3127; GFX10-NEXT:    flat_load_dword v4, v[0:1]
3128; GFX10-NEXT:  .LBB16_1: ; %atomicrmw.start
3129; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
3130; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3131; GFX10-NEXT:    v_add_f32_e32 v3, v4, v2
3132; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3133; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
3134; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3135; GFX10-NEXT:    buffer_gl1_inv
3136; GFX10-NEXT:    buffer_gl0_inv
3137; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
3138; GFX10-NEXT:    v_mov_b32_e32 v4, v3
3139; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
3140; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
3141; GFX10-NEXT:    s_cbranch_execnz .LBB16_1
3142; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
3143; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3144; GFX10-NEXT:    s_setpc_b64 s[30:31]
3145;
3146; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
3147; GFX90A:       ; %bb.0:
3148; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3149; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7fc, v0
3150; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
3151; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
3152; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
3153; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3154; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
3155; GFX90A-NEXT:    s_cbranch_execnz .LBB16_3
3156; GFX90A-NEXT:  ; %bb.1: ; %Flow2
3157; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3158; GFX90A-NEXT:    s_cbranch_execnz .LBB16_8
3159; GFX90A-NEXT:  .LBB16_2: ; %atomicrmw.phi
3160; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
3161; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3162; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3163; GFX90A-NEXT:  .LBB16_3: ; %atomicrmw.check.private
3164; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
3165; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v1
3166; GFX90A-NEXT:    s_and_saveexec_b64 s[6:7], vcc
3167; GFX90A-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
3168; GFX90A-NEXT:    s_cbranch_execz .LBB16_5
3169; GFX90A-NEXT:  ; %bb.4: ; %atomicrmw.global
3170; GFX90A-NEXT:    global_atomic_add_f32 v[0:1], v2, off
3171; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3172; GFX90A-NEXT:    buffer_wbinvl1
3173; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
3174; GFX90A-NEXT:    ; implicit-def: $vgpr2
3175; GFX90A-NEXT:  .LBB16_5: ; %Flow
3176; GFX90A-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
3177; GFX90A-NEXT:    s_cbranch_execz .LBB16_7
3178; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.private
3179; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
3180; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
3181; GFX90A-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
3182; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3183; GFX90A-NEXT:    v_add_f32_e32 v1, v1, v2
3184; GFX90A-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
3185; GFX90A-NEXT:  .LBB16_7: ; %Flow1
3186; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
3187; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
3188; GFX90A-NEXT:    ; implicit-def: $vgpr2
3189; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3190; GFX90A-NEXT:    s_cbranch_execz .LBB16_2
3191; GFX90A-NEXT:  .LBB16_8: ; %atomicrmw.shared
3192; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
3193; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
3194; GFX90A-NEXT:    ds_add_f32 v0, v2
3195; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
3196; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
3197; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3198; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3199;
3200; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
3201; GFX908:       ; %bb.0:
3202; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3203; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7fc, v0
3204; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
3205; GFX908-NEXT:    s_mov_b64 s[4:5], src_shared_base
3206; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
3207; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3208; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
3209; GFX908-NEXT:    s_cbranch_execnz .LBB16_3
3210; GFX908-NEXT:  ; %bb.1: ; %Flow2
3211; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3212; GFX908-NEXT:    s_cbranch_execnz .LBB16_8
3213; GFX908-NEXT:  .LBB16_2: ; %atomicrmw.phi
3214; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
3215; GFX908-NEXT:    s_waitcnt vmcnt(0)
3216; GFX908-NEXT:    s_setpc_b64 s[30:31]
3217; GFX908-NEXT:  .LBB16_3: ; %atomicrmw.check.private
3218; GFX908-NEXT:    s_mov_b64 s[6:7], src_private_base
3219; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v1
3220; GFX908-NEXT:    s_and_saveexec_b64 s[6:7], vcc
3221; GFX908-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
3222; GFX908-NEXT:    s_cbranch_execz .LBB16_5
3223; GFX908-NEXT:  ; %bb.4: ; %atomicrmw.global
3224; GFX908-NEXT:    global_atomic_add_f32 v[0:1], v2, off
3225; GFX908-NEXT:    s_waitcnt vmcnt(0)
3226; GFX908-NEXT:    buffer_wbinvl1
3227; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
3228; GFX908-NEXT:    ; implicit-def: $vgpr2
3229; GFX908-NEXT:  .LBB16_5: ; %Flow
3230; GFX908-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
3231; GFX908-NEXT:    s_cbranch_execz .LBB16_7
3232; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.private
3233; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
3234; GFX908-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
3235; GFX908-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
3236; GFX908-NEXT:    s_waitcnt vmcnt(0)
3237; GFX908-NEXT:    v_add_f32_e32 v1, v1, v2
3238; GFX908-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
3239; GFX908-NEXT:  .LBB16_7: ; %Flow1
3240; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
3241; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
3242; GFX908-NEXT:    ; implicit-def: $vgpr2
3243; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3244; GFX908-NEXT:    s_cbranch_execz .LBB16_2
3245; GFX908-NEXT:  .LBB16_8: ; %atomicrmw.shared
3246; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
3247; GFX908-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
3248; GFX908-NEXT:    ds_add_f32 v0, v2
3249; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
3250; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
3251; GFX908-NEXT:    s_waitcnt vmcnt(0)
3252; GFX908-NEXT:    s_setpc_b64 s[30:31]
3253;
3254; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
3255; GFX8:       ; %bb.0:
3256; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3257; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
3258; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3259; GFX8-NEXT:    flat_load_dword v4, v[0:1]
3260; GFX8-NEXT:    s_mov_b64 s[4:5], 0
3261; GFX8-NEXT:  .LBB16_1: ; %atomicrmw.start
3262; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
3263; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3264; GFX8-NEXT:    v_add_f32_e32 v3, v4, v2
3265; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
3266; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3267; GFX8-NEXT:    buffer_wbinvl1
3268; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
3269; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3270; GFX8-NEXT:    v_mov_b32_e32 v4, v3
3271; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3272; GFX8-NEXT:    s_cbranch_execnz .LBB16_1
3273; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
3274; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
3275; GFX8-NEXT:    s_setpc_b64 s[30:31]
3276;
3277; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
3278; GFX7:       ; %bb.0:
3279; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3280; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
3281; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3282; GFX7-NEXT:    flat_load_dword v4, v[0:1]
3283; GFX7-NEXT:    s_mov_b64 s[4:5], 0
3284; GFX7-NEXT:  .LBB16_1: ; %atomicrmw.start
3285; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
3286; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3287; GFX7-NEXT:    v_add_f32_e32 v3, v4, v2
3288; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
3289; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3290; GFX7-NEXT:    buffer_wbinvl1
3291; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
3292; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3293; GFX7-NEXT:    v_mov_b32_e32 v4, v3
3294; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3295; GFX7-NEXT:    s_cbranch_execnz .LBB16_1
3296; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
3297; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
3298; GFX7-NEXT:    s_setpc_b64 s[30:31]
3299  %gep = getelementptr float, ptr %ptr, i64 511
3300  %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
3301  ret void
3302}
3303
; Non-returning flat `atomicrmw fadd` of a float at agent scope (seq_cst), tagged
; with !amdgpu.no.fine.grained.memory, addressed at %ptr - 512 floats.
; The check lines below are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py instead of editing
; them by hand.
; Lowering pinned by the checks, per -mcpu from the RUN lines:
;   gfx1200         - one flat_atomic_add_f32 with the offset folded (offset:-2048).
;   gfx940, gfx1100 - 64-bit add of 0xfffff800 / -1 to the pointer, then one
;                     flat_atomic_add_f32 with no immediate offset.
;   gfx1010, tonga, hawaii - flat_load_dword followed by a flat_atomic_cmpswap
;                     retry loop.
;   gfx90a, gfx908  - compare the pointer's high dword against src_shared_base and
;                     src_private_base, then use ds_add_f32, a buffer
;                     load / v_add_f32 / store sequence, or global_atomic_add_f32.
; NOTE(review): attribute group #1 and !0 are defined outside this chunk; the
; `ftz` in the name presumably reflects the denormal mode set by #1 -- confirm.
3304define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #1 {
3305; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
3306; GFX12:       ; %bb.0:
3307; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
3308; GFX12-NEXT:    s_wait_expcnt 0x0
3309; GFX12-NEXT:    s_wait_samplecnt 0x0
3310; GFX12-NEXT:    s_wait_bvhcnt 0x0
3311; GFX12-NEXT:    s_wait_kmcnt 0x0
3312; GFX12-NEXT:    s_wait_storecnt 0x0
3313; GFX12-NEXT:    flat_atomic_add_f32 v[0:1], v2 offset:-2048 scope:SCOPE_DEV
3314; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
3315; GFX12-NEXT:    global_inv scope:SCOPE_DEV
3316; GFX12-NEXT:    s_setpc_b64 s[30:31]
3317;
3318; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
3319; GFX940:       ; %bb.0:
3320; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3321; GFX940-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
3322; GFX940-NEXT:    s_nop 1
3323; GFX940-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
3324; GFX940-NEXT:    buffer_wbl2 sc1
3325; GFX940-NEXT:    flat_atomic_add_f32 v[0:1], v2
3326; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3327; GFX940-NEXT:    buffer_inv sc1
3328; GFX940-NEXT:    s_setpc_b64 s[30:31]
3329;
3330; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
3331; GFX11:       ; %bb.0:
3332; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3333; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
3334; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
3335; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3336; GFX11-NEXT:    flat_atomic_add_f32 v[0:1], v2
3337; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3338; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3339; GFX11-NEXT:    buffer_gl1_inv
3340; GFX11-NEXT:    buffer_gl0_inv
3341; GFX11-NEXT:    s_setpc_b64 s[30:31]
3342;
3343; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
3344; GFX10:       ; %bb.0:
3345; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3346; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
3347; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
3348; GFX10-NEXT:    s_mov_b32 s4, 0
3349; GFX10-NEXT:    flat_load_dword v4, v[0:1]
3350; GFX10-NEXT:  .LBB17_1: ; %atomicrmw.start
3351; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
3352; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3353; GFX10-NEXT:    v_add_f32_e32 v3, v4, v2
3354; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3355; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
3356; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3357; GFX10-NEXT:    buffer_gl1_inv
3358; GFX10-NEXT:    buffer_gl0_inv
3359; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
3360; GFX10-NEXT:    v_mov_b32_e32 v4, v3
3361; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
3362; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
3363; GFX10-NEXT:    s_cbranch_execnz .LBB17_1
3364; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
3365; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3366; GFX10-NEXT:    s_setpc_b64 s[30:31]
3367;
3368; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
3369; GFX90A:       ; %bb.0:
3370; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3371; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
3372; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
3373; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
3374; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
3375; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3376; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
3377; GFX90A-NEXT:    s_cbranch_execnz .LBB17_3
3378; GFX90A-NEXT:  ; %bb.1: ; %Flow2
3379; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3380; GFX90A-NEXT:    s_cbranch_execnz .LBB17_8
3381; GFX90A-NEXT:  .LBB17_2: ; %atomicrmw.phi
3382; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
3383; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3384; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3385; GFX90A-NEXT:  .LBB17_3: ; %atomicrmw.check.private
3386; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
3387; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v1
3388; GFX90A-NEXT:    s_and_saveexec_b64 s[6:7], vcc
3389; GFX90A-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
3390; GFX90A-NEXT:    s_cbranch_execz .LBB17_5
3391; GFX90A-NEXT:  ; %bb.4: ; %atomicrmw.global
3392; GFX90A-NEXT:    global_atomic_add_f32 v[0:1], v2, off
3393; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3394; GFX90A-NEXT:    buffer_wbinvl1
3395; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
3396; GFX90A-NEXT:    ; implicit-def: $vgpr2
3397; GFX90A-NEXT:  .LBB17_5: ; %Flow
3398; GFX90A-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
3399; GFX90A-NEXT:    s_cbranch_execz .LBB17_7
3400; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.private
3401; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
3402; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
3403; GFX90A-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
3404; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3405; GFX90A-NEXT:    v_add_f32_e32 v1, v1, v2
3406; GFX90A-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
3407; GFX90A-NEXT:  .LBB17_7: ; %Flow1
3408; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
3409; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
3410; GFX90A-NEXT:    ; implicit-def: $vgpr2
3411; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3412; GFX90A-NEXT:    s_cbranch_execz .LBB17_2
3413; GFX90A-NEXT:  .LBB17_8: ; %atomicrmw.shared
3414; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
3415; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
3416; GFX90A-NEXT:    ds_add_f32 v0, v2
3417; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
3418; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
3419; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3420; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3421;
3422; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
3423; GFX908:       ; %bb.0:
3424; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3425; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
3426; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
3427; GFX908-NEXT:    s_mov_b64 s[4:5], src_shared_base
3428; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
3429; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3430; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
3431; GFX908-NEXT:    s_cbranch_execnz .LBB17_3
3432; GFX908-NEXT:  ; %bb.1: ; %Flow2
3433; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3434; GFX908-NEXT:    s_cbranch_execnz .LBB17_8
3435; GFX908-NEXT:  .LBB17_2: ; %atomicrmw.phi
3436; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
3437; GFX908-NEXT:    s_waitcnt vmcnt(0)
3438; GFX908-NEXT:    s_setpc_b64 s[30:31]
3439; GFX908-NEXT:  .LBB17_3: ; %atomicrmw.check.private
3440; GFX908-NEXT:    s_mov_b64 s[6:7], src_private_base
3441; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v1
3442; GFX908-NEXT:    s_and_saveexec_b64 s[6:7], vcc
3443; GFX908-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
3444; GFX908-NEXT:    s_cbranch_execz .LBB17_5
3445; GFX908-NEXT:  ; %bb.4: ; %atomicrmw.global
3446; GFX908-NEXT:    global_atomic_add_f32 v[0:1], v2, off
3447; GFX908-NEXT:    s_waitcnt vmcnt(0)
3448; GFX908-NEXT:    buffer_wbinvl1
3449; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
3450; GFX908-NEXT:    ; implicit-def: $vgpr2
3451; GFX908-NEXT:  .LBB17_5: ; %Flow
3452; GFX908-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
3453; GFX908-NEXT:    s_cbranch_execz .LBB17_7
3454; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.private
3455; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
3456; GFX908-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
3457; GFX908-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
3458; GFX908-NEXT:    s_waitcnt vmcnt(0)
3459; GFX908-NEXT:    v_add_f32_e32 v1, v1, v2
3460; GFX908-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
3461; GFX908-NEXT:  .LBB17_7: ; %Flow1
3462; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
3463; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
3464; GFX908-NEXT:    ; implicit-def: $vgpr2
3465; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3466; GFX908-NEXT:    s_cbranch_execz .LBB17_2
3467; GFX908-NEXT:  .LBB17_8: ; %atomicrmw.shared
3468; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
3469; GFX908-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
3470; GFX908-NEXT:    ds_add_f32 v0, v2
3471; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
3472; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
3473; GFX908-NEXT:    s_waitcnt vmcnt(0)
3474; GFX908-NEXT:    s_setpc_b64 s[30:31]
3475;
3476; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
3477; GFX8:       ; %bb.0:
3478; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3479; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
3480; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
3481; GFX8-NEXT:    flat_load_dword v4, v[0:1]
3482; GFX8-NEXT:    s_mov_b64 s[4:5], 0
3483; GFX8-NEXT:  .LBB17_1: ; %atomicrmw.start
3484; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
3485; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3486; GFX8-NEXT:    v_add_f32_e32 v3, v4, v2
3487; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
3488; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3489; GFX8-NEXT:    buffer_wbinvl1
3490; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
3491; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3492; GFX8-NEXT:    v_mov_b32_e32 v4, v3
3493; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3494; GFX8-NEXT:    s_cbranch_execnz .LBB17_1
3495; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
3496; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
3497; GFX8-NEXT:    s_setpc_b64 s[30:31]
3498;
3499; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
3500; GFX7:       ; %bb.0:
3501; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3502; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0xfffff800, v0
3503; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
3504; GFX7-NEXT:    flat_load_dword v4, v[0:1]
3505; GFX7-NEXT:    s_mov_b64 s[4:5], 0
3506; GFX7-NEXT:  .LBB17_1: ; %atomicrmw.start
3507; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
3508; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3509; GFX7-NEXT:    v_add_f32_e32 v3, v4, v2
3510; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
3511; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3512; GFX7-NEXT:    buffer_wbinvl1
3513; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
3514; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3515; GFX7-NEXT:    v_mov_b32_e32 v4, v3
3516; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3517; GFX7-NEXT:    s_cbranch_execnz .LBB17_1
3518; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
3519; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
3520; GFX7-NEXT:    s_setpc_b64 s[30:31]
; Element index -512 on a 4-byte float is a byte offset of -2048, which shows up
; in the checks as offset:-2048 or as the 0xfffff800 low-dword add immediate.
3521  %gep = getelementptr float, ptr %ptr, i64 -512
; The fetched value is deliberately unused so the non-returning form is selected.
3522  %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
3523  ret void
3524}
3525
; Returning flat `atomicrmw fadd` of a float, seq_cst with no syncscope given
; (the checks show system-scope cache handling, e.g. scope:SCOPE_SYS on gfx1200),
; tagged with !amdgpu.no.fine.grained.memory, addressed at %ptr + 511 floats.
; The value fetched by the atomic is the function's return value.
; The check lines below are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py instead of editing
; them by hand.
; Lowering pinned by the checks, per -mcpu from the RUN lines:
;   gfx1200, gfx940, gfx1100 - one returning flat_atomic_add_f32 with the offset
;                     folded (offset:2044).
;   gfx908          - flat_load_dword + flat_atomic_cmpswap retry loop, both
;                     using offset:2044.
;   gfx1010, tonga, hawaii - explicit 64-bit add of 0x7fc to the pointer, then a
;                     flat_load_dword + flat_atomic_cmpswap retry loop.
;   gfx90a          - compare the pointer's high dword against src_shared_base and
;                     src_private_base, then use ds_add_rtn_f32, a buffer
;                     load / v_add_f32 / store sequence, or a returning
;                     global_atomic_add_f32.
; NOTE(review): attribute group #1 and !0 are defined outside this chunk; the
; `ftz` in the name presumably reflects the denormal mode set by #1 -- confirm.
3526define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #1 {
3527; GFX12-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
3528; GFX12:       ; %bb.0:
3529; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
3530; GFX12-NEXT:    s_wait_expcnt 0x0
3531; GFX12-NEXT:    s_wait_samplecnt 0x0
3532; GFX12-NEXT:    s_wait_bvhcnt 0x0
3533; GFX12-NEXT:    s_wait_kmcnt 0x0
3534; GFX12-NEXT:    global_wb scope:SCOPE_SYS
3535; GFX12-NEXT:    s_wait_storecnt 0x0
3536; GFX12-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
3537; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
3538; GFX12-NEXT:    global_inv scope:SCOPE_SYS
3539; GFX12-NEXT:    s_setpc_b64 s[30:31]
3540;
3541; GFX940-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
3542; GFX940:       ; %bb.0:
3543; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3544; GFX940-NEXT:    buffer_wbl2 sc0 sc1
3545; GFX940-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 sc0 sc1
3546; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3547; GFX940-NEXT:    buffer_inv sc0 sc1
3548; GFX940-NEXT:    s_setpc_b64 s[30:31]
3549;
3550; GFX11-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
3551; GFX11:       ; %bb.0:
3552; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3553; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3554; GFX11-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 glc
3555; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3556; GFX11-NEXT:    buffer_gl1_inv
3557; GFX11-NEXT:    buffer_gl0_inv
3558; GFX11-NEXT:    s_setpc_b64 s[30:31]
3559;
3560; GFX10-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
3561; GFX10:       ; %bb.0:
3562; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3563; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fc, v0
3564; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
3565; GFX10-NEXT:    s_mov_b32 s4, 0
3566; GFX10-NEXT:    flat_load_dword v0, v[3:4]
3567; GFX10-NEXT:  .LBB18_1: ; %atomicrmw.start
3568; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
3569; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3570; GFX10-NEXT:    v_mov_b32_e32 v1, v0
3571; GFX10-NEXT:    v_add_f32_e32 v0, v1, v2
3572; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3573; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
3574; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3575; GFX10-NEXT:    buffer_gl1_inv
3576; GFX10-NEXT:    buffer_gl0_inv
3577; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
3578; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
3579; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
3580; GFX10-NEXT:    s_cbranch_execnz .LBB18_1
3581; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
3582; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3583; GFX10-NEXT:    s_setpc_b64 s[30:31]
3584;
3585; GFX90A-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
3586; GFX90A:       ; %bb.0:
3587; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3588; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fc, v0
3589; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
3590; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
3591; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
3592; GFX90A-NEXT:    ; implicit-def: $vgpr0
3593; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3594; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
3595; GFX90A-NEXT:    s_cbranch_execnz .LBB18_3
3596; GFX90A-NEXT:  ; %bb.1: ; %Flow2
3597; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3598; GFX90A-NEXT:    s_cbranch_execnz .LBB18_8
3599; GFX90A-NEXT:  .LBB18_2: ; %atomicrmw.phi
3600; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
3601; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3602; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3603; GFX90A-NEXT:  .LBB18_3: ; %atomicrmw.check.private
3604; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
3605; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v5
3606; GFX90A-NEXT:    ; implicit-def: $vgpr0
3607; GFX90A-NEXT:    s_and_saveexec_b64 s[6:7], vcc
3608; GFX90A-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
3609; GFX90A-NEXT:    s_cbranch_execz .LBB18_5
3610; GFX90A-NEXT:  ; %bb.4: ; %atomicrmw.global
3611; GFX90A-NEXT:    buffer_wbl2
3612; GFX90A-NEXT:    global_atomic_add_f32 v0, v[4:5], v2, off glc
3613; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3614; GFX90A-NEXT:    buffer_invl2
3615; GFX90A-NEXT:    buffer_wbinvl1
3616; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
3617; GFX90A-NEXT:    ; implicit-def: $vgpr2
3618; GFX90A-NEXT:  .LBB18_5: ; %Flow
3619; GFX90A-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
3620; GFX90A-NEXT:    s_cbranch_execz .LBB18_7
3621; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.private
3622; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
3623; GFX90A-NEXT:    v_cndmask_b32_e32 v1, -1, v4, vcc
3624; GFX90A-NEXT:    buffer_load_dword v0, v1, s[0:3], 0 offen
3625; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3626; GFX90A-NEXT:    v_add_f32_e32 v2, v0, v2
3627; GFX90A-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
3628; GFX90A-NEXT:  .LBB18_7: ; %Flow1
3629; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
3630; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
3631; GFX90A-NEXT:    ; implicit-def: $vgpr2
3632; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3633; GFX90A-NEXT:    s_cbranch_execz .LBB18_2
3634; GFX90A-NEXT:  .LBB18_8: ; %atomicrmw.shared
3635; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
3636; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v4, vcc
3637; GFX90A-NEXT:    ds_add_rtn_f32 v0, v0, v2
3638; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
3639; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
3640; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3641; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3642;
3643; GFX908-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
3644; GFX908:       ; %bb.0:
3645; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3646; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2044
3647; GFX908-NEXT:    s_mov_b64 s[4:5], 0
3648; GFX908-NEXT:  .LBB18_1: ; %atomicrmw.start
3649; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
3650; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3651; GFX908-NEXT:    v_mov_b32_e32 v4, v3
3652; GFX908-NEXT:    v_add_f32_e32 v3, v4, v2
3653; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
3654; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3655; GFX908-NEXT:    buffer_wbinvl1
3656; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
3657; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3658; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3659; GFX908-NEXT:    s_cbranch_execnz .LBB18_1
3660; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
3661; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
3662; GFX908-NEXT:    v_mov_b32_e32 v0, v3
3663; GFX908-NEXT:    s_setpc_b64 s[30:31]
3664;
3665; GFX8-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
3666; GFX8:       ; %bb.0:
3667; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3668; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
3669; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
3670; GFX8-NEXT:    flat_load_dword v0, v[3:4]
3671; GFX8-NEXT:    s_mov_b64 s[4:5], 0
3672; GFX8-NEXT:  .LBB18_1: ; %atomicrmw.start
3673; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
3674; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3675; GFX8-NEXT:    v_mov_b32_e32 v1, v0
3676; GFX8-NEXT:    v_add_f32_e32 v0, v1, v2
3677; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
3678; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3679; GFX8-NEXT:    buffer_wbinvl1
3680; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
3681; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3682; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3683; GFX8-NEXT:    s_cbranch_execnz .LBB18_1
3684; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
3685; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
3686; GFX8-NEXT:    s_setpc_b64 s[30:31]
3687;
3688; GFX7-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
3689; GFX7:       ; %bb.0:
3690; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3691; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 0x7fc, v0
3692; GFX7-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
3693; GFX7-NEXT:    flat_load_dword v0, v[3:4]
3694; GFX7-NEXT:    s_mov_b64 s[4:5], 0
3695; GFX7-NEXT:  .LBB18_1: ; %atomicrmw.start
3696; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
3697; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3698; GFX7-NEXT:    v_mov_b32_e32 v1, v0
3699; GFX7-NEXT:    v_add_f32_e32 v0, v1, v2
3700; GFX7-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
3701; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3702; GFX7-NEXT:    buffer_wbinvl1
3703; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
3704; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3705; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3706; GFX7-NEXT:    s_cbranch_execnz .LBB18_1
3707; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
3708; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
3709; GFX7-NEXT:    s_setpc_b64 s[30:31]
; Element index 511 on a 4-byte float is a byte offset of 2044 (0x7fc), which
; shows up in the checks as offset:2044 or as the 0x7fc low-dword add immediate.
3710  %gep = getelementptr float, ptr %ptr, i64 511
3711  %result = atomicrmw fadd ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
3712  ret float %result
3713}
3714
; Non-returning flat `atomicrmw fadd` of a float, seq_cst with no syncscope given
; (the checks show system-scope cache handling, e.g. scope:SCOPE_SYS on gfx1200),
; tagged with !amdgpu.no.fine.grained.memory, addressed at %ptr + 511 floats.
; The check lines below are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py instead of editing
; them by hand.
; Lowering pinned by the checks, per -mcpu from the RUN lines:
;   gfx1200, gfx940, gfx1100 - one non-returning flat_atomic_add_f32 with the
;                     offset folded (offset:2044).
;   gfx1010, tonga, hawaii - explicit 64-bit add of 0x7fc to the pointer, then a
;                     flat_load_dword + flat_atomic_cmpswap retry loop.
;   gfx90a, gfx908  - compare the pointer's high dword against src_shared_base and
;                     src_private_base, then use ds_add_f32, a buffer
;                     load / v_add_f32 / store sequence, or global_atomic_add_f32
;                     (gfx90a additionally brackets the global atomic with
;                     buffer_wbl2 / buffer_invl2).
; NOTE(review): attribute group #1 and !0 are defined outside this chunk; the
; `ftz` in the name presumably reflects the denormal mode set by #1 -- confirm.
3715define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #1 {
3716; GFX12-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
3717; GFX12:       ; %bb.0:
3718; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
3719; GFX12-NEXT:    s_wait_expcnt 0x0
3720; GFX12-NEXT:    s_wait_samplecnt 0x0
3721; GFX12-NEXT:    s_wait_bvhcnt 0x0
3722; GFX12-NEXT:    s_wait_kmcnt 0x0
3723; GFX12-NEXT:    global_wb scope:SCOPE_SYS
3724; GFX12-NEXT:    s_wait_storecnt 0x0
3725; GFX12-NEXT:    flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_SYS
3726; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
3727; GFX12-NEXT:    global_inv scope:SCOPE_SYS
3728; GFX12-NEXT:    s_setpc_b64 s[30:31]
3729;
3730; GFX940-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
3731; GFX940:       ; %bb.0:
3732; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3733; GFX940-NEXT:    buffer_wbl2 sc0 sc1
3734; GFX940-NEXT:    flat_atomic_add_f32 v[0:1], v2 offset:2044 sc1
3735; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3736; GFX940-NEXT:    buffer_inv sc0 sc1
3737; GFX940-NEXT:    s_setpc_b64 s[30:31]
3738;
3739; GFX11-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
3740; GFX11:       ; %bb.0:
3741; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3742; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3743; GFX11-NEXT:    flat_atomic_add_f32 v[0:1], v2 offset:2044
3744; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3745; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3746; GFX11-NEXT:    buffer_gl1_inv
3747; GFX11-NEXT:    buffer_gl0_inv
3748; GFX11-NEXT:    s_setpc_b64 s[30:31]
3749;
3750; GFX10-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
3751; GFX10:       ; %bb.0:
3752; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3753; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
3754; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
3755; GFX10-NEXT:    s_mov_b32 s4, 0
3756; GFX10-NEXT:    flat_load_dword v4, v[0:1]
3757; GFX10-NEXT:  .LBB19_1: ; %atomicrmw.start
3758; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
3759; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3760; GFX10-NEXT:    v_add_f32_e32 v3, v4, v2
3761; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3762; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
3763; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3764; GFX10-NEXT:    buffer_gl1_inv
3765; GFX10-NEXT:    buffer_gl0_inv
3766; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
3767; GFX10-NEXT:    v_mov_b32_e32 v4, v3
3768; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
3769; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
3770; GFX10-NEXT:    s_cbranch_execnz .LBB19_1
3771; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
3772; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3773; GFX10-NEXT:    s_setpc_b64 s[30:31]
3774;
3775; GFX90A-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
3776; GFX90A:       ; %bb.0:
3777; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3778; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7fc, v0
3779; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
3780; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
3781; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
3782; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3783; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
3784; GFX90A-NEXT:    s_cbranch_execnz .LBB19_3
3785; GFX90A-NEXT:  ; %bb.1: ; %Flow2
3786; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3787; GFX90A-NEXT:    s_cbranch_execnz .LBB19_8
3788; GFX90A-NEXT:  .LBB19_2: ; %atomicrmw.phi
3789; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
3790; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3791; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3792; GFX90A-NEXT:  .LBB19_3: ; %atomicrmw.check.private
3793; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
3794; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v1
3795; GFX90A-NEXT:    s_and_saveexec_b64 s[6:7], vcc
3796; GFX90A-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
3797; GFX90A-NEXT:    s_cbranch_execz .LBB19_5
3798; GFX90A-NEXT:  ; %bb.4: ; %atomicrmw.global
3799; GFX90A-NEXT:    buffer_wbl2
3800; GFX90A-NEXT:    global_atomic_add_f32 v[0:1], v2, off
3801; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3802; GFX90A-NEXT:    buffer_invl2
3803; GFX90A-NEXT:    buffer_wbinvl1
3804; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
3805; GFX90A-NEXT:    ; implicit-def: $vgpr2
3806; GFX90A-NEXT:  .LBB19_5: ; %Flow
3807; GFX90A-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
3808; GFX90A-NEXT:    s_cbranch_execz .LBB19_7
3809; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.private
3810; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
3811; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
3812; GFX90A-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
3813; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3814; GFX90A-NEXT:    v_add_f32_e32 v1, v1, v2
3815; GFX90A-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
3816; GFX90A-NEXT:  .LBB19_7: ; %Flow1
3817; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
3818; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
3819; GFX90A-NEXT:    ; implicit-def: $vgpr2
3820; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3821; GFX90A-NEXT:    s_cbranch_execz .LBB19_2
3822; GFX90A-NEXT:  .LBB19_8: ; %atomicrmw.shared
3823; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
3824; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
3825; GFX90A-NEXT:    ds_add_f32 v0, v2
3826; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
3827; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
3828; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3829; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3830;
3831; GFX908-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
3832; GFX908:       ; %bb.0:
3833; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3834; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7fc, v0
3835; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
3836; GFX908-NEXT:    s_mov_b64 s[4:5], src_shared_base
3837; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
3838; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3839; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
3840; GFX908-NEXT:    s_cbranch_execnz .LBB19_3
3841; GFX908-NEXT:  ; %bb.1: ; %Flow2
3842; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3843; GFX908-NEXT:    s_cbranch_execnz .LBB19_8
3844; GFX908-NEXT:  .LBB19_2: ; %atomicrmw.phi
3845; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
3846; GFX908-NEXT:    s_waitcnt vmcnt(0)
3847; GFX908-NEXT:    s_setpc_b64 s[30:31]
3848; GFX908-NEXT:  .LBB19_3: ; %atomicrmw.check.private
3849; GFX908-NEXT:    s_mov_b64 s[6:7], src_private_base
3850; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v1
3851; GFX908-NEXT:    s_and_saveexec_b64 s[6:7], vcc
3852; GFX908-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
3853; GFX908-NEXT:    s_cbranch_execz .LBB19_5
3854; GFX908-NEXT:  ; %bb.4: ; %atomicrmw.global
3855; GFX908-NEXT:    global_atomic_add_f32 v[0:1], v2, off
3856; GFX908-NEXT:    s_waitcnt vmcnt(0)
3857; GFX908-NEXT:    buffer_wbinvl1
3858; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
3859; GFX908-NEXT:    ; implicit-def: $vgpr2
3860; GFX908-NEXT:  .LBB19_5: ; %Flow
3861; GFX908-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
3862; GFX908-NEXT:    s_cbranch_execz .LBB19_7
3863; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.private
3864; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
3865; GFX908-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
3866; GFX908-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
3867; GFX908-NEXT:    s_waitcnt vmcnt(0)
3868; GFX908-NEXT:    v_add_f32_e32 v1, v1, v2
3869; GFX908-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
3870; GFX908-NEXT:  .LBB19_7: ; %Flow1
3871; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
3872; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
3873; GFX908-NEXT:    ; implicit-def: $vgpr2
3874; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3875; GFX908-NEXT:    s_cbranch_execz .LBB19_2
3876; GFX908-NEXT:  .LBB19_8: ; %atomicrmw.shared
3877; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
3878; GFX908-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
3879; GFX908-NEXT:    ds_add_f32 v0, v2
3880; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
3881; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
3882; GFX908-NEXT:    s_waitcnt vmcnt(0)
3883; GFX908-NEXT:    s_setpc_b64 s[30:31]
3884;
3885; GFX8-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
3886; GFX8:       ; %bb.0:
3887; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3888; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
3889; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3890; GFX8-NEXT:    flat_load_dword v4, v[0:1]
3891; GFX8-NEXT:    s_mov_b64 s[4:5], 0
3892; GFX8-NEXT:  .LBB19_1: ; %atomicrmw.start
3893; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
3894; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3895; GFX8-NEXT:    v_add_f32_e32 v3, v4, v2
3896; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
3897; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3898; GFX8-NEXT:    buffer_wbinvl1
3899; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
3900; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3901; GFX8-NEXT:    v_mov_b32_e32 v4, v3
3902; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3903; GFX8-NEXT:    s_cbranch_execnz .LBB19_1
3904; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
3905; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
3906; GFX8-NEXT:    s_setpc_b64 s[30:31]
3907;
3908; GFX7-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
3909; GFX7:       ; %bb.0:
3910; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3911; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
3912; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3913; GFX7-NEXT:    flat_load_dword v4, v[0:1]
3914; GFX7-NEXT:    s_mov_b64 s[4:5], 0
3915; GFX7-NEXT:  .LBB19_1: ; %atomicrmw.start
3916; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
3917; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3918; GFX7-NEXT:    v_add_f32_e32 v3, v4, v2
3919; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
3920; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3921; GFX7-NEXT:    buffer_wbinvl1
3922; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
3923; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3924; GFX7-NEXT:    v_mov_b32_e32 v4, v3
3925; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3926; GFX7-NEXT:    s_cbranch_execnz .LBB19_1
3927; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
3928; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
3929; GFX7-NEXT:    s_setpc_b64 s[30:31]
; Element index 511 on a 4-byte float is a byte offset of 2044 (0x7fc), which
; shows up in the checks as offset:2044 or as the 0x7fc low-dword add immediate.
3930  %gep = getelementptr float, ptr %ptr, i64 511
; The fetched value is deliberately unused so the non-returning form is selected.
3931  %unused = atomicrmw fadd ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
3932  ret void
3933}
3934
; Returning f32 atomicrmw fadd (seq_cst) on the float at %ptr + 511 elements
; (byte offset 2044 = 0x7fc in the expected output), tagged with
; !amdgpu.no.fine.grained.memory and !amdgpu.ignore.denormal.mode.
; Expected output: a native flat_atomic_add_f32 on gfx1200, gfx940 and gfx1100;
; a flat_atomic_cmpswap retry loop on gfx1010, gfx908, tonga and hawaii; and on
; gfx90a a branch on the shared/private aperture bases that selects
; ds_add_rtn_f32, a private load/add/store, or global_atomic_add_f32.
; NOTE(review): the name says "agent", but the atomicrmw below has no
; syncscope and the expected output is system scope (SCOPE_SYS on gfx1200,
; sc0 sc1 on gfx940) -- confirm which is intended. Any change to the IR requires
; regenerating the assertions with utils/update_llc_test_checks.py.
3935define float @flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 {
3936; GFX12-LABEL: flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
3937; GFX12:       ; %bb.0:
3938; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
3939; GFX12-NEXT:    s_wait_expcnt 0x0
3940; GFX12-NEXT:    s_wait_samplecnt 0x0
3941; GFX12-NEXT:    s_wait_bvhcnt 0x0
3942; GFX12-NEXT:    s_wait_kmcnt 0x0
3943; GFX12-NEXT:    global_wb scope:SCOPE_SYS
3944; GFX12-NEXT:    s_wait_storecnt 0x0
3945; GFX12-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
3946; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
3947; GFX12-NEXT:    global_inv scope:SCOPE_SYS
3948; GFX12-NEXT:    s_setpc_b64 s[30:31]
3949;
3950; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
3951; GFX940:       ; %bb.0:
3952; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3953; GFX940-NEXT:    buffer_wbl2 sc0 sc1
3954; GFX940-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 sc0 sc1
3955; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3956; GFX940-NEXT:    buffer_inv sc0 sc1
3957; GFX940-NEXT:    s_setpc_b64 s[30:31]
3958;
3959; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
3960; GFX11:       ; %bb.0:
3961; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3962; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3963; GFX11-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 glc
3964; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3965; GFX11-NEXT:    buffer_gl1_inv
3966; GFX11-NEXT:    buffer_gl0_inv
3967; GFX11-NEXT:    s_setpc_b64 s[30:31]
3968;
3969; GFX10-LABEL: flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
3970; GFX10:       ; %bb.0:
3971; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3972; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fc, v0
3973; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
3974; GFX10-NEXT:    s_mov_b32 s4, 0
3975; GFX10-NEXT:    flat_load_dword v0, v[3:4]
3976; GFX10-NEXT:  .LBB20_1: ; %atomicrmw.start
3977; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
3978; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3979; GFX10-NEXT:    v_mov_b32_e32 v1, v0
3980; GFX10-NEXT:    v_add_f32_e32 v0, v1, v2
3981; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3982; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
3983; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3984; GFX10-NEXT:    buffer_gl1_inv
3985; GFX10-NEXT:    buffer_gl0_inv
3986; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
3987; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
3988; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
3989; GFX10-NEXT:    s_cbranch_execnz .LBB20_1
3990; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
3991; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3992; GFX10-NEXT:    s_setpc_b64 s[30:31]
3993;
3994; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
3995; GFX90A:       ; %bb.0:
3996; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3997; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fc, v0
3998; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
3999; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
4000; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
4001; GFX90A-NEXT:    ; implicit-def: $vgpr0
4002; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4003; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
4004; GFX90A-NEXT:    s_cbranch_execnz .LBB20_3
4005; GFX90A-NEXT:  ; %bb.1: ; %Flow2
4006; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
4007; GFX90A-NEXT:    s_cbranch_execnz .LBB20_8
4008; GFX90A-NEXT:  .LBB20_2: ; %atomicrmw.phi
4009; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
4010; GFX90A-NEXT:    s_waitcnt vmcnt(0)
4011; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4012; GFX90A-NEXT:  .LBB20_3: ; %atomicrmw.check.private
4013; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
4014; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v5
4015; GFX90A-NEXT:    ; implicit-def: $vgpr0
4016; GFX90A-NEXT:    s_and_saveexec_b64 s[6:7], vcc
4017; GFX90A-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
4018; GFX90A-NEXT:    s_cbranch_execz .LBB20_5
4019; GFX90A-NEXT:  ; %bb.4: ; %atomicrmw.global
4020; GFX90A-NEXT:    buffer_wbl2
4021; GFX90A-NEXT:    global_atomic_add_f32 v0, v[4:5], v2, off glc
4022; GFX90A-NEXT:    s_waitcnt vmcnt(0)
4023; GFX90A-NEXT:    buffer_invl2
4024; GFX90A-NEXT:    buffer_wbinvl1
4025; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
4026; GFX90A-NEXT:    ; implicit-def: $vgpr2
4027; GFX90A-NEXT:  .LBB20_5: ; %Flow
4028; GFX90A-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
4029; GFX90A-NEXT:    s_cbranch_execz .LBB20_7
4030; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.private
4031; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
4032; GFX90A-NEXT:    v_cndmask_b32_e32 v1, -1, v4, vcc
4033; GFX90A-NEXT:    buffer_load_dword v0, v1, s[0:3], 0 offen
4034; GFX90A-NEXT:    s_waitcnt vmcnt(0)
4035; GFX90A-NEXT:    v_add_f32_e32 v2, v0, v2
4036; GFX90A-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
4037; GFX90A-NEXT:  .LBB20_7: ; %Flow1
4038; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
4039; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
4040; GFX90A-NEXT:    ; implicit-def: $vgpr2
4041; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
4042; GFX90A-NEXT:    s_cbranch_execz .LBB20_2
4043; GFX90A-NEXT:  .LBB20_8: ; %atomicrmw.shared
4044; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
4045; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v4, vcc
4046; GFX90A-NEXT:    ds_add_rtn_f32 v0, v0, v2
4047; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
4048; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
4049; GFX90A-NEXT:    s_waitcnt vmcnt(0)
4050; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4051;
4052; GFX908-LABEL: flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
4053; GFX908:       ; %bb.0:
4054; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4055; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2044
4056; GFX908-NEXT:    s_mov_b64 s[4:5], 0
4057; GFX908-NEXT:  .LBB20_1: ; %atomicrmw.start
4058; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
4059; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4060; GFX908-NEXT:    v_mov_b32_e32 v4, v3
4061; GFX908-NEXT:    v_add_f32_e32 v3, v4, v2
4062; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
4063; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4064; GFX908-NEXT:    buffer_wbinvl1
4065; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
4066; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4067; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4068; GFX908-NEXT:    s_cbranch_execnz .LBB20_1
4069; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
4070; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
4071; GFX908-NEXT:    v_mov_b32_e32 v0, v3
4072; GFX908-NEXT:    s_setpc_b64 s[30:31]
4073;
4074; GFX8-LABEL: flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
4075; GFX8:       ; %bb.0:
4076; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4077; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
4078; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
4079; GFX8-NEXT:    flat_load_dword v0, v[3:4]
4080; GFX8-NEXT:    s_mov_b64 s[4:5], 0
4081; GFX8-NEXT:  .LBB20_1: ; %atomicrmw.start
4082; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
4083; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4084; GFX8-NEXT:    v_mov_b32_e32 v1, v0
4085; GFX8-NEXT:    v_add_f32_e32 v0, v1, v2
4086; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
4087; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4088; GFX8-NEXT:    buffer_wbinvl1
4089; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
4090; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4091; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4092; GFX8-NEXT:    s_cbranch_execnz .LBB20_1
4093; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
4094; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
4095; GFX8-NEXT:    s_setpc_b64 s[30:31]
4096;
4097; GFX7-LABEL: flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
4098; GFX7:       ; %bb.0:
4099; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4100; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 0x7fc, v0
4101; GFX7-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
4102; GFX7-NEXT:    flat_load_dword v0, v[3:4]
4103; GFX7-NEXT:    s_mov_b64 s[4:5], 0
4104; GFX7-NEXT:  .LBB20_1: ; %atomicrmw.start
4105; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
4106; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4107; GFX7-NEXT:    v_mov_b32_e32 v1, v0
4108; GFX7-NEXT:    v_add_f32_e32 v0, v1, v2
4109; GFX7-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
4110; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4111; GFX7-NEXT:    buffer_wbinvl1
4112; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
4113; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4114; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4115; GFX7-NEXT:    s_cbranch_execnz .LBB20_1
4116; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
4117; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
4118; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; 511 floats past %ptr, i.e. the 2044-byte offset seen in the expected output.
4119  %gep = getelementptr float, ptr %ptr, i64 511
  ; No syncscope is given here even though the function name says "agent".
4120  %result = atomicrmw fadd ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
4121  ret float %result
4122}
4123
; No-return variant: f32 atomicrmw fadd (seq_cst) on the float at %ptr + 511
; elements (byte offset 2044 = 0x7fc in the expected output), tagged with
; !amdgpu.no.fine.grained.memory and !amdgpu.ignore.denormal.mode; the result
; (%unused) is discarded and the function returns void.
; Expected output: a native no-return flat_atomic_add_f32 on gfx1200, gfx940
; and gfx1100; a flat_atomic_cmpswap retry loop on gfx1010, tonga and hawaii;
; and on gfx90a and gfx908 a branch on the shared/private aperture bases that
; selects ds_add_f32, a private load/add/store, or global_atomic_add_f32.
; NOTE(review): the name says "agent", but the atomicrmw below has no
; syncscope and the expected output is system scope (SCOPE_SYS on gfx1200,
; buffer_wbl2/buffer_inv sc0 sc1 on gfx940) -- confirm which is intended. Any
; change to the IR requires regenerating the assertions with
; utils/update_llc_test_checks.py.
4124define void @flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 {
4125; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
4126; GFX12:       ; %bb.0:
4127; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
4128; GFX12-NEXT:    s_wait_expcnt 0x0
4129; GFX12-NEXT:    s_wait_samplecnt 0x0
4130; GFX12-NEXT:    s_wait_bvhcnt 0x0
4131; GFX12-NEXT:    s_wait_kmcnt 0x0
4132; GFX12-NEXT:    global_wb scope:SCOPE_SYS
4133; GFX12-NEXT:    s_wait_storecnt 0x0
4134; GFX12-NEXT:    flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_SYS
4135; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
4136; GFX12-NEXT:    global_inv scope:SCOPE_SYS
4137; GFX12-NEXT:    s_setpc_b64 s[30:31]
4138;
4139; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
4140; GFX940:       ; %bb.0:
4141; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4142; GFX940-NEXT:    buffer_wbl2 sc0 sc1
4143; GFX940-NEXT:    flat_atomic_add_f32 v[0:1], v2 offset:2044 sc1
4144; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4145; GFX940-NEXT:    buffer_inv sc0 sc1
4146; GFX940-NEXT:    s_setpc_b64 s[30:31]
4147;
4148; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
4149; GFX11:       ; %bb.0:
4150; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4151; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
4152; GFX11-NEXT:    flat_atomic_add_f32 v[0:1], v2 offset:2044
4153; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
4154; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
4155; GFX11-NEXT:    buffer_gl1_inv
4156; GFX11-NEXT:    buffer_gl0_inv
4157; GFX11-NEXT:    s_setpc_b64 s[30:31]
4158;
4159; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
4160; GFX10:       ; %bb.0:
4161; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4162; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
4163; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
4164; GFX10-NEXT:    s_mov_b32 s4, 0
4165; GFX10-NEXT:    flat_load_dword v4, v[0:1]
4166; GFX10-NEXT:  .LBB21_1: ; %atomicrmw.start
4167; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
4168; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4169; GFX10-NEXT:    v_add_f32_e32 v3, v4, v2
4170; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4171; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4172; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4173; GFX10-NEXT:    buffer_gl1_inv
4174; GFX10-NEXT:    buffer_gl0_inv
4175; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
4176; GFX10-NEXT:    v_mov_b32_e32 v4, v3
4177; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
4178; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
4179; GFX10-NEXT:    s_cbranch_execnz .LBB21_1
4180; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
4181; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
4182; GFX10-NEXT:    s_setpc_b64 s[30:31]
4183;
4184; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
4185; GFX90A:       ; %bb.0:
4186; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4187; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7fc, v0
4188; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
4189; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
4190; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
4191; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4192; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
4193; GFX90A-NEXT:    s_cbranch_execnz .LBB21_3
4194; GFX90A-NEXT:  ; %bb.1: ; %Flow2
4195; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
4196; GFX90A-NEXT:    s_cbranch_execnz .LBB21_8
4197; GFX90A-NEXT:  .LBB21_2: ; %atomicrmw.phi
4198; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
4199; GFX90A-NEXT:    s_waitcnt vmcnt(0)
4200; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4201; GFX90A-NEXT:  .LBB21_3: ; %atomicrmw.check.private
4202; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
4203; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v1
4204; GFX90A-NEXT:    s_and_saveexec_b64 s[6:7], vcc
4205; GFX90A-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
4206; GFX90A-NEXT:    s_cbranch_execz .LBB21_5
4207; GFX90A-NEXT:  ; %bb.4: ; %atomicrmw.global
4208; GFX90A-NEXT:    buffer_wbl2
4209; GFX90A-NEXT:    global_atomic_add_f32 v[0:1], v2, off
4210; GFX90A-NEXT:    s_waitcnt vmcnt(0)
4211; GFX90A-NEXT:    buffer_invl2
4212; GFX90A-NEXT:    buffer_wbinvl1
4213; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
4214; GFX90A-NEXT:    ; implicit-def: $vgpr2
4215; GFX90A-NEXT:  .LBB21_5: ; %Flow
4216; GFX90A-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
4217; GFX90A-NEXT:    s_cbranch_execz .LBB21_7
4218; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.private
4219; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
4220; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
4221; GFX90A-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
4222; GFX90A-NEXT:    s_waitcnt vmcnt(0)
4223; GFX90A-NEXT:    v_add_f32_e32 v1, v1, v2
4224; GFX90A-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
4225; GFX90A-NEXT:  .LBB21_7: ; %Flow1
4226; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
4227; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
4228; GFX90A-NEXT:    ; implicit-def: $vgpr2
4229; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
4230; GFX90A-NEXT:    s_cbranch_execz .LBB21_2
4231; GFX90A-NEXT:  .LBB21_8: ; %atomicrmw.shared
4232; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
4233; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
4234; GFX90A-NEXT:    ds_add_f32 v0, v2
4235; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
4236; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
4237; GFX90A-NEXT:    s_waitcnt vmcnt(0)
4238; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4239;
4240; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
4241; GFX908:       ; %bb.0:
4242; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4243; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7fc, v0
4244; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
4245; GFX908-NEXT:    s_mov_b64 s[4:5], src_shared_base
4246; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
4247; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4248; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
4249; GFX908-NEXT:    s_cbranch_execnz .LBB21_3
4250; GFX908-NEXT:  ; %bb.1: ; %Flow2
4251; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
4252; GFX908-NEXT:    s_cbranch_execnz .LBB21_8
4253; GFX908-NEXT:  .LBB21_2: ; %atomicrmw.phi
4254; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
4255; GFX908-NEXT:    s_waitcnt vmcnt(0)
4256; GFX908-NEXT:    s_setpc_b64 s[30:31]
4257; GFX908-NEXT:  .LBB21_3: ; %atomicrmw.check.private
4258; GFX908-NEXT:    s_mov_b64 s[6:7], src_private_base
4259; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v1
4260; GFX908-NEXT:    s_and_saveexec_b64 s[6:7], vcc
4261; GFX908-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
4262; GFX908-NEXT:    s_cbranch_execz .LBB21_5
4263; GFX908-NEXT:  ; %bb.4: ; %atomicrmw.global
4264; GFX908-NEXT:    global_atomic_add_f32 v[0:1], v2, off
4265; GFX908-NEXT:    s_waitcnt vmcnt(0)
4266; GFX908-NEXT:    buffer_wbinvl1
4267; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
4268; GFX908-NEXT:    ; implicit-def: $vgpr2
4269; GFX908-NEXT:  .LBB21_5: ; %Flow
4270; GFX908-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
4271; GFX908-NEXT:    s_cbranch_execz .LBB21_7
4272; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.private
4273; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
4274; GFX908-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
4275; GFX908-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
4276; GFX908-NEXT:    s_waitcnt vmcnt(0)
4277; GFX908-NEXT:    v_add_f32_e32 v1, v1, v2
4278; GFX908-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
4279; GFX908-NEXT:  .LBB21_7: ; %Flow1
4280; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
4281; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
4282; GFX908-NEXT:    ; implicit-def: $vgpr2
4283; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
4284; GFX908-NEXT:    s_cbranch_execz .LBB21_2
4285; GFX908-NEXT:  .LBB21_8: ; %atomicrmw.shared
4286; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
4287; GFX908-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
4288; GFX908-NEXT:    ds_add_f32 v0, v2
4289; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
4290; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
4291; GFX908-NEXT:    s_waitcnt vmcnt(0)
4292; GFX908-NEXT:    s_setpc_b64 s[30:31]
4293;
4294; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
4295; GFX8:       ; %bb.0:
4296; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4297; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
4298; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4299; GFX8-NEXT:    flat_load_dword v4, v[0:1]
4300; GFX8-NEXT:    s_mov_b64 s[4:5], 0
4301; GFX8-NEXT:  .LBB21_1: ; %atomicrmw.start
4302; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
4303; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4304; GFX8-NEXT:    v_add_f32_e32 v3, v4, v2
4305; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4306; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4307; GFX8-NEXT:    buffer_wbinvl1
4308; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
4309; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4310; GFX8-NEXT:    v_mov_b32_e32 v4, v3
4311; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4312; GFX8-NEXT:    s_cbranch_execnz .LBB21_1
4313; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
4314; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
4315; GFX8-NEXT:    s_setpc_b64 s[30:31]
4316;
4317; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
4318; GFX7:       ; %bb.0:
4319; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4320; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
4321; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4322; GFX7-NEXT:    flat_load_dword v4, v[0:1]
4323; GFX7-NEXT:    s_mov_b64 s[4:5], 0
4324; GFX7-NEXT:  .LBB21_1: ; %atomicrmw.start
4325; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
4326; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4327; GFX7-NEXT:    v_add_f32_e32 v3, v4, v2
4328; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4329; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4330; GFX7-NEXT:    buffer_wbinvl1
4331; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
4332; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4333; GFX7-NEXT:    v_mov_b32_e32 v4, v3
4334; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4335; GFX7-NEXT:    s_cbranch_execnz .LBB21_1
4336; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
4337; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
4338; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; 511 floats past %ptr, i.e. the 2044-byte offset seen in the expected output.
4339  %gep = getelementptr float, ptr %ptr, i64 511
  ; No syncscope is given here even though the function name says "agent".
4340  %unused = atomicrmw fadd ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
4341  ret void
4342}
4343
; Returning f32 atomicrmw fadd (agent scope, seq_cst) directly on %ptr with no
; offset, tagged with !amdgpu.no.remote.memory and
; !amdgpu.ignore.denormal.mode.
; Expected output: a native returning flat_atomic_add_f32 on gfx1200
; (SCOPE_DEV) and gfx940; every other tested target (gfx1100, gfx1010, gfx90a,
; gfx908, tonga, hawaii) gets a flat load followed by a flat_atomic_cmpswap
; retry loop that repeats until the swapped-out value equals the expected one.
4344define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 {
4345; GFX12-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
4346; GFX12:       ; %bb.0:
4347; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
4348; GFX12-NEXT:    s_wait_expcnt 0x0
4349; GFX12-NEXT:    s_wait_samplecnt 0x0
4350; GFX12-NEXT:    s_wait_bvhcnt 0x0
4351; GFX12-NEXT:    s_wait_kmcnt 0x0
4352; GFX12-NEXT:    s_wait_storecnt 0x0
4353; GFX12-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
4354; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
4355; GFX12-NEXT:    global_inv scope:SCOPE_DEV
4356; GFX12-NEXT:    s_setpc_b64 s[30:31]
4357;
4358; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
4359; GFX940:       ; %bb.0:
4360; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4361; GFX940-NEXT:    buffer_wbl2 sc1
4362; GFX940-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 sc0
4363; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4364; GFX940-NEXT:    buffer_inv sc1
4365; GFX940-NEXT:    s_setpc_b64 s[30:31]
4366;
4367; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
4368; GFX11:       ; %bb.0:
4369; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4370; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
4371; GFX11-NEXT:    s_mov_b32 s0, 0
4372; GFX11-NEXT:  .LBB22_1: ; %atomicrmw.start
4373; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
4374; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4375; GFX11-NEXT:    v_mov_b32_e32 v4, v3
4376; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4377; GFX11-NEXT:    v_add_f32_e32 v3, v4, v2
4378; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
4379; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
4380; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4381; GFX11-NEXT:    buffer_gl1_inv
4382; GFX11-NEXT:    buffer_gl0_inv
4383; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
4384; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
4385; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
4386; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
4387; GFX11-NEXT:    s_cbranch_execnz .LBB22_1
4388; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
4389; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
4390; GFX11-NEXT:    v_mov_b32_e32 v0, v3
4391; GFX11-NEXT:    s_setpc_b64 s[30:31]
4392;
4393; GFX10-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
4394; GFX10:       ; %bb.0:
4395; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4396; GFX10-NEXT:    flat_load_dword v3, v[0:1]
4397; GFX10-NEXT:    s_mov_b32 s4, 0
4398; GFX10-NEXT:  .LBB22_1: ; %atomicrmw.start
4399; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
4400; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4401; GFX10-NEXT:    v_mov_b32_e32 v4, v3
4402; GFX10-NEXT:    v_add_f32_e32 v3, v4, v2
4403; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4404; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4405; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4406; GFX10-NEXT:    buffer_gl1_inv
4407; GFX10-NEXT:    buffer_gl0_inv
4408; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
4409; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
4410; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
4411; GFX10-NEXT:    s_cbranch_execnz .LBB22_1
4412; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
4413; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
4414; GFX10-NEXT:    v_mov_b32_e32 v0, v3
4415; GFX10-NEXT:    s_setpc_b64 s[30:31]
4416;
4417; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
4418; GFX90A:       ; %bb.0:
4419; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4420; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
4421; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
4422; GFX90A-NEXT:  .LBB22_1: ; %atomicrmw.start
4423; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
4424; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4425; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
4426; GFX90A-NEXT:    v_add_f32_e32 v4, v5, v2
4427; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
4428; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4429; GFX90A-NEXT:    buffer_wbinvl1
4430; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
4431; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4432; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4433; GFX90A-NEXT:    s_cbranch_execnz .LBB22_1
4434; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
4435; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
4436; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
4437; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4438;
4439; GFX908-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
4440; GFX908:       ; %bb.0:
4441; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4442; GFX908-NEXT:    flat_load_dword v3, v[0:1]
4443; GFX908-NEXT:    s_mov_b64 s[4:5], 0
4444; GFX908-NEXT:  .LBB22_1: ; %atomicrmw.start
4445; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
4446; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4447; GFX908-NEXT:    v_mov_b32_e32 v4, v3
4448; GFX908-NEXT:    v_add_f32_e32 v3, v4, v2
4449; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4450; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4451; GFX908-NEXT:    buffer_wbinvl1
4452; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
4453; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4454; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4455; GFX908-NEXT:    s_cbranch_execnz .LBB22_1
4456; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
4457; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
4458; GFX908-NEXT:    v_mov_b32_e32 v0, v3
4459; GFX908-NEXT:    s_setpc_b64 s[30:31]
4460;
4461; GFX8-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
4462; GFX8:       ; %bb.0:
4463; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4464; GFX8-NEXT:    flat_load_dword v3, v[0:1]
4465; GFX8-NEXT:    s_mov_b64 s[4:5], 0
4466; GFX8-NEXT:  .LBB22_1: ; %atomicrmw.start
4467; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
4468; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4469; GFX8-NEXT:    v_mov_b32_e32 v4, v3
4470; GFX8-NEXT:    v_add_f32_e32 v3, v4, v2
4471; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4472; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4473; GFX8-NEXT:    buffer_wbinvl1
4474; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
4475; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4476; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4477; GFX8-NEXT:    s_cbranch_execnz .LBB22_1
4478; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
4479; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
4480; GFX8-NEXT:    v_mov_b32_e32 v0, v3
4481; GFX8-NEXT:    s_setpc_b64 s[30:31]
4482;
4483; GFX7-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
4484; GFX7:       ; %bb.0:
4485; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4486; GFX7-NEXT:    flat_load_dword v3, v[0:1]
4487; GFX7-NEXT:    s_mov_b64 s[4:5], 0
4488; GFX7-NEXT:  .LBB22_1: ; %atomicrmw.start
4489; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
4490; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4491; GFX7-NEXT:    v_mov_b32_e32 v4, v3
4492; GFX7-NEXT:    v_add_f32_e32 v3, v4, v2
4493; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4494; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4495; GFX7-NEXT:    buffer_wbinvl1
4496; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
4497; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4498; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4499; GFX7-NEXT:    s_cbranch_execnz .LBB22_1
4500; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
4501; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
4502; GFX7-NEXT:    v_mov_b32_e32 v0, v3
4503; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; The old value produced by the atomic is the function's return value.
4504  %result = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
4505  ret float %result
4506}
4507
4508define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 {
4509; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
4510; GFX12:       ; %bb.0:
4511; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
4512; GFX12-NEXT:    s_wait_expcnt 0x0
4513; GFX12-NEXT:    s_wait_samplecnt 0x0
4514; GFX12-NEXT:    s_wait_bvhcnt 0x0
4515; GFX12-NEXT:    s_wait_kmcnt 0x0
4516; GFX12-NEXT:    s_wait_storecnt 0x0
4517; GFX12-NEXT:    flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV
4518; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
4519; GFX12-NEXT:    global_inv scope:SCOPE_DEV
4520; GFX12-NEXT:    s_setpc_b64 s[30:31]
4521;
4522; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
4523; GFX940:       ; %bb.0:
4524; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4525; GFX940-NEXT:    buffer_wbl2 sc1
4526; GFX940-NEXT:    flat_atomic_add_f32 v[0:1], v2
4527; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4528; GFX940-NEXT:    buffer_inv sc1
4529; GFX940-NEXT:    s_setpc_b64 s[30:31]
4530;
4531; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
4532; GFX11:       ; %bb.0:
4533; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4534; GFX11-NEXT:    flat_load_b32 v4, v[0:1]
4535; GFX11-NEXT:    s_mov_b32 s0, 0
4536; GFX11-NEXT:  .LBB23_1: ; %atomicrmw.start
4537; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
4538; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4539; GFX11-NEXT:    v_add_f32_e32 v3, v4, v2
4540; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
4541; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
4542; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4543; GFX11-NEXT:    buffer_gl1_inv
4544; GFX11-NEXT:    buffer_gl0_inv
4545; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
4546; GFX11-NEXT:    v_mov_b32_e32 v4, v3
4547; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
4548; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
4549; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
4550; GFX11-NEXT:    s_cbranch_execnz .LBB23_1
4551; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
4552; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
4553; GFX11-NEXT:    s_setpc_b64 s[30:31]
4554;
4555; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
4556; GFX10:       ; %bb.0:
4557; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4558; GFX10-NEXT:    flat_load_dword v4, v[0:1]
4559; GFX10-NEXT:    s_mov_b32 s4, 0
4560; GFX10-NEXT:  .LBB23_1: ; %atomicrmw.start
4561; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
4562; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4563; GFX10-NEXT:    v_add_f32_e32 v3, v4, v2
4564; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4565; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4566; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4567; GFX10-NEXT:    buffer_gl1_inv
4568; GFX10-NEXT:    buffer_gl0_inv
4569; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
4570; GFX10-NEXT:    v_mov_b32_e32 v4, v3
4571; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
4572; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
4573; GFX10-NEXT:    s_cbranch_execnz .LBB23_1
4574; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
4575; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
4576; GFX10-NEXT:    s_setpc_b64 s[30:31]
4577;
4578; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
4579; GFX90A:       ; %bb.0:
4580; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4581; GFX90A-NEXT:    flat_load_dword v5, v[0:1]
4582; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
4583; GFX90A-NEXT:  .LBB23_1: ; %atomicrmw.start
4584; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
4585; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4586; GFX90A-NEXT:    v_add_f32_e32 v4, v5, v2
4587; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
4588; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4589; GFX90A-NEXT:    buffer_wbinvl1
4590; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
4591; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4592; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
4593; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4594; GFX90A-NEXT:    s_cbranch_execnz .LBB23_1
4595; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
4596; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
4597; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4598;
4599; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
4600; GFX908:       ; %bb.0:
4601; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4602; GFX908-NEXT:    flat_load_dword v4, v[0:1]
4603; GFX908-NEXT:    s_mov_b64 s[4:5], 0
4604; GFX908-NEXT:  .LBB23_1: ; %atomicrmw.start
4605; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
4606; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4607; GFX908-NEXT:    v_add_f32_e32 v3, v4, v2
4608; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4609; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4610; GFX908-NEXT:    buffer_wbinvl1
4611; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
4612; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4613; GFX908-NEXT:    v_mov_b32_e32 v4, v3
4614; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4615; GFX908-NEXT:    s_cbranch_execnz .LBB23_1
4616; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
4617; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
4618; GFX908-NEXT:    s_setpc_b64 s[30:31]
4619;
4620; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
4621; GFX8:       ; %bb.0:
4622; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4623; GFX8-NEXT:    flat_load_dword v4, v[0:1]
4624; GFX8-NEXT:    s_mov_b64 s[4:5], 0
4625; GFX8-NEXT:  .LBB23_1: ; %atomicrmw.start
4626; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
4627; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4628; GFX8-NEXT:    v_add_f32_e32 v3, v4, v2
4629; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4630; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4631; GFX8-NEXT:    buffer_wbinvl1
4632; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
4633; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4634; GFX8-NEXT:    v_mov_b32_e32 v4, v3
4635; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4636; GFX8-NEXT:    s_cbranch_execnz .LBB23_1
4637; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
4638; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
4639; GFX8-NEXT:    s_setpc_b64 s[30:31]
4640;
4641; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
4642; GFX7:       ; %bb.0:
4643; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4644; GFX7-NEXT:    flat_load_dword v4, v[0:1]
4645; GFX7-NEXT:    s_mov_b64 s[4:5], 0
4646; GFX7-NEXT:  .LBB23_1: ; %atomicrmw.start
4647; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
4648; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4649; GFX7-NEXT:    v_add_f32_e32 v3, v4, v2
4650; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4651; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4652; GFX7-NEXT:    buffer_wbinvl1
4653; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
4654; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4655; GFX7-NEXT:    v_mov_b32_e32 v4, v3
4656; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4657; GFX7-NEXT:    s_cbranch_execnz .LBB23_1
4658; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
4659; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
4660; GFX7-NEXT:    s_setpc_b64 s[30:31]
4661  %unused = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
4662  ret void
4663}
4664
; Value-returning flat atomicrmw fadd on float at agent scope (seq_cst), tagged
; only with !amdgpu.no.remote.memory. Per the assertions below, the gfx1200 and
; gfx940 runs expect a single returning flat_atomic_add_f32, while the gfx1100,
; gfx1010, gfx90a, gfx908, tonga and hawaii runs expect a flat load followed by
; a flat_atomic_cmpswap retry loop (.LBB24_1).
; The assertion lines are autogenerated (see the first line of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
4665define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr %ptr, float %val) #0 {
4666; GFX12-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory:
4667; GFX12:       ; %bb.0:
4668; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
4669; GFX12-NEXT:    s_wait_expcnt 0x0
4670; GFX12-NEXT:    s_wait_samplecnt 0x0
4671; GFX12-NEXT:    s_wait_bvhcnt 0x0
4672; GFX12-NEXT:    s_wait_kmcnt 0x0
4673; GFX12-NEXT:    s_wait_storecnt 0x0
4674; GFX12-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
4675; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
4676; GFX12-NEXT:    global_inv scope:SCOPE_DEV
4677; GFX12-NEXT:    s_setpc_b64 s[30:31]
4678;
4679; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory:
4680; GFX940:       ; %bb.0:
4681; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4682; GFX940-NEXT:    buffer_wbl2 sc1
4683; GFX940-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 sc0
4684; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4685; GFX940-NEXT:    buffer_inv sc1
4686; GFX940-NEXT:    s_setpc_b64 s[30:31]
4687;
4688; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory:
4689; GFX11:       ; %bb.0:
4690; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4691; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
4692; GFX11-NEXT:    s_mov_b32 s0, 0
4693; GFX11-NEXT:  .LBB24_1: ; %atomicrmw.start
4694; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
4695; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4696; GFX11-NEXT:    v_mov_b32_e32 v4, v3
4697; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4698; GFX11-NEXT:    v_add_f32_e32 v3, v4, v2
4699; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
4700; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
4701; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4702; GFX11-NEXT:    buffer_gl1_inv
4703; GFX11-NEXT:    buffer_gl0_inv
4704; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
4705; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
4706; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
4707; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
4708; GFX11-NEXT:    s_cbranch_execnz .LBB24_1
4709; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
4710; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
4711; GFX11-NEXT:    v_mov_b32_e32 v0, v3
4712; GFX11-NEXT:    s_setpc_b64 s[30:31]
4713;
4714; GFX10-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory:
4715; GFX10:       ; %bb.0:
4716; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4717; GFX10-NEXT:    flat_load_dword v3, v[0:1]
4718; GFX10-NEXT:    s_mov_b32 s4, 0
4719; GFX10-NEXT:  .LBB24_1: ; %atomicrmw.start
4720; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
4721; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4722; GFX10-NEXT:    v_mov_b32_e32 v4, v3
4723; GFX10-NEXT:    v_add_f32_e32 v3, v4, v2
4724; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4725; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4726; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4727; GFX10-NEXT:    buffer_gl1_inv
4728; GFX10-NEXT:    buffer_gl0_inv
4729; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
4730; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
4731; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
4732; GFX10-NEXT:    s_cbranch_execnz .LBB24_1
4733; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
4734; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
4735; GFX10-NEXT:    v_mov_b32_e32 v0, v3
4736; GFX10-NEXT:    s_setpc_b64 s[30:31]
4737;
4738; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory:
4739; GFX90A:       ; %bb.0:
4740; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4741; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
4742; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
4743; GFX90A-NEXT:  .LBB24_1: ; %atomicrmw.start
4744; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
4745; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4746; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
4747; GFX90A-NEXT:    v_add_f32_e32 v4, v5, v2
4748; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
4749; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4750; GFX90A-NEXT:    buffer_wbinvl1
4751; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
4752; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4753; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4754; GFX90A-NEXT:    s_cbranch_execnz .LBB24_1
4755; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
4756; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
4757; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
4758; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4759;
4760; GFX908-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory:
4761; GFX908:       ; %bb.0:
4762; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4763; GFX908-NEXT:    flat_load_dword v3, v[0:1]
4764; GFX908-NEXT:    s_mov_b64 s[4:5], 0
4765; GFX908-NEXT:  .LBB24_1: ; %atomicrmw.start
4766; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
4767; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4768; GFX908-NEXT:    v_mov_b32_e32 v4, v3
4769; GFX908-NEXT:    v_add_f32_e32 v3, v4, v2
4770; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4771; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4772; GFX908-NEXT:    buffer_wbinvl1
4773; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
4774; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4775; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4776; GFX908-NEXT:    s_cbranch_execnz .LBB24_1
4777; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
4778; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
4779; GFX908-NEXT:    v_mov_b32_e32 v0, v3
4780; GFX908-NEXT:    s_setpc_b64 s[30:31]
4781;
4782; GFX8-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory:
4783; GFX8:       ; %bb.0:
4784; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4785; GFX8-NEXT:    flat_load_dword v3, v[0:1]
4786; GFX8-NEXT:    s_mov_b64 s[4:5], 0
4787; GFX8-NEXT:  .LBB24_1: ; %atomicrmw.start
4788; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
4789; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4790; GFX8-NEXT:    v_mov_b32_e32 v4, v3
4791; GFX8-NEXT:    v_add_f32_e32 v3, v4, v2
4792; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4793; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4794; GFX8-NEXT:    buffer_wbinvl1
4795; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
4796; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4797; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4798; GFX8-NEXT:    s_cbranch_execnz .LBB24_1
4799; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
4800; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
4801; GFX8-NEXT:    v_mov_b32_e32 v0, v3
4802; GFX8-NEXT:    s_setpc_b64 s[30:31]
4803;
4804; GFX7-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory:
4805; GFX7:       ; %bb.0:
4806; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4807; GFX7-NEXT:    flat_load_dword v3, v[0:1]
4808; GFX7-NEXT:    s_mov_b64 s[4:5], 0
4809; GFX7-NEXT:  .LBB24_1: ; %atomicrmw.start
4810; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
4811; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4812; GFX7-NEXT:    v_mov_b32_e32 v4, v3
4813; GFX7-NEXT:    v_add_f32_e32 v3, v4, v2
4814; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4815; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4816; GFX7-NEXT:    buffer_wbinvl1
4817; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
4818; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4819; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4820; GFX7-NEXT:    s_cbranch_execnz .LBB24_1
4821; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
4822; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
4823; GFX7-NEXT:    v_mov_b32_e32 v0, v3
4824; GFX7-NEXT:    s_setpc_b64 s[30:31]
; IR under test: the value produced by the atomicrmw is returned, so the
; returning form of the operation is exercised.
4825  %result = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
4826  ret float %result
4827}
4828
; No-return flat atomicrmw fadd on float at agent scope (seq_cst), tagged only
; with !amdgpu.no.remote.memory; the atomicrmw result is left unused. Per the
; assertions below, the gfx1200 and gfx940 runs expect a single
; flat_atomic_add_f32 without a destination register, while the gfx1100,
; gfx1010, gfx90a, gfx908, tonga and hawaii runs expect a flat load followed by
; a flat_atomic_cmpswap retry loop (.LBB25_1).
; The assertion lines are autogenerated (see the first line of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
4829define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr %ptr, float %val) #0 {
4830; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory:
4831; GFX12:       ; %bb.0:
4832; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
4833; GFX12-NEXT:    s_wait_expcnt 0x0
4834; GFX12-NEXT:    s_wait_samplecnt 0x0
4835; GFX12-NEXT:    s_wait_bvhcnt 0x0
4836; GFX12-NEXT:    s_wait_kmcnt 0x0
4837; GFX12-NEXT:    s_wait_storecnt 0x0
4838; GFX12-NEXT:    flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV
4839; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
4840; GFX12-NEXT:    global_inv scope:SCOPE_DEV
4841; GFX12-NEXT:    s_setpc_b64 s[30:31]
4842;
4843; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory:
4844; GFX940:       ; %bb.0:
4845; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4846; GFX940-NEXT:    buffer_wbl2 sc1
4847; GFX940-NEXT:    flat_atomic_add_f32 v[0:1], v2
4848; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4849; GFX940-NEXT:    buffer_inv sc1
4850; GFX940-NEXT:    s_setpc_b64 s[30:31]
4851;
4852; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory:
4853; GFX11:       ; %bb.0:
4854; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4855; GFX11-NEXT:    flat_load_b32 v4, v[0:1]
4856; GFX11-NEXT:    s_mov_b32 s0, 0
4857; GFX11-NEXT:  .LBB25_1: ; %atomicrmw.start
4858; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
4859; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4860; GFX11-NEXT:    v_add_f32_e32 v3, v4, v2
4861; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
4862; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
4863; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4864; GFX11-NEXT:    buffer_gl1_inv
4865; GFX11-NEXT:    buffer_gl0_inv
4866; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
4867; GFX11-NEXT:    v_mov_b32_e32 v4, v3
4868; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
4869; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
4870; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
4871; GFX11-NEXT:    s_cbranch_execnz .LBB25_1
4872; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
4873; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
4874; GFX11-NEXT:    s_setpc_b64 s[30:31]
4875;
4876; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory:
4877; GFX10:       ; %bb.0:
4878; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4879; GFX10-NEXT:    flat_load_dword v4, v[0:1]
4880; GFX10-NEXT:    s_mov_b32 s4, 0
4881; GFX10-NEXT:  .LBB25_1: ; %atomicrmw.start
4882; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
4883; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4884; GFX10-NEXT:    v_add_f32_e32 v3, v4, v2
4885; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4886; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4887; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4888; GFX10-NEXT:    buffer_gl1_inv
4889; GFX10-NEXT:    buffer_gl0_inv
4890; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
4891; GFX10-NEXT:    v_mov_b32_e32 v4, v3
4892; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
4893; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
4894; GFX10-NEXT:    s_cbranch_execnz .LBB25_1
4895; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
4896; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
4897; GFX10-NEXT:    s_setpc_b64 s[30:31]
4898;
4899; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory:
4900; GFX90A:       ; %bb.0:
4901; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4902; GFX90A-NEXT:    flat_load_dword v5, v[0:1]
4903; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
4904; GFX90A-NEXT:  .LBB25_1: ; %atomicrmw.start
4905; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
4906; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4907; GFX90A-NEXT:    v_add_f32_e32 v4, v5, v2
4908; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
4909; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4910; GFX90A-NEXT:    buffer_wbinvl1
4911; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
4912; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4913; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
4914; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4915; GFX90A-NEXT:    s_cbranch_execnz .LBB25_1
4916; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
4917; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
4918; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4919;
4920; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory:
4921; GFX908:       ; %bb.0:
4922; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4923; GFX908-NEXT:    flat_load_dword v4, v[0:1]
4924; GFX908-NEXT:    s_mov_b64 s[4:5], 0
4925; GFX908-NEXT:  .LBB25_1: ; %atomicrmw.start
4926; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
4927; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4928; GFX908-NEXT:    v_add_f32_e32 v3, v4, v2
4929; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4930; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4931; GFX908-NEXT:    buffer_wbinvl1
4932; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
4933; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4934; GFX908-NEXT:    v_mov_b32_e32 v4, v3
4935; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4936; GFX908-NEXT:    s_cbranch_execnz .LBB25_1
4937; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
4938; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
4939; GFX908-NEXT:    s_setpc_b64 s[30:31]
4940;
4941; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory:
4942; GFX8:       ; %bb.0:
4943; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4944; GFX8-NEXT:    flat_load_dword v4, v[0:1]
4945; GFX8-NEXT:    s_mov_b64 s[4:5], 0
4946; GFX8-NEXT:  .LBB25_1: ; %atomicrmw.start
4947; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
4948; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4949; GFX8-NEXT:    v_add_f32_e32 v3, v4, v2
4950; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4951; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4952; GFX8-NEXT:    buffer_wbinvl1
4953; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
4954; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4955; GFX8-NEXT:    v_mov_b32_e32 v4, v3
4956; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4957; GFX8-NEXT:    s_cbranch_execnz .LBB25_1
4958; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
4959; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
4960; GFX8-NEXT:    s_setpc_b64 s[30:31]
4961;
4962; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory:
4963; GFX7:       ; %bb.0:
4964; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4965; GFX7-NEXT:    flat_load_dword v4, v[0:1]
4966; GFX7-NEXT:    s_mov_b64 s[4:5], 0
4967; GFX7-NEXT:  .LBB25_1: ; %atomicrmw.start
4968; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
4969; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4970; GFX7-NEXT:    v_add_f32_e32 v3, v4, v2
4971; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4972; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4973; GFX7-NEXT:    buffer_wbinvl1
4974; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
4975; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4976; GFX7-NEXT:    v_mov_b32_e32 v4, v3
4977; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4978; GFX7-NEXT:    s_cbranch_execnz .LBB25_1
4979; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
4980; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
4981; GFX7-NEXT:    s_setpc_b64 s[30:31]
; IR under test: %unused is never read, so the no-return form of the operation
; is exercised.
4982  %unused = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
4983  ret void
4984}
4985
; Value-returning flat atomicrmw fadd on float at agent scope (seq_cst), tagged
; with !amdgpu.no.fine.grained.memory, !amdgpu.no.remote.memory and
; !amdgpu.ignore.denormal.mode together. Per the assertions below:
;   * the gfx1200, gfx940 and gfx1100 runs expect a single returning
;     flat_atomic_add_f32;
;   * the gfx90a run expects a branch sequence that compares v1 (the upper half
;     of the v[0:1] address pair) against src_shared_base and src_private_base,
;     then uses global_atomic_add_f32, a buffer load / v_add_f32 / buffer store
;     sequence, or ds_add_rtn_f32 depending on which block is taken;
;   * the gfx1010, gfx908, tonga and hawaii runs expect a flat load followed by
;     a flat_atomic_cmpswap retry loop (.LBB26_1).
; The assertion lines are autogenerated (see the first line of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
4986define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 {
4987; GFX12-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
4988; GFX12:       ; %bb.0:
4989; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
4990; GFX12-NEXT:    s_wait_expcnt 0x0
4991; GFX12-NEXT:    s_wait_samplecnt 0x0
4992; GFX12-NEXT:    s_wait_bvhcnt 0x0
4993; GFX12-NEXT:    s_wait_kmcnt 0x0
4994; GFX12-NEXT:    s_wait_storecnt 0x0
4995; GFX12-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
4996; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
4997; GFX12-NEXT:    global_inv scope:SCOPE_DEV
4998; GFX12-NEXT:    s_setpc_b64 s[30:31]
4999;
5000; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
5001; GFX940:       ; %bb.0:
5002; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5003; GFX940-NEXT:    buffer_wbl2 sc1
5004; GFX940-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 sc0
5005; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5006; GFX940-NEXT:    buffer_inv sc1
5007; GFX940-NEXT:    s_setpc_b64 s[30:31]
5008;
5009; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
5010; GFX11:       ; %bb.0:
5011; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5012; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
5013; GFX11-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 glc
5014; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5015; GFX11-NEXT:    buffer_gl1_inv
5016; GFX11-NEXT:    buffer_gl0_inv
5017; GFX11-NEXT:    s_setpc_b64 s[30:31]
5018;
5019; GFX10-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
5020; GFX10:       ; %bb.0:
5021; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5022; GFX10-NEXT:    flat_load_dword v3, v[0:1]
5023; GFX10-NEXT:    s_mov_b32 s4, 0
5024; GFX10-NEXT:  .LBB26_1: ; %atomicrmw.start
5025; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
5026; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5027; GFX10-NEXT:    v_mov_b32_e32 v4, v3
5028; GFX10-NEXT:    v_add_f32_e32 v3, v4, v2
5029; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
5030; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5031; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5032; GFX10-NEXT:    buffer_gl1_inv
5033; GFX10-NEXT:    buffer_gl0_inv
5034; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
5035; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
5036; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
5037; GFX10-NEXT:    s_cbranch_execnz .LBB26_1
5038; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
5039; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
5040; GFX10-NEXT:    v_mov_b32_e32 v0, v3
5041; GFX10-NEXT:    s_setpc_b64 s[30:31]
5042;
5043; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
5044; GFX90A:       ; %bb.0:
5045; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5046; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
5047; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
5048; GFX90A-NEXT:    ; implicit-def: $vgpr3
5049; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5050; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
5051; GFX90A-NEXT:    s_cbranch_execz .LBB26_6
5052; GFX90A-NEXT:  ; %bb.1: ; %atomicrmw.check.private
5053; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
5054; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v1
5055; GFX90A-NEXT:    ; implicit-def: $vgpr3
5056; GFX90A-NEXT:    s_and_saveexec_b64 s[6:7], vcc
5057; GFX90A-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
5058; GFX90A-NEXT:    s_cbranch_execz .LBB26_3
5059; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.global
5060; GFX90A-NEXT:    global_atomic_add_f32 v3, v[0:1], v2, off glc
5061; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5062; GFX90A-NEXT:    buffer_wbinvl1
5063; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
5064; GFX90A-NEXT:    ; implicit-def: $vgpr2
5065; GFX90A-NEXT:  .LBB26_3: ; %Flow
5066; GFX90A-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
5067; GFX90A-NEXT:    s_cbranch_execz .LBB26_5
5068; GFX90A-NEXT:  ; %bb.4: ; %atomicrmw.private
5069; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
5070; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
5071; GFX90A-NEXT:    buffer_load_dword v3, v0, s[0:3], 0 offen
5072; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5073; GFX90A-NEXT:    v_add_f32_e32 v1, v3, v2
5074; GFX90A-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
5075; GFX90A-NEXT:  .LBB26_5: ; %Flow1
5076; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
5077; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
5078; GFX90A-NEXT:    ; implicit-def: $vgpr2
5079; GFX90A-NEXT:  .LBB26_6: ; %Flow2
5080; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
5081; GFX90A-NEXT:    s_cbranch_execz .LBB26_8
5082; GFX90A-NEXT:  ; %bb.7: ; %atomicrmw.shared
5083; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
5084; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
5085; GFX90A-NEXT:    ds_add_rtn_f32 v3, v0, v2
5086; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
5087; GFX90A-NEXT:  .LBB26_8: ; %atomicrmw.phi
5088; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
5089; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
5090; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5091; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5092;
5093; GFX908-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
5094; GFX908:       ; %bb.0:
5095; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5096; GFX908-NEXT:    flat_load_dword v3, v[0:1]
5097; GFX908-NEXT:    s_mov_b64 s[4:5], 0
5098; GFX908-NEXT:  .LBB26_1: ; %atomicrmw.start
5099; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
5100; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5101; GFX908-NEXT:    v_mov_b32_e32 v4, v3
5102; GFX908-NEXT:    v_add_f32_e32 v3, v4, v2
5103; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5104; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5105; GFX908-NEXT:    buffer_wbinvl1
5106; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
5107; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5108; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5109; GFX908-NEXT:    s_cbranch_execnz .LBB26_1
5110; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
5111; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
5112; GFX908-NEXT:    v_mov_b32_e32 v0, v3
5113; GFX908-NEXT:    s_setpc_b64 s[30:31]
5114;
5115; GFX8-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
5116; GFX8:       ; %bb.0:
5117; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5118; GFX8-NEXT:    flat_load_dword v3, v[0:1]
5119; GFX8-NEXT:    s_mov_b64 s[4:5], 0
5120; GFX8-NEXT:  .LBB26_1: ; %atomicrmw.start
5121; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
5122; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5123; GFX8-NEXT:    v_mov_b32_e32 v4, v3
5124; GFX8-NEXT:    v_add_f32_e32 v3, v4, v2
5125; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5126; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5127; GFX8-NEXT:    buffer_wbinvl1
5128; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
5129; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5130; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5131; GFX8-NEXT:    s_cbranch_execnz .LBB26_1
5132; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
5133; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
5134; GFX8-NEXT:    v_mov_b32_e32 v0, v3
5135; GFX8-NEXT:    s_setpc_b64 s[30:31]
5136;
5137; GFX7-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
5138; GFX7:       ; %bb.0:
5139; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5140; GFX7-NEXT:    flat_load_dword v3, v[0:1]
5141; GFX7-NEXT:    s_mov_b64 s[4:5], 0
5142; GFX7-NEXT:  .LBB26_1: ; %atomicrmw.start
5143; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
5144; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5145; GFX7-NEXT:    v_mov_b32_e32 v4, v3
5146; GFX7-NEXT:    v_add_f32_e32 v3, v4, v2
5147; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5148; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5149; GFX7-NEXT:    buffer_wbinvl1
5150; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
5151; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5152; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5153; GFX7-NEXT:    s_cbranch_execnz .LBB26_1
5154; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
5155; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
5156; GFX7-NEXT:    v_mov_b32_e32 v0, v3
5157; GFX7-NEXT:    s_setpc_b64 s[30:31]
; IR under test: all three metadata kinds are attached to one atomicrmw and the
; resulting value is returned.
5158  %result = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
5159  ret float %result
5160}
5161
5162define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 {
5163; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
5164; GFX12:       ; %bb.0:
5165; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
5166; GFX12-NEXT:    s_wait_expcnt 0x0
5167; GFX12-NEXT:    s_wait_samplecnt 0x0
5168; GFX12-NEXT:    s_wait_bvhcnt 0x0
5169; GFX12-NEXT:    s_wait_kmcnt 0x0
5170; GFX12-NEXT:    s_wait_storecnt 0x0
5171; GFX12-NEXT:    flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV
5172; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
5173; GFX12-NEXT:    global_inv scope:SCOPE_DEV
5174; GFX12-NEXT:    s_setpc_b64 s[30:31]
5175;
5176; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
5177; GFX940:       ; %bb.0:
5178; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5179; GFX940-NEXT:    buffer_wbl2 sc1
5180; GFX940-NEXT:    flat_atomic_add_f32 v[0:1], v2
5181; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5182; GFX940-NEXT:    buffer_inv sc1
5183; GFX940-NEXT:    s_setpc_b64 s[30:31]
5184;
5185; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
5186; GFX11:       ; %bb.0:
5187; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5188; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
5189; GFX11-NEXT:    flat_atomic_add_f32 v[0:1], v2
5190; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
5191; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
5192; GFX11-NEXT:    buffer_gl1_inv
5193; GFX11-NEXT:    buffer_gl0_inv
5194; GFX11-NEXT:    s_setpc_b64 s[30:31]
5195;
5196; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
5197; GFX10:       ; %bb.0:
5198; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5199; GFX10-NEXT:    flat_load_dword v4, v[0:1]
5200; GFX10-NEXT:    s_mov_b32 s4, 0
5201; GFX10-NEXT:  .LBB27_1: ; %atomicrmw.start
5202; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
5203; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5204; GFX10-NEXT:    v_add_f32_e32 v3, v4, v2
5205; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
5206; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5207; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5208; GFX10-NEXT:    buffer_gl1_inv
5209; GFX10-NEXT:    buffer_gl0_inv
5210; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
5211; GFX10-NEXT:    v_mov_b32_e32 v4, v3
5212; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
5213; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
5214; GFX10-NEXT:    s_cbranch_execnz .LBB27_1
5215; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
5216; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
5217; GFX10-NEXT:    s_setpc_b64 s[30:31]
5218;
5219; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
5220; GFX90A:       ; %bb.0:
5221; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5222; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
5223; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
5224; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5225; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
5226; GFX90A-NEXT:    s_cbranch_execnz .LBB27_3
5227; GFX90A-NEXT:  ; %bb.1: ; %Flow2
5228; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
5229; GFX90A-NEXT:    s_cbranch_execnz .LBB27_8
5230; GFX90A-NEXT:  .LBB27_2: ; %atomicrmw.phi
5231; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
5232; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5233; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5234; GFX90A-NEXT:  .LBB27_3: ; %atomicrmw.check.private
5235; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
5236; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v1
5237; GFX90A-NEXT:    s_and_saveexec_b64 s[6:7], vcc
5238; GFX90A-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
5239; GFX90A-NEXT:    s_cbranch_execz .LBB27_5
5240; GFX90A-NEXT:  ; %bb.4: ; %atomicrmw.global
5241; GFX90A-NEXT:    global_atomic_add_f32 v[0:1], v2, off
5242; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5243; GFX90A-NEXT:    buffer_wbinvl1
5244; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
5245; GFX90A-NEXT:    ; implicit-def: $vgpr2
5246; GFX90A-NEXT:  .LBB27_5: ; %Flow
5247; GFX90A-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
5248; GFX90A-NEXT:    s_cbranch_execz .LBB27_7
5249; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.private
5250; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
5251; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
5252; GFX90A-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
5253; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5254; GFX90A-NEXT:    v_add_f32_e32 v1, v1, v2
5255; GFX90A-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
5256; GFX90A-NEXT:  .LBB27_7: ; %Flow1
5257; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
5258; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
5259; GFX90A-NEXT:    ; implicit-def: $vgpr2
5260; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
5261; GFX90A-NEXT:    s_cbranch_execz .LBB27_2
5262; GFX90A-NEXT:  .LBB27_8: ; %atomicrmw.shared
5263; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
5264; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
5265; GFX90A-NEXT:    ds_add_f32 v0, v2
5266; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
5267; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
5268; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5269; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5270;
5271; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
5272; GFX908:       ; %bb.0:
5273; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5274; GFX908-NEXT:    s_mov_b64 s[4:5], src_shared_base
5275; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
5276; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5277; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
5278; GFX908-NEXT:    s_cbranch_execnz .LBB27_3
5279; GFX908-NEXT:  ; %bb.1: ; %Flow2
5280; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
5281; GFX908-NEXT:    s_cbranch_execnz .LBB27_8
5282; GFX908-NEXT:  .LBB27_2: ; %atomicrmw.phi
5283; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
5284; GFX908-NEXT:    s_waitcnt vmcnt(0)
5285; GFX908-NEXT:    s_setpc_b64 s[30:31]
5286; GFX908-NEXT:  .LBB27_3: ; %atomicrmw.check.private
5287; GFX908-NEXT:    s_mov_b64 s[6:7], src_private_base
5288; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v1
5289; GFX908-NEXT:    s_and_saveexec_b64 s[6:7], vcc
5290; GFX908-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
5291; GFX908-NEXT:    s_cbranch_execz .LBB27_5
5292; GFX908-NEXT:  ; %bb.4: ; %atomicrmw.global
5293; GFX908-NEXT:    global_atomic_add_f32 v[0:1], v2, off
5294; GFX908-NEXT:    s_waitcnt vmcnt(0)
5295; GFX908-NEXT:    buffer_wbinvl1
5296; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
5297; GFX908-NEXT:    ; implicit-def: $vgpr2
5298; GFX908-NEXT:  .LBB27_5: ; %Flow
5299; GFX908-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
5300; GFX908-NEXT:    s_cbranch_execz .LBB27_7
5301; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.private
5302; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
5303; GFX908-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
5304; GFX908-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
5305; GFX908-NEXT:    s_waitcnt vmcnt(0)
5306; GFX908-NEXT:    v_add_f32_e32 v1, v1, v2
5307; GFX908-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
5308; GFX908-NEXT:  .LBB27_7: ; %Flow1
5309; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
5310; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
5311; GFX908-NEXT:    ; implicit-def: $vgpr2
5312; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
5313; GFX908-NEXT:    s_cbranch_execz .LBB27_2
5314; GFX908-NEXT:  .LBB27_8: ; %atomicrmw.shared
5315; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
5316; GFX908-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
5317; GFX908-NEXT:    ds_add_f32 v0, v2
5318; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
5319; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
5320; GFX908-NEXT:    s_waitcnt vmcnt(0)
5321; GFX908-NEXT:    s_setpc_b64 s[30:31]
5322;
5323; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
5324; GFX8:       ; %bb.0:
5325; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5326; GFX8-NEXT:    flat_load_dword v4, v[0:1]
5327; GFX8-NEXT:    s_mov_b64 s[4:5], 0
5328; GFX8-NEXT:  .LBB27_1: ; %atomicrmw.start
5329; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
5330; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5331; GFX8-NEXT:    v_add_f32_e32 v3, v4, v2
5332; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5333; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5334; GFX8-NEXT:    buffer_wbinvl1
5335; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
5336; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5337; GFX8-NEXT:    v_mov_b32_e32 v4, v3
5338; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5339; GFX8-NEXT:    s_cbranch_execnz .LBB27_1
5340; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
5341; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
5342; GFX8-NEXT:    s_setpc_b64 s[30:31]
5343;
5344; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
5345; GFX7:       ; %bb.0:
5346; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5347; GFX7-NEXT:    flat_load_dword v4, v[0:1]
5348; GFX7-NEXT:    s_mov_b64 s[4:5], 0
5349; GFX7-NEXT:  .LBB27_1: ; %atomicrmw.start
5350; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
5351; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5352; GFX7-NEXT:    v_add_f32_e32 v3, v4, v2
5353; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5354; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5355; GFX7-NEXT:    buffer_wbinvl1
5356; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
5357; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5358; GFX7-NEXT:    v_mov_b32_e32 v4, v3
5359; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5360; GFX7-NEXT:    s_cbranch_execnz .LBB27_1
5361; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
5362; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
5363; GFX7-NEXT:    s_setpc_b64 s[30:31]
5364  %result = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
5365  ret void
5366}
5367
; Returning f32 atomicrmw fadd through a flat pointer, syncscope("agent"),
; seq_cst, annotated with !amdgpu.no.fine.grained.memory and
; !amdgpu.no.remote.memory (no !amdgpu.ignore.denormal.mode on this variant).
; Per the checks below: GFX12, GFX940 and GFX11 select a returning
; flat_atomic_add_f32; GFX10, GFX90A, GFX908, GFX8 and GFX7 expand to a
; flat_load_dword followed by a flat_atomic_cmpswap retry loop.
; The CHECK lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
5368define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory(ptr %ptr, float %val) #0 {
5369; GFX12-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory:
5370; GFX12:       ; %bb.0:
5371; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
5372; GFX12-NEXT:    s_wait_expcnt 0x0
5373; GFX12-NEXT:    s_wait_samplecnt 0x0
5374; GFX12-NEXT:    s_wait_bvhcnt 0x0
5375; GFX12-NEXT:    s_wait_kmcnt 0x0
5376; GFX12-NEXT:    s_wait_storecnt 0x0
5377; GFX12-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
5378; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
5379; GFX12-NEXT:    global_inv scope:SCOPE_DEV
5380; GFX12-NEXT:    s_setpc_b64 s[30:31]
5381;
5382; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory:
5383; GFX940:       ; %bb.0:
5384; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5385; GFX940-NEXT:    buffer_wbl2 sc1
5386; GFX940-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 sc0
5387; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5388; GFX940-NEXT:    buffer_inv sc1
5389; GFX940-NEXT:    s_setpc_b64 s[30:31]
5390;
5391; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory:
5392; GFX11:       ; %bb.0:
5393; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5394; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
5395; GFX11-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 glc
5396; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5397; GFX11-NEXT:    buffer_gl1_inv
5398; GFX11-NEXT:    buffer_gl0_inv
5399; GFX11-NEXT:    s_setpc_b64 s[30:31]
5400;
5401; GFX10-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory:
5402; GFX10:       ; %bb.0:
5403; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5404; GFX10-NEXT:    flat_load_dword v3, v[0:1]
5405; GFX10-NEXT:    s_mov_b32 s4, 0
5406; GFX10-NEXT:  .LBB28_1: ; %atomicrmw.start
5407; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
5408; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5409; GFX10-NEXT:    v_mov_b32_e32 v4, v3
5410; GFX10-NEXT:    v_add_f32_e32 v3, v4, v2
5411; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
5412; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5413; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5414; GFX10-NEXT:    buffer_gl1_inv
5415; GFX10-NEXT:    buffer_gl0_inv
5416; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
5417; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
5418; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
5419; GFX10-NEXT:    s_cbranch_execnz .LBB28_1
5420; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
5421; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
5422; GFX10-NEXT:    v_mov_b32_e32 v0, v3
5423; GFX10-NEXT:    s_setpc_b64 s[30:31]
5424;
5425; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory:
5426; GFX90A:       ; %bb.0:
5427; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5428; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
5429; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
5430; GFX90A-NEXT:  .LBB28_1: ; %atomicrmw.start
5431; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
5432; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5433; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
5434; GFX90A-NEXT:    v_add_f32_e32 v4, v5, v2
5435; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
5436; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5437; GFX90A-NEXT:    buffer_wbinvl1
5438; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
5439; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5440; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5441; GFX90A-NEXT:    s_cbranch_execnz .LBB28_1
5442; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
5443; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
5444; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
5445; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5446;
5447; GFX908-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory:
5448; GFX908:       ; %bb.0:
5449; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5450; GFX908-NEXT:    flat_load_dword v3, v[0:1]
5451; GFX908-NEXT:    s_mov_b64 s[4:5], 0
5452; GFX908-NEXT:  .LBB28_1: ; %atomicrmw.start
5453; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
5454; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5455; GFX908-NEXT:    v_mov_b32_e32 v4, v3
5456; GFX908-NEXT:    v_add_f32_e32 v3, v4, v2
5457; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5458; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5459; GFX908-NEXT:    buffer_wbinvl1
5460; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
5461; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5462; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5463; GFX908-NEXT:    s_cbranch_execnz .LBB28_1
5464; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
5465; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
5466; GFX908-NEXT:    v_mov_b32_e32 v0, v3
5467; GFX908-NEXT:    s_setpc_b64 s[30:31]
5468;
5469; GFX8-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory:
5470; GFX8:       ; %bb.0:
5471; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5472; GFX8-NEXT:    flat_load_dword v3, v[0:1]
5473; GFX8-NEXT:    s_mov_b64 s[4:5], 0
5474; GFX8-NEXT:  .LBB28_1: ; %atomicrmw.start
5475; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
5476; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5477; GFX8-NEXT:    v_mov_b32_e32 v4, v3
5478; GFX8-NEXT:    v_add_f32_e32 v3, v4, v2
5479; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5480; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5481; GFX8-NEXT:    buffer_wbinvl1
5482; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
5483; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5484; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5485; GFX8-NEXT:    s_cbranch_execnz .LBB28_1
5486; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
5487; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
5488; GFX8-NEXT:    v_mov_b32_e32 v0, v3
5489; GFX8-NEXT:    s_setpc_b64 s[30:31]
5490;
5491; GFX7-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory:
5492; GFX7:       ; %bb.0:
5493; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5494; GFX7-NEXT:    flat_load_dword v3, v[0:1]
5495; GFX7-NEXT:    s_mov_b64 s[4:5], 0
5496; GFX7-NEXT:  .LBB28_1: ; %atomicrmw.start
5497; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
5498; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5499; GFX7-NEXT:    v_mov_b32_e32 v4, v3
5500; GFX7-NEXT:    v_add_f32_e32 v3, v4, v2
5501; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5502; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5503; GFX7-NEXT:    buffer_wbinvl1
5504; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
5505; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5506; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5507; GFX7-NEXT:    s_cbranch_execnz .LBB28_1
5508; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
5509; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
5510; GFX7-NEXT:    v_mov_b32_e32 v0, v3
5511; GFX7-NEXT:    s_setpc_b64 s[30:31]
; IR under test: the value produced by the atomicrmw is the function result.
5512  %result = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
5513  ret float %result
5514}
5515
; No-return f32 atomicrmw fadd through a flat pointer, syncscope("agent"),
; seq_cst, annotated with !amdgpu.no.fine.grained.memory and
; !amdgpu.no.remote.memory (no !amdgpu.ignore.denormal.mode on this variant).
; Per the checks below: GFX12, GFX940 and GFX11 select the non-returning form
; of flat_atomic_add_f32; GFX10, GFX90A, GFX908, GFX8 and GFX7 expand to a
; flat_load_dword followed by a flat_atomic_cmpswap retry loop.
; The CHECK lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
5516define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory(ptr %ptr, float %val) #0 {
5517; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory:
5518; GFX12:       ; %bb.0:
5519; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
5520; GFX12-NEXT:    s_wait_expcnt 0x0
5521; GFX12-NEXT:    s_wait_samplecnt 0x0
5522; GFX12-NEXT:    s_wait_bvhcnt 0x0
5523; GFX12-NEXT:    s_wait_kmcnt 0x0
5524; GFX12-NEXT:    s_wait_storecnt 0x0
5525; GFX12-NEXT:    flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV
5526; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
5527; GFX12-NEXT:    global_inv scope:SCOPE_DEV
5528; GFX12-NEXT:    s_setpc_b64 s[30:31]
5529;
5530; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory:
5531; GFX940:       ; %bb.0:
5532; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5533; GFX940-NEXT:    buffer_wbl2 sc1
5534; GFX940-NEXT:    flat_atomic_add_f32 v[0:1], v2
5535; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5536; GFX940-NEXT:    buffer_inv sc1
5537; GFX940-NEXT:    s_setpc_b64 s[30:31]
5538;
5539; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory:
5540; GFX11:       ; %bb.0:
5541; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5542; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
5543; GFX11-NEXT:    flat_atomic_add_f32 v[0:1], v2
5544; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
5545; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
5546; GFX11-NEXT:    buffer_gl1_inv
5547; GFX11-NEXT:    buffer_gl0_inv
5548; GFX11-NEXT:    s_setpc_b64 s[30:31]
5549;
5550; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory:
5551; GFX10:       ; %bb.0:
5552; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5553; GFX10-NEXT:    flat_load_dword v4, v[0:1]
5554; GFX10-NEXT:    s_mov_b32 s4, 0
5555; GFX10-NEXT:  .LBB29_1: ; %atomicrmw.start
5556; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
5557; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5558; GFX10-NEXT:    v_add_f32_e32 v3, v4, v2
5559; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
5560; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5561; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5562; GFX10-NEXT:    buffer_gl1_inv
5563; GFX10-NEXT:    buffer_gl0_inv
5564; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
5565; GFX10-NEXT:    v_mov_b32_e32 v4, v3
5566; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
5567; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
5568; GFX10-NEXT:    s_cbranch_execnz .LBB29_1
5569; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
5570; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
5571; GFX10-NEXT:    s_setpc_b64 s[30:31]
5572;
5573; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory:
5574; GFX90A:       ; %bb.0:
5575; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5576; GFX90A-NEXT:    flat_load_dword v5, v[0:1]
5577; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
5578; GFX90A-NEXT:  .LBB29_1: ; %atomicrmw.start
5579; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
5580; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5581; GFX90A-NEXT:    v_add_f32_e32 v4, v5, v2
5582; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
5583; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5584; GFX90A-NEXT:    buffer_wbinvl1
5585; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
5586; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5587; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
5588; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5589; GFX90A-NEXT:    s_cbranch_execnz .LBB29_1
5590; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
5591; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
5592; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5593;
5594; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory:
5595; GFX908:       ; %bb.0:
5596; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5597; GFX908-NEXT:    flat_load_dword v4, v[0:1]
5598; GFX908-NEXT:    s_mov_b64 s[4:5], 0
5599; GFX908-NEXT:  .LBB29_1: ; %atomicrmw.start
5600; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
5601; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5602; GFX908-NEXT:    v_add_f32_e32 v3, v4, v2
5603; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5604; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5605; GFX908-NEXT:    buffer_wbinvl1
5606; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
5607; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5608; GFX908-NEXT:    v_mov_b32_e32 v4, v3
5609; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5610; GFX908-NEXT:    s_cbranch_execnz .LBB29_1
5611; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
5612; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
5613; GFX908-NEXT:    s_setpc_b64 s[30:31]
5614;
5615; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory:
5616; GFX8:       ; %bb.0:
5617; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5618; GFX8-NEXT:    flat_load_dword v4, v[0:1]
5619; GFX8-NEXT:    s_mov_b64 s[4:5], 0
5620; GFX8-NEXT:  .LBB29_1: ; %atomicrmw.start
5621; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
5622; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5623; GFX8-NEXT:    v_add_f32_e32 v3, v4, v2
5624; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5625; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5626; GFX8-NEXT:    buffer_wbinvl1
5627; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
5628; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5629; GFX8-NEXT:    v_mov_b32_e32 v4, v3
5630; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5631; GFX8-NEXT:    s_cbranch_execnz .LBB29_1
5632; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
5633; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
5634; GFX8-NEXT:    s_setpc_b64 s[30:31]
5635;
5636; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory:
5637; GFX7:       ; %bb.0:
5638; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5639; GFX7-NEXT:    flat_load_dword v4, v[0:1]
5640; GFX7-NEXT:    s_mov_b64 s[4:5], 0
5641; GFX7-NEXT:  .LBB29_1: ; %atomicrmw.start
5642; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
5643; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5644; GFX7-NEXT:    v_add_f32_e32 v3, v4, v2
5645; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5646; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5647; GFX7-NEXT:    buffer_wbinvl1
5648; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
5649; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5650; GFX7-NEXT:    v_mov_b32_e32 v4, v3
5651; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5652; GFX7-NEXT:    s_cbranch_execnz .LBB29_1
5653; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
5654; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
5655; GFX7-NEXT:    s_setpc_b64 s[30:31]
; IR under test: %result is intentionally unused (the function returns void),
; which is what makes this the no-return variant of the atomic.
5656  %result = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
5657  ret void
5658}
5659
5660; --------------------------------------------------------------------
5661; double
5662; --------------------------------------------------------------------
5663
5664define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr %ptr, double %val) #0 {
5665; GFX12-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory:
5666; GFX12:       ; %bb.0:
5667; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
5668; GFX12-NEXT:    s_wait_expcnt 0x0
5669; GFX12-NEXT:    s_wait_samplecnt 0x0
5670; GFX12-NEXT:    s_wait_bvhcnt 0x0
5671; GFX12-NEXT:    s_wait_kmcnt 0x0
5672; GFX12-NEXT:    s_mov_b64 s[0:1], src_private_base
5673; GFX12-NEXT:    s_mov_b32 s0, exec_lo
5674; GFX12-NEXT:    ; implicit-def: $vgpr4_vgpr5
5675; GFX12-NEXT:    s_wait_alu 0xfffe
5676; GFX12-NEXT:    v_cmpx_ne_u32_e64 s1, v1
5677; GFX12-NEXT:    s_xor_b32 s0, exec_lo, s0
5678; GFX12-NEXT:    s_cbranch_execz .LBB30_4
5679; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
5680; GFX12-NEXT:    flat_load_b64 v[4:5], v[0:1]
5681; GFX12-NEXT:    s_mov_b32 s1, 0
5682; GFX12-NEXT:  .LBB30_2: ; %atomicrmw.start
5683; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
5684; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
5685; GFX12-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
5686; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5687; GFX12-NEXT:    v_add_f64_e32 v[4:5], v[6:7], v[2:3]
5688; GFX12-NEXT:    s_wait_storecnt 0x0
5689; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
5690; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
5691; GFX12-NEXT:    global_inv scope:SCOPE_DEV
5692; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
5693; GFX12-NEXT:    s_wait_alu 0xfffe
5694; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
5695; GFX12-NEXT:    s_wait_alu 0xfffe
5696; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
5697; GFX12-NEXT:    s_cbranch_execnz .LBB30_2
5698; GFX12-NEXT:  ; %bb.3: ; %Flow
5699; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
5700; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
5701; GFX12-NEXT:    ; implicit-def: $vgpr2_vgpr3
5702; GFX12-NEXT:  .LBB30_4: ; %Flow3
5703; GFX12-NEXT:    s_wait_alu 0xfffe
5704; GFX12-NEXT:    s_and_not1_saveexec_b32 s0, s0
5705; GFX12-NEXT:    s_cbranch_execz .LBB30_6
5706; GFX12-NEXT:  ; %bb.5: ; %atomicrmw.private
5707; GFX12-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
5708; GFX12-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
5709; GFX12-NEXT:    scratch_load_b64 v[4:5], v6, off
5710; GFX12-NEXT:    s_wait_loadcnt 0x0
5711; GFX12-NEXT:    v_add_f64_e32 v[0:1], v[4:5], v[2:3]
5712; GFX12-NEXT:    scratch_store_b64 v6, v[0:1], off
5713; GFX12-NEXT:  .LBB30_6: ; %atomicrmw.phi
5714; GFX12-NEXT:    s_wait_alu 0xfffe
5715; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
5716; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
5717; GFX12-NEXT:    s_wait_alu 0xfffe
5718; GFX12-NEXT:    s_setpc_b64 s[30:31]
5719;
5720; GFX940-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory:
5721; GFX940:       ; %bb.0:
5722; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5723; GFX940-NEXT:    v_mov_b32_e32 v5, v1
5724; GFX940-NEXT:    s_mov_b64 s[0:1], src_private_base
5725; GFX940-NEXT:    v_mov_b32_e32 v4, v0
5726; GFX940-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
5727; GFX940-NEXT:    ; implicit-def: $vgpr0_vgpr1
5728; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], vcc
5729; GFX940-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
5730; GFX940-NEXT:    s_cbranch_execnz .LBB30_3
5731; GFX940-NEXT:  ; %bb.1: ; %Flow
5732; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
5733; GFX940-NEXT:    s_cbranch_execnz .LBB30_4
5734; GFX940-NEXT:  .LBB30_2: ; %atomicrmw.phi
5735; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
5736; GFX940-NEXT:    s_setpc_b64 s[30:31]
5737; GFX940-NEXT:  .LBB30_3: ; %atomicrmw.global
5738; GFX940-NEXT:    buffer_wbl2 sc1
5739; GFX940-NEXT:    flat_atomic_add_f64 v[0:1], v[4:5], v[2:3] sc0
5740; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5741; GFX940-NEXT:    buffer_inv sc1
5742; GFX940-NEXT:    ; implicit-def: $vgpr4_vgpr5
5743; GFX940-NEXT:    ; implicit-def: $vgpr2_vgpr3
5744; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
5745; GFX940-NEXT:    s_cbranch_execz .LBB30_2
5746; GFX940-NEXT:  .LBB30_4: ; %atomicrmw.private
5747; GFX940-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
5748; GFX940-NEXT:    s_nop 1
5749; GFX940-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
5750; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v4, off
5751; GFX940-NEXT:    s_waitcnt vmcnt(0)
5752; GFX940-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
5753; GFX940-NEXT:    scratch_store_dwordx2 v4, v[2:3], off sc0 sc1
5754; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
5755; GFX940-NEXT:    s_waitcnt vmcnt(0)
5756; GFX940-NEXT:    s_setpc_b64 s[30:31]
5757;
5758; GFX11-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory:
5759; GFX11:       ; %bb.0:
5760; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5761; GFX11-NEXT:    s_mov_b64 s[0:1], src_private_base
5762; GFX11-NEXT:    s_mov_b32 s0, exec_lo
5763; GFX11-NEXT:    ; implicit-def: $vgpr4_vgpr5
5764; GFX11-NEXT:    v_cmpx_ne_u32_e64 s1, v1
5765; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
5766; GFX11-NEXT:    s_cbranch_execz .LBB30_4
5767; GFX11-NEXT:  ; %bb.1: ; %atomicrmw.global
5768; GFX11-NEXT:    flat_load_b64 v[4:5], v[0:1]
5769; GFX11-NEXT:    s_mov_b32 s1, 0
5770; GFX11-NEXT:  .LBB30_2: ; %atomicrmw.start
5771; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
5772; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5773; GFX11-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
5774; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5775; GFX11-NEXT:    v_add_f64 v[4:5], v[6:7], v[2:3]
5776; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
5777; GFX11-NEXT:    flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc
5778; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5779; GFX11-NEXT:    buffer_gl1_inv
5780; GFX11-NEXT:    buffer_gl0_inv
5781; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
5782; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
5783; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
5784; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
5785; GFX11-NEXT:    s_cbranch_execnz .LBB30_2
5786; GFX11-NEXT:  ; %bb.3: ; %Flow
5787; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
5788; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1
5789; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3
5790; GFX11-NEXT:  .LBB30_4: ; %Flow3
5791; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
5792; GFX11-NEXT:    s_cbranch_execz .LBB30_6
5793; GFX11-NEXT:  ; %bb.5: ; %atomicrmw.private
5794; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
5795; GFX11-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
5796; GFX11-NEXT:    scratch_load_b64 v[4:5], v6, off
5797; GFX11-NEXT:    s_waitcnt vmcnt(0)
5798; GFX11-NEXT:    v_add_f64 v[0:1], v[4:5], v[2:3]
5799; GFX11-NEXT:    scratch_store_b64 v6, v[0:1], off
5800; GFX11-NEXT:  .LBB30_6: ; %atomicrmw.phi
5801; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
5802; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
5803; GFX11-NEXT:    s_setpc_b64 s[30:31]
5804;
5805; GFX10-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory:
5806; GFX10:       ; %bb.0:
5807; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5808; GFX10-NEXT:    s_mov_b64 s[4:5], src_private_base
5809; GFX10-NEXT:    ; implicit-def: $vgpr4_vgpr5
5810; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v1
5811; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
5812; GFX10-NEXT:    s_xor_b32 s4, exec_lo, s4
5813; GFX10-NEXT:    s_cbranch_execz .LBB30_4
5814; GFX10-NEXT:  ; %bb.1: ; %atomicrmw.global
5815; GFX10-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
5816; GFX10-NEXT:    s_mov_b32 s5, 0
5817; GFX10-NEXT:  .LBB30_2: ; %atomicrmw.start
5818; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
5819; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5820; GFX10-NEXT:    v_mov_b32_e32 v7, v5
5821; GFX10-NEXT:    v_mov_b32_e32 v6, v4
5822; GFX10-NEXT:    v_add_f64 v[4:5], v[6:7], v[2:3]
5823; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
5824; GFX10-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
5825; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5826; GFX10-NEXT:    buffer_gl1_inv
5827; GFX10-NEXT:    buffer_gl0_inv
5828; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
5829; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
5830; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
5831; GFX10-NEXT:    s_cbranch_execnz .LBB30_2
5832; GFX10-NEXT:  ; %bb.3: ; %Flow
5833; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
5834; GFX10-NEXT:    ; implicit-def: $vgpr0_vgpr1
5835; GFX10-NEXT:    ; implicit-def: $vgpr2_vgpr3
5836; GFX10-NEXT:  .LBB30_4: ; %Flow3
5837; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
5838; GFX10-NEXT:    s_cbranch_execz .LBB30_6
5839; GFX10-NEXT:  ; %bb.5: ; %atomicrmw.private
5840; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
5841; GFX10-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
5842; GFX10-NEXT:    s_clause 0x1
5843; GFX10-NEXT:    buffer_load_dword v4, v6, s[0:3], 0 offen
5844; GFX10-NEXT:    buffer_load_dword v5, v6, s[0:3], 0 offen offset:4
5845; GFX10-NEXT:    s_waitcnt vmcnt(0)
5846; GFX10-NEXT:    v_add_f64 v[0:1], v[4:5], v[2:3]
5847; GFX10-NEXT:    buffer_store_dword v0, v6, s[0:3], 0 offen
5848; GFX10-NEXT:    buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
5849; GFX10-NEXT:  .LBB30_6: ; %atomicrmw.phi
5850; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
5851; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
5852; GFX10-NEXT:    v_mov_b32_e32 v0, v4
5853; GFX10-NEXT:    v_mov_b32_e32 v1, v5
5854; GFX10-NEXT:    s_setpc_b64 s[30:31]
5855;
5856; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory:
5857; GFX90A:       ; %bb.0:
5858; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5859; GFX90A-NEXT:    v_mov_b32_e32 v5, v1
5860; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
5861; GFX90A-NEXT:    v_mov_b32_e32 v4, v0
5862; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
5863; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
5864; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5865; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
5866; GFX90A-NEXT:    s_cbranch_execnz .LBB30_3
5867; GFX90A-NEXT:  ; %bb.1: ; %Flow
5868; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
5869; GFX90A-NEXT:    s_cbranch_execnz .LBB30_4
5870; GFX90A-NEXT:  .LBB30_2: ; %atomicrmw.phi
5871; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
5872; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5873; GFX90A-NEXT:  .LBB30_3: ; %atomicrmw.global
5874; GFX90A-NEXT:    flat_atomic_add_f64 v[0:1], v[4:5], v[2:3] glc
5875; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5876; GFX90A-NEXT:    buffer_wbinvl1
5877; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
5878; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
5879; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
5880; GFX90A-NEXT:    s_cbranch_execz .LBB30_2
5881; GFX90A-NEXT:  .LBB30_4: ; %atomicrmw.private
5882; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
5883; GFX90A-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
5884; GFX90A-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
5885; GFX90A-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
5886; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5887; GFX90A-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
5888; GFX90A-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
5889; GFX90A-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
5890; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
5891; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5892; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5893;
5894; GFX908-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory:
5895; GFX908:       ; %bb.0:
5896; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5897; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
5898; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
5899; GFX908-NEXT:    ; implicit-def: $vgpr4_vgpr5
5900; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5901; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
5902; GFX908-NEXT:    s_cbranch_execz .LBB30_4
5903; GFX908-NEXT:  ; %bb.1: ; %atomicrmw.global
5904; GFX908-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
5905; GFX908-NEXT:    s_mov_b64 s[6:7], 0
5906; GFX908-NEXT:  .LBB30_2: ; %atomicrmw.start
5907; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
5908; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5909; GFX908-NEXT:    v_mov_b32_e32 v7, v5
5910; GFX908-NEXT:    v_mov_b32_e32 v6, v4
5911; GFX908-NEXT:    v_add_f64 v[4:5], v[6:7], v[2:3]
5912; GFX908-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
5913; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5914; GFX908-NEXT:    buffer_wbinvl1
5915; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
5916; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
5917; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
5918; GFX908-NEXT:    s_cbranch_execnz .LBB30_2
5919; GFX908-NEXT:  ; %bb.3: ; %Flow
5920; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
5921; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
5922; GFX908-NEXT:    ; implicit-def: $vgpr2_vgpr3
5923; GFX908-NEXT:  .LBB30_4: ; %Flow3
5924; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
5925; GFX908-NEXT:    s_cbranch_execz .LBB30_6
5926; GFX908-NEXT:  ; %bb.5: ; %atomicrmw.private
5927; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
5928; GFX908-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc
5929; GFX908-NEXT:    buffer_load_dword v4, v6, s[0:3], 0 offen
5930; GFX908-NEXT:    buffer_load_dword v5, v6, s[0:3], 0 offen offset:4
5931; GFX908-NEXT:    s_waitcnt vmcnt(0)
5932; GFX908-NEXT:    v_add_f64 v[0:1], v[4:5], v[2:3]
5933; GFX908-NEXT:    buffer_store_dword v0, v6, s[0:3], 0 offen
5934; GFX908-NEXT:    buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
5935; GFX908-NEXT:  .LBB30_6: ; %atomicrmw.phi
5936; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
5937; GFX908-NEXT:    v_mov_b32_e32 v0, v4
5938; GFX908-NEXT:    v_mov_b32_e32 v1, v5
5939; GFX908-NEXT:    s_waitcnt vmcnt(0)
5940; GFX908-NEXT:    s_setpc_b64 s[30:31]
5941;
5942; GFX8-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory:
5943; GFX8:       ; %bb.0:
5944; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5945; GFX8-NEXT:    s_mov_b64 s[4:5], 0xc0
5946; GFX8-NEXT:    s_load_dword s4, s[4:5], 0x0
5947; GFX8-NEXT:    ; implicit-def: $vgpr4_vgpr5
5948; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5949; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v1
5950; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5951; GFX8-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
5952; GFX8-NEXT:    s_cbranch_execz .LBB30_4
5953; GFX8-NEXT:  ; %bb.1: ; %atomicrmw.global
5954; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
5955; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
5956; GFX8-NEXT:    flat_load_dword v5, v[4:5]
5957; GFX8-NEXT:    flat_load_dword v4, v[0:1]
5958; GFX8-NEXT:    s_mov_b64 s[6:7], 0
5959; GFX8-NEXT:  .LBB30_2: ; %atomicrmw.start
5960; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
5961; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5962; GFX8-NEXT:    v_mov_b32_e32 v7, v5
5963; GFX8-NEXT:    v_mov_b32_e32 v6, v4
5964; GFX8-NEXT:    v_add_f64 v[4:5], v[6:7], v[2:3]
5965; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
5966; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5967; GFX8-NEXT:    buffer_wbinvl1
5968; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
5969; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
5970; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
5971; GFX8-NEXT:    s_cbranch_execnz .LBB30_2
5972; GFX8-NEXT:  ; %bb.3: ; %Flow
5973; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
5974; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
5975; GFX8-NEXT:    ; implicit-def: $vgpr2_vgpr3
5976; GFX8-NEXT:  .LBB30_4: ; %Flow3
5977; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
5978; GFX8-NEXT:    s_cbranch_execz .LBB30_6
5979; GFX8-NEXT:  ; %bb.5: ; %atomicrmw.private
5980; GFX8-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
5981; GFX8-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc
5982; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 4, v6
5983; GFX8-NEXT:    buffer_load_dword v4, v6, s[0:3], 0 offen
5984; GFX8-NEXT:    buffer_load_dword v5, v7, s[0:3], 0 offen
5985; GFX8-NEXT:    s_waitcnt vmcnt(0)
5986; GFX8-NEXT:    v_add_f64 v[0:1], v[4:5], v[2:3]
5987; GFX8-NEXT:    buffer_store_dword v0, v6, s[0:3], 0 offen
5988; GFX8-NEXT:    buffer_store_dword v1, v7, s[0:3], 0 offen
5989; GFX8-NEXT:  .LBB30_6: ; %atomicrmw.phi
5990; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
5991; GFX8-NEXT:    v_mov_b32_e32 v0, v4
5992; GFX8-NEXT:    v_mov_b32_e32 v1, v5
5993; GFX8-NEXT:    s_waitcnt vmcnt(0)
5994; GFX8-NEXT:    s_setpc_b64 s[30:31]
5995;
5996; GFX7-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory:
5997; GFX7:       ; %bb.0:
5998; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5999; GFX7-NEXT:    s_mov_b64 s[4:5], 0xc0
6000; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
6001; GFX7-NEXT:    ; implicit-def: $vgpr4_vgpr5
6002; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6003; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v1
6004; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], vcc
6005; GFX7-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
6006; GFX7-NEXT:    s_cbranch_execz .LBB30_4
6007; GFX7-NEXT:  ; %bb.1: ; %atomicrmw.global
6008; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 4, v0
6009; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
6010; GFX7-NEXT:    flat_load_dword v5, v[4:5]
6011; GFX7-NEXT:    flat_load_dword v4, v[0:1]
6012; GFX7-NEXT:    s_mov_b64 s[6:7], 0
6013; GFX7-NEXT:  .LBB30_2: ; %atomicrmw.start
6014; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
6015; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6016; GFX7-NEXT:    v_mov_b32_e32 v7, v5
6017; GFX7-NEXT:    v_mov_b32_e32 v6, v4
6018; GFX7-NEXT:    v_add_f64 v[4:5], v[6:7], v[2:3]
6019; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
6020; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6021; GFX7-NEXT:    buffer_wbinvl1
6022; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
6023; GFX7-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
6024; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
6025; GFX7-NEXT:    s_cbranch_execnz .LBB30_2
6026; GFX7-NEXT:  ; %bb.3: ; %Flow
6027; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
6028; GFX7-NEXT:    ; implicit-def: $vgpr0_vgpr1
6029; GFX7-NEXT:    ; implicit-def: $vgpr2_vgpr3
6030; GFX7-NEXT:  .LBB30_4: ; %Flow3
6031; GFX7-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
6032; GFX7-NEXT:    s_cbranch_execz .LBB30_6
6033; GFX7-NEXT:  ; %bb.5: ; %atomicrmw.private
6034; GFX7-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
6035; GFX7-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc
6036; GFX7-NEXT:    v_add_i32_e32 v7, vcc, 4, v6
6037; GFX7-NEXT:    buffer_load_dword v4, v6, s[0:3], 0 offen
6038; GFX7-NEXT:    buffer_load_dword v5, v7, s[0:3], 0 offen
6039; GFX7-NEXT:    s_waitcnt vmcnt(0)
6040; GFX7-NEXT:    v_add_f64 v[0:1], v[4:5], v[2:3]
6041; GFX7-NEXT:    buffer_store_dword v0, v6, s[0:3], 0 offen
6042; GFX7-NEXT:    buffer_store_dword v1, v7, s[0:3], 0 offen
6043; GFX7-NEXT:  .LBB30_6: ; %atomicrmw.phi
6044; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
6045; GFX7-NEXT:    v_mov_b32_e32 v0, v4
6046; GFX7-NEXT:    v_mov_b32_e32 v1, v5
6047; GFX7-NEXT:    s_waitcnt vmcnt(0)
6048; GFX7-NEXT:    s_setpc_b64 s[30:31]
6049  %result = atomicrmw fadd ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
6050  ret double %result
6051}
6052
; Value-returning double `atomicrmw fadd` through a flat pointer at a positive
; constant offset (element 255 of double, i.e. 0x7f8 bytes), with agent
; syncscope, seq_cst ordering and !amdgpu.no.fine.grained.memory attached.
; Every expected sequence below compares the high dword of the offset address
; against the high half of src_private_base (for the tonga and hawaii run
; lines: against a dword loaded from address 0xc0) and then executes either the
; %atomicrmw.global or the %atomicrmw.private block.
; The global block is a single flat_atomic_add_f64 for the gfx940 and gfx90a
; run lines and a flat_atomic_cmpswap retry loop for all other run lines; the
; private block is a plain load / v_add_f64 / store sequence.
; The assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
6053define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, double %val) #0 {
6054; GFX12-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
6055; GFX12:       ; %bb.0:
6056; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
6057; GFX12-NEXT:    s_wait_expcnt 0x0
6058; GFX12-NEXT:    s_wait_samplecnt 0x0
6059; GFX12-NEXT:    s_wait_bvhcnt 0x0
6060; GFX12-NEXT:    s_wait_kmcnt 0x0
6061; GFX12-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7f8, v0
6062; GFX12-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
6063; GFX12-NEXT:    s_mov_b64 s[0:1], src_private_base
6064; GFX12-NEXT:    s_mov_b32 s0, exec_lo
6065; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
6066; GFX12-NEXT:    s_wait_alu 0xfffe
6067; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6068; GFX12-NEXT:    v_cmpx_ne_u32_e64 s1, v5
6069; GFX12-NEXT:    s_xor_b32 s0, exec_lo, s0
6070; GFX12-NEXT:    s_cbranch_execnz .LBB31_3
6071; GFX12-NEXT:  ; %bb.1: ; %Flow3
6072; GFX12-NEXT:    s_wait_alu 0xfffe
6073; GFX12-NEXT:    s_and_not1_saveexec_b32 s0, s0
6074; GFX12-NEXT:    s_cbranch_execnz .LBB31_6
6075; GFX12-NEXT:  .LBB31_2: ; %atomicrmw.phi
6076; GFX12-NEXT:    s_wait_alu 0xfffe
6077; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
6078; GFX12-NEXT:    s_wait_alu 0xfffe
6079; GFX12-NEXT:    s_setpc_b64 s[30:31]
6080; GFX12-NEXT:  .LBB31_3: ; %atomicrmw.global
6081; GFX12-NEXT:    flat_load_b64 v[0:1], v[4:5]
6082; GFX12-NEXT:    s_mov_b32 s1, 0
6083; GFX12-NEXT:  .LBB31_4: ; %atomicrmw.start
6084; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
6085; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
6086; GFX12-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
6087; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6088; GFX12-NEXT:    v_add_f64_e32 v[6:7], v[8:9], v[2:3]
6089; GFX12-NEXT:    s_wait_storecnt 0x0
6090; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
6091; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
6092; GFX12-NEXT:    global_inv scope:SCOPE_DEV
6093; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
6094; GFX12-NEXT:    s_wait_alu 0xfffe
6095; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
6096; GFX12-NEXT:    s_wait_alu 0xfffe
6097; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
6098; GFX12-NEXT:    s_cbranch_execnz .LBB31_4
6099; GFX12-NEXT:  ; %bb.5: ; %Flow
6100; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
6101; GFX12-NEXT:    ; implicit-def: $vgpr4_vgpr5
6102; GFX12-NEXT:    ; implicit-def: $vgpr2_vgpr3
6103; GFX12-NEXT:    s_and_not1_saveexec_b32 s0, s0
6104; GFX12-NEXT:    s_cbranch_execz .LBB31_2
6105; GFX12-NEXT:  .LBB31_6: ; %atomicrmw.private
6106; GFX12-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
6107; GFX12-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc_lo
6108; GFX12-NEXT:    scratch_load_b64 v[0:1], v4, off
6109; GFX12-NEXT:    s_wait_loadcnt 0x0
6110; GFX12-NEXT:    v_add_f64_e32 v[2:3], v[0:1], v[2:3]
6111; GFX12-NEXT:    scratch_store_b64 v4, v[2:3], off
6112; GFX12-NEXT:    s_wait_alu 0xfffe
6113; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
6114; GFX12-NEXT:    s_wait_alu 0xfffe
6115; GFX12-NEXT:    s_setpc_b64 s[30:31]
6116;
6117; GFX940-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
6118; GFX940:       ; %bb.0:
6119; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6120; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7f8
6121; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
6122; GFX940-NEXT:    s_mov_b64 s[0:1], src_private_base
6123; GFX940-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
6124; GFX940-NEXT:    ; implicit-def: $vgpr0_vgpr1
6125; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], vcc
6126; GFX940-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
6127; GFX940-NEXT:    s_cbranch_execnz .LBB31_3
6128; GFX940-NEXT:  ; %bb.1: ; %Flow
6129; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
6130; GFX940-NEXT:    s_cbranch_execnz .LBB31_4
6131; GFX940-NEXT:  .LBB31_2: ; %atomicrmw.phi
6132; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
6133; GFX940-NEXT:    s_setpc_b64 s[30:31]
6134; GFX940-NEXT:  .LBB31_3: ; %atomicrmw.global
6135; GFX940-NEXT:    buffer_wbl2 sc1
6136; GFX940-NEXT:    flat_atomic_add_f64 v[0:1], v[4:5], v[2:3] sc0
6137; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6138; GFX940-NEXT:    buffer_inv sc1
6139; GFX940-NEXT:    ; implicit-def: $vgpr4_vgpr5
6140; GFX940-NEXT:    ; implicit-def: $vgpr2_vgpr3
6141; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
6142; GFX940-NEXT:    s_cbranch_execz .LBB31_2
6143; GFX940-NEXT:  .LBB31_4: ; %atomicrmw.private
6144; GFX940-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
6145; GFX940-NEXT:    s_nop 1
6146; GFX940-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
6147; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v4, off
6148; GFX940-NEXT:    s_waitcnt vmcnt(0)
6149; GFX940-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
6150; GFX940-NEXT:    scratch_store_dwordx2 v4, v[2:3], off sc0 sc1
6151; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
6152; GFX940-NEXT:    s_waitcnt vmcnt(0)
6153; GFX940-NEXT:    s_setpc_b64 s[30:31]
6154;
6155; GFX11-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
6156; GFX11:       ; %bb.0:
6157; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6158; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7f8, v0
6159; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
6160; GFX11-NEXT:    s_mov_b64 s[0:1], src_private_base
6161; GFX11-NEXT:    s_mov_b32 s0, exec_lo
6162; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1
6163; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6164; GFX11-NEXT:    v_cmpx_ne_u32_e64 s1, v5
6165; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
6166; GFX11-NEXT:    s_cbranch_execnz .LBB31_3
6167; GFX11-NEXT:  ; %bb.1: ; %Flow3
6168; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
6169; GFX11-NEXT:    s_cbranch_execnz .LBB31_6
6170; GFX11-NEXT:  .LBB31_2: ; %atomicrmw.phi
6171; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
6172; GFX11-NEXT:    s_setpc_b64 s[30:31]
6173; GFX11-NEXT:  .LBB31_3: ; %atomicrmw.global
6174; GFX11-NEXT:    flat_load_b64 v[0:1], v[4:5]
6175; GFX11-NEXT:    s_mov_b32 s1, 0
6176; GFX11-NEXT:  .LBB31_4: ; %atomicrmw.start
6177; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
6178; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6179; GFX11-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
6180; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6181; GFX11-NEXT:    v_add_f64 v[6:7], v[8:9], v[2:3]
6182; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
6183; GFX11-NEXT:    flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc
6184; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6185; GFX11-NEXT:    buffer_gl1_inv
6186; GFX11-NEXT:    buffer_gl0_inv
6187; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
6188; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
6189; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
6190; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
6191; GFX11-NEXT:    s_cbranch_execnz .LBB31_4
6192; GFX11-NEXT:  ; %bb.5: ; %Flow
6193; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
6194; GFX11-NEXT:    ; implicit-def: $vgpr4_vgpr5
6195; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3
6196; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
6197; GFX11-NEXT:    s_cbranch_execz .LBB31_2
6198; GFX11-NEXT:  .LBB31_6: ; %atomicrmw.private
6199; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
6200; GFX11-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc_lo
6201; GFX11-NEXT:    scratch_load_b64 v[0:1], v4, off
6202; GFX11-NEXT:    s_waitcnt vmcnt(0)
6203; GFX11-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
6204; GFX11-NEXT:    scratch_store_b64 v4, v[2:3], off
6205; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
6206; GFX11-NEXT:    s_setpc_b64 s[30:31]
6207;
6208; GFX10-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
6209; GFX10:       ; %bb.0:
6210; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6211; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7f8, v0
6212; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
6213; GFX10-NEXT:    s_mov_b64 s[4:5], src_private_base
6214; GFX10-NEXT:    ; implicit-def: $vgpr0_vgpr1
6215; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v5
6216; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
6217; GFX10-NEXT:    s_xor_b32 s4, exec_lo, s4
6218; GFX10-NEXT:    s_cbranch_execnz .LBB31_3
6219; GFX10-NEXT:  ; %bb.1: ; %Flow3
6220; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
6221; GFX10-NEXT:    s_cbranch_execnz .LBB31_6
6222; GFX10-NEXT:  .LBB31_2: ; %atomicrmw.phi
6223; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
6224; GFX10-NEXT:    s_setpc_b64 s[30:31]
6225; GFX10-NEXT:  .LBB31_3: ; %atomicrmw.global
6226; GFX10-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
6227; GFX10-NEXT:    s_mov_b32 s5, 0
6228; GFX10-NEXT:  .LBB31_4: ; %atomicrmw.start
6229; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
6230; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6231; GFX10-NEXT:    v_mov_b32_e32 v9, v1
6232; GFX10-NEXT:    v_mov_b32_e32 v8, v0
6233; GFX10-NEXT:    v_add_f64 v[6:7], v[8:9], v[2:3]
6234; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
6235; GFX10-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
6236; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6237; GFX10-NEXT:    buffer_gl1_inv
6238; GFX10-NEXT:    buffer_gl0_inv
6239; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
6240; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
6241; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
6242; GFX10-NEXT:    s_cbranch_execnz .LBB31_4
6243; GFX10-NEXT:  ; %bb.5: ; %Flow
6244; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
6245; GFX10-NEXT:    ; implicit-def: $vgpr4_vgpr5
6246; GFX10-NEXT:    ; implicit-def: $vgpr2_vgpr3
6247; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
6248; GFX10-NEXT:    s_cbranch_execz .LBB31_2
6249; GFX10-NEXT:  .LBB31_6: ; %atomicrmw.private
6250; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
6251; GFX10-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc_lo
6252; GFX10-NEXT:    s_clause 0x1
6253; GFX10-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
6254; GFX10-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
6255; GFX10-NEXT:    s_waitcnt vmcnt(0)
6256; GFX10-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
6257; GFX10-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
6258; GFX10-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
6259; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
6260; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
6261; GFX10-NEXT:    s_setpc_b64 s[30:31]
6262;
6263; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
6264; GFX90A:       ; %bb.0:
6265; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6266; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7f8, v0
6267; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
6268; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
6269; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
6270; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
6271; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
6272; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
6273; GFX90A-NEXT:    s_cbranch_execnz .LBB31_3
6274; GFX90A-NEXT:  ; %bb.1: ; %Flow
6275; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
6276; GFX90A-NEXT:    s_cbranch_execnz .LBB31_4
6277; GFX90A-NEXT:  .LBB31_2: ; %atomicrmw.phi
6278; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
6279; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6280; GFX90A-NEXT:  .LBB31_3: ; %atomicrmw.global
6281; GFX90A-NEXT:    flat_atomic_add_f64 v[0:1], v[4:5], v[2:3] glc
6282; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6283; GFX90A-NEXT:    buffer_wbinvl1
6284; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
6285; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
6286; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
6287; GFX90A-NEXT:    s_cbranch_execz .LBB31_2
6288; GFX90A-NEXT:  .LBB31_4: ; %atomicrmw.private
6289; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
6290; GFX90A-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
6291; GFX90A-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
6292; GFX90A-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
6293; GFX90A-NEXT:    s_waitcnt vmcnt(0)
6294; GFX90A-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
6295; GFX90A-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
6296; GFX90A-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
6297; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
6298; GFX90A-NEXT:    s_waitcnt vmcnt(0)
6299; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6300;
6301; GFX908-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
6302; GFX908:       ; %bb.0:
6303; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6304; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7f8, v0
6305; GFX908-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
6306; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
6307; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
6308; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
6309; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
6310; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
6311; GFX908-NEXT:    s_cbranch_execnz .LBB31_3
6312; GFX908-NEXT:  ; %bb.1: ; %Flow3
6313; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
6314; GFX908-NEXT:    s_cbranch_execnz .LBB31_6
6315; GFX908-NEXT:  .LBB31_2: ; %atomicrmw.phi
6316; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
6317; GFX908-NEXT:    s_setpc_b64 s[30:31]
6318; GFX908-NEXT:  .LBB31_3: ; %atomicrmw.global
6319; GFX908-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
6320; GFX908-NEXT:    s_mov_b64 s[6:7], 0
6321; GFX908-NEXT:  .LBB31_4: ; %atomicrmw.start
6322; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
6323; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6324; GFX908-NEXT:    v_mov_b32_e32 v9, v1
6325; GFX908-NEXT:    v_mov_b32_e32 v8, v0
6326; GFX908-NEXT:    v_add_f64 v[6:7], v[8:9], v[2:3]
6327; GFX908-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
6328; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6329; GFX908-NEXT:    buffer_wbinvl1
6330; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
6331; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
6332; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
6333; GFX908-NEXT:    s_cbranch_execnz .LBB31_4
6334; GFX908-NEXT:  ; %bb.5: ; %Flow
6335; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
6336; GFX908-NEXT:    ; implicit-def: $vgpr4_vgpr5
6337; GFX908-NEXT:    ; implicit-def: $vgpr2_vgpr3
6338; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
6339; GFX908-NEXT:    s_cbranch_execz .LBB31_2
6340; GFX908-NEXT:  .LBB31_6: ; %atomicrmw.private
6341; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
6342; GFX908-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
6343; GFX908-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
6344; GFX908-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
6345; GFX908-NEXT:    s_waitcnt vmcnt(0)
6346; GFX908-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
6347; GFX908-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
6348; GFX908-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
6349; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
6350; GFX908-NEXT:    s_waitcnt vmcnt(0)
6351; GFX908-NEXT:    s_setpc_b64 s[30:31]
6352;
6353; GFX8-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
6354; GFX8:       ; %bb.0:
6355; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6356; GFX8-NEXT:    s_mov_b64 s[4:5], 0xc0
6357; GFX8-NEXT:    s_load_dword s4, s[4:5], 0x0
6358; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7f8, v0
6359; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
6360; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6361; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v5
6362; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
6363; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
6364; GFX8-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
6365; GFX8-NEXT:    s_cbranch_execnz .LBB31_3
6366; GFX8-NEXT:  ; %bb.1: ; %Flow3
6367; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
6368; GFX8-NEXT:    s_cbranch_execnz .LBB31_6
6369; GFX8-NEXT:  .LBB31_2: ; %atomicrmw.phi
6370; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
6371; GFX8-NEXT:    s_setpc_b64 s[30:31]
6372; GFX8-NEXT:  .LBB31_3: ; %atomicrmw.global
6373; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v4
6374; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
6375; GFX8-NEXT:    flat_load_dword v1, v[0:1]
6376; GFX8-NEXT:    flat_load_dword v0, v[4:5]
6377; GFX8-NEXT:    s_mov_b64 s[6:7], 0
6378; GFX8-NEXT:  .LBB31_4: ; %atomicrmw.start
6379; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
6380; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6381; GFX8-NEXT:    v_mov_b32_e32 v9, v1
6382; GFX8-NEXT:    v_mov_b32_e32 v8, v0
6383; GFX8-NEXT:    v_add_f64 v[6:7], v[8:9], v[2:3]
6384; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
6385; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6386; GFX8-NEXT:    buffer_wbinvl1
6387; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
6388; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
6389; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
6390; GFX8-NEXT:    s_cbranch_execnz .LBB31_4
6391; GFX8-NEXT:  ; %bb.5: ; %Flow
6392; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
6393; GFX8-NEXT:    ; implicit-def: $vgpr4_vgpr5
6394; GFX8-NEXT:    ; implicit-def: $vgpr2_vgpr3
6395; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
6396; GFX8-NEXT:    s_cbranch_execz .LBB31_2
6397; GFX8-NEXT:  .LBB31_6: ; %atomicrmw.private
6398; GFX8-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
6399; GFX8-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
6400; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 4, v4
6401; GFX8-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
6402; GFX8-NEXT:    buffer_load_dword v1, v5, s[0:3], 0 offen
6403; GFX8-NEXT:    s_waitcnt vmcnt(0)
6404; GFX8-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
6405; GFX8-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
6406; GFX8-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
6407; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
6408; GFX8-NEXT:    s_waitcnt vmcnt(0)
6409; GFX8-NEXT:    s_setpc_b64 s[30:31]
6410;
6411; GFX7-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
6412; GFX7:       ; %bb.0:
6413; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6414; GFX7-NEXT:    s_mov_b64 s[4:5], 0xc0
6415; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
6416; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7f8, v0
6417; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
6418; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6419; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v5
6420; GFX7-NEXT:    ; implicit-def: $vgpr0_vgpr1
6421; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], vcc
6422; GFX7-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
6423; GFX7-NEXT:    s_cbranch_execnz .LBB31_3
6424; GFX7-NEXT:  ; %bb.1: ; %Flow3
6425; GFX7-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
6426; GFX7-NEXT:    s_cbranch_execnz .LBB31_6
6427; GFX7-NEXT:  .LBB31_2: ; %atomicrmw.phi
6428; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
6429; GFX7-NEXT:    s_setpc_b64 s[30:31]
6430; GFX7-NEXT:  .LBB31_3: ; %atomicrmw.global
6431; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 4, v4
6432; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
6433; GFX7-NEXT:    flat_load_dword v1, v[0:1]
6434; GFX7-NEXT:    flat_load_dword v0, v[4:5]
6435; GFX7-NEXT:    s_mov_b64 s[6:7], 0
6436; GFX7-NEXT:  .LBB31_4: ; %atomicrmw.start
6437; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
6438; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6439; GFX7-NEXT:    v_mov_b32_e32 v9, v1
6440; GFX7-NEXT:    v_mov_b32_e32 v8, v0
6441; GFX7-NEXT:    v_add_f64 v[6:7], v[8:9], v[2:3]
6442; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
6443; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6444; GFX7-NEXT:    buffer_wbinvl1
6445; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
6446; GFX7-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
6447; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
6448; GFX7-NEXT:    s_cbranch_execnz .LBB31_4
6449; GFX7-NEXT:  ; %bb.5: ; %Flow
6450; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
6451; GFX7-NEXT:    ; implicit-def: $vgpr4_vgpr5
6452; GFX7-NEXT:    ; implicit-def: $vgpr2_vgpr3
6453; GFX7-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
6454; GFX7-NEXT:    s_cbranch_execz .LBB31_2
6455; GFX7-NEXT:  .LBB31_6: ; %atomicrmw.private
6456; GFX7-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
6457; GFX7-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
6458; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 4, v4
6459; GFX7-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
6460; GFX7-NEXT:    buffer_load_dword v1, v5, s[0:3], 0 offen
6461; GFX7-NEXT:    s_waitcnt vmcnt(0)
6462; GFX7-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
6463; GFX7-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
6464; GFX7-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
6465; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
6466; GFX7-NEXT:    s_waitcnt vmcnt(0)
6467; GFX7-NEXT:    s_setpc_b64 s[30:31]
6468  %gep = getelementptr double, ptr %ptr, i64 255 ; 255 doubles * 8 bytes = 0x7f8, the constant added to the pointer in the expected output above
6469  %result = atomicrmw fadd ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
6470  ret double %result
6471}
6472
; Test: value-returning f64 `atomicrmw fadd` through a flat pointer, agent
; syncscope, seq_cst, tagged !amdgpu.no.fine.grained.memory, addressed at
; element index -256 (the 0xfffff800 / -1 addend pair in the checks below).
;
; What the expected output shows for every target:
;   * the high dword of the adjusted address (v5) is compared with a
;     private-base value (src_private_base; on GFX8 and GFX7 a dword fetched
;     with s_load_dword - presumably the same aperture value, confirm), and
;     control splits into %atomicrmw.global and %atomicrmw.private;
;   * on the global side GFX940 and GFX90A emit flat_atomic_add_f64, while the
;     other targets emit a flat_atomic_cmpswap loop (%atomicrmw.start);
;   * the private side is a non-atomic load, v_add_f64, store sequence whose
;     address is replaced by -1 when the 64-bit pointer is null.
;
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand editing.
6473define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, double %val) #0 {
6474; GFX12-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
6475; GFX12:       ; %bb.0:
6476; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
6477; GFX12-NEXT:    s_wait_expcnt 0x0
6478; GFX12-NEXT:    s_wait_samplecnt 0x0
6479; GFX12-NEXT:    s_wait_bvhcnt 0x0
6480; GFX12-NEXT:    s_wait_kmcnt 0x0
6481; GFX12-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
6482; GFX12-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
6483; GFX12-NEXT:    s_mov_b64 s[0:1], src_private_base
6484; GFX12-NEXT:    s_mov_b32 s0, exec_lo
6485; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
6486; GFX12-NEXT:    s_wait_alu 0xfffe
6487; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6488; GFX12-NEXT:    v_cmpx_ne_u32_e64 s1, v5
6489; GFX12-NEXT:    s_xor_b32 s0, exec_lo, s0
6490; GFX12-NEXT:    s_cbranch_execnz .LBB32_3
6491; GFX12-NEXT:  ; %bb.1: ; %Flow3
6492; GFX12-NEXT:    s_wait_alu 0xfffe
6493; GFX12-NEXT:    s_and_not1_saveexec_b32 s0, s0
6494; GFX12-NEXT:    s_cbranch_execnz .LBB32_6
6495; GFX12-NEXT:  .LBB32_2: ; %atomicrmw.phi
6496; GFX12-NEXT:    s_wait_alu 0xfffe
6497; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
6498; GFX12-NEXT:    s_wait_alu 0xfffe
6499; GFX12-NEXT:    s_setpc_b64 s[30:31]
6500; GFX12-NEXT:  .LBB32_3: ; %atomicrmw.global
6501; GFX12-NEXT:    flat_load_b64 v[0:1], v[4:5]
6502; GFX12-NEXT:    s_mov_b32 s1, 0
6503; GFX12-NEXT:  .LBB32_4: ; %atomicrmw.start
6504; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
6505; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
6506; GFX12-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
6507; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6508; GFX12-NEXT:    v_add_f64_e32 v[6:7], v[8:9], v[2:3]
6509; GFX12-NEXT:    s_wait_storecnt 0x0
6510; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
6511; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
6512; GFX12-NEXT:    global_inv scope:SCOPE_DEV
6513; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
6514; GFX12-NEXT:    s_wait_alu 0xfffe
6515; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
6516; GFX12-NEXT:    s_wait_alu 0xfffe
6517; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
6518; GFX12-NEXT:    s_cbranch_execnz .LBB32_4
6519; GFX12-NEXT:  ; %bb.5: ; %Flow
6520; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
6521; GFX12-NEXT:    ; implicit-def: $vgpr4_vgpr5
6522; GFX12-NEXT:    ; implicit-def: $vgpr2_vgpr3
6523; GFX12-NEXT:    s_and_not1_saveexec_b32 s0, s0
6524; GFX12-NEXT:    s_cbranch_execz .LBB32_2
6525; GFX12-NEXT:  .LBB32_6: ; %atomicrmw.private
6526; GFX12-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
6527; GFX12-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc_lo
6528; GFX12-NEXT:    scratch_load_b64 v[0:1], v4, off
6529; GFX12-NEXT:    s_wait_loadcnt 0x0
6530; GFX12-NEXT:    v_add_f64_e32 v[2:3], v[0:1], v[2:3]
6531; GFX12-NEXT:    scratch_store_b64 v4, v[2:3], off
6532; GFX12-NEXT:    s_wait_alu 0xfffe
6533; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
6534; GFX12-NEXT:    s_wait_alu 0xfffe
6535; GFX12-NEXT:    s_setpc_b64 s[30:31]
6536;
6537; GFX940-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
6538; GFX940:       ; %bb.0:
6539; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6540; GFX940-NEXT:    s_movk_i32 s0, 0xf800
6541; GFX940-NEXT:    s_mov_b32 s1, -1
6542; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
6543; GFX940-NEXT:    s_mov_b64 s[0:1], src_private_base
6544; GFX940-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
6545; GFX940-NEXT:    ; implicit-def: $vgpr0_vgpr1
6546; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], vcc
6547; GFX940-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
6548; GFX940-NEXT:    s_cbranch_execnz .LBB32_3
6549; GFX940-NEXT:  ; %bb.1: ; %Flow
6550; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
6551; GFX940-NEXT:    s_cbranch_execnz .LBB32_4
6552; GFX940-NEXT:  .LBB32_2: ; %atomicrmw.phi
6553; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
6554; GFX940-NEXT:    s_setpc_b64 s[30:31]
6555; GFX940-NEXT:  .LBB32_3: ; %atomicrmw.global
6556; GFX940-NEXT:    buffer_wbl2 sc1
6557; GFX940-NEXT:    flat_atomic_add_f64 v[0:1], v[4:5], v[2:3] sc0
6558; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6559; GFX940-NEXT:    buffer_inv sc1
6560; GFX940-NEXT:    ; implicit-def: $vgpr4_vgpr5
6561; GFX940-NEXT:    ; implicit-def: $vgpr2_vgpr3
6562; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
6563; GFX940-NEXT:    s_cbranch_execz .LBB32_2
6564; GFX940-NEXT:  .LBB32_4: ; %atomicrmw.private
6565; GFX940-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
6566; GFX940-NEXT:    s_nop 1
6567; GFX940-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
6568; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v4, off
6569; GFX940-NEXT:    s_waitcnt vmcnt(0)
6570; GFX940-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
6571; GFX940-NEXT:    scratch_store_dwordx2 v4, v[2:3], off sc0 sc1
6572; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
6573; GFX940-NEXT:    s_waitcnt vmcnt(0)
6574; GFX940-NEXT:    s_setpc_b64 s[30:31]
6575;
6576; GFX11-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
6577; GFX11:       ; %bb.0:
6578; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6579; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
6580; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
6581; GFX11-NEXT:    s_mov_b64 s[0:1], src_private_base
6582; GFX11-NEXT:    s_mov_b32 s0, exec_lo
6583; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1
6584; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6585; GFX11-NEXT:    v_cmpx_ne_u32_e64 s1, v5
6586; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
6587; GFX11-NEXT:    s_cbranch_execnz .LBB32_3
6588; GFX11-NEXT:  ; %bb.1: ; %Flow3
6589; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
6590; GFX11-NEXT:    s_cbranch_execnz .LBB32_6
6591; GFX11-NEXT:  .LBB32_2: ; %atomicrmw.phi
6592; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
6593; GFX11-NEXT:    s_setpc_b64 s[30:31]
6594; GFX11-NEXT:  .LBB32_3: ; %atomicrmw.global
6595; GFX11-NEXT:    flat_load_b64 v[0:1], v[4:5]
6596; GFX11-NEXT:    s_mov_b32 s1, 0
6597; GFX11-NEXT:  .LBB32_4: ; %atomicrmw.start
6598; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
6599; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6600; GFX11-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
6601; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6602; GFX11-NEXT:    v_add_f64 v[6:7], v[8:9], v[2:3]
6603; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
6604; GFX11-NEXT:    flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc
6605; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6606; GFX11-NEXT:    buffer_gl1_inv
6607; GFX11-NEXT:    buffer_gl0_inv
6608; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
6609; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
6610; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
6611; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
6612; GFX11-NEXT:    s_cbranch_execnz .LBB32_4
6613; GFX11-NEXT:  ; %bb.5: ; %Flow
6614; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
6615; GFX11-NEXT:    ; implicit-def: $vgpr4_vgpr5
6616; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3
6617; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
6618; GFX11-NEXT:    s_cbranch_execz .LBB32_2
6619; GFX11-NEXT:  .LBB32_6: ; %atomicrmw.private
6620; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
6621; GFX11-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc_lo
6622; GFX11-NEXT:    scratch_load_b64 v[0:1], v4, off
6623; GFX11-NEXT:    s_waitcnt vmcnt(0)
6624; GFX11-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
6625; GFX11-NEXT:    scratch_store_b64 v4, v[2:3], off
6626; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
6627; GFX11-NEXT:    s_setpc_b64 s[30:31]
6628;
6629; GFX10-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
6630; GFX10:       ; %bb.0:
6631; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6632; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
6633; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
6634; GFX10-NEXT:    s_mov_b64 s[4:5], src_private_base
6635; GFX10-NEXT:    ; implicit-def: $vgpr0_vgpr1
6636; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v5
6637; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
6638; GFX10-NEXT:    s_xor_b32 s4, exec_lo, s4
6639; GFX10-NEXT:    s_cbranch_execnz .LBB32_3
6640; GFX10-NEXT:  ; %bb.1: ; %Flow3
6641; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
6642; GFX10-NEXT:    s_cbranch_execnz .LBB32_6
6643; GFX10-NEXT:  .LBB32_2: ; %atomicrmw.phi
6644; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
6645; GFX10-NEXT:    s_setpc_b64 s[30:31]
6646; GFX10-NEXT:  .LBB32_3: ; %atomicrmw.global
6647; GFX10-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
6648; GFX10-NEXT:    s_mov_b32 s5, 0
6649; GFX10-NEXT:  .LBB32_4: ; %atomicrmw.start
6650; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
6651; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6652; GFX10-NEXT:    v_mov_b32_e32 v9, v1
6653; GFX10-NEXT:    v_mov_b32_e32 v8, v0
6654; GFX10-NEXT:    v_add_f64 v[6:7], v[8:9], v[2:3]
6655; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
6656; GFX10-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
6657; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6658; GFX10-NEXT:    buffer_gl1_inv
6659; GFX10-NEXT:    buffer_gl0_inv
6660; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
6661; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
6662; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
6663; GFX10-NEXT:    s_cbranch_execnz .LBB32_4
6664; GFX10-NEXT:  ; %bb.5: ; %Flow
6665; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
6666; GFX10-NEXT:    ; implicit-def: $vgpr4_vgpr5
6667; GFX10-NEXT:    ; implicit-def: $vgpr2_vgpr3
6668; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
6669; GFX10-NEXT:    s_cbranch_execz .LBB32_2
6670; GFX10-NEXT:  .LBB32_6: ; %atomicrmw.private
6671; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
6672; GFX10-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc_lo
6673; GFX10-NEXT:    s_clause 0x1
6674; GFX10-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
6675; GFX10-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
6676; GFX10-NEXT:    s_waitcnt vmcnt(0)
6677; GFX10-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
6678; GFX10-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
6679; GFX10-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
6680; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
6681; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
6682; GFX10-NEXT:    s_setpc_b64 s[30:31]
6683;
6684; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
6685; GFX90A:       ; %bb.0:
6686; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6687; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
6688; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
6689; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
6690; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
6691; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
6692; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
6693; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
6694; GFX90A-NEXT:    s_cbranch_execnz .LBB32_3
6695; GFX90A-NEXT:  ; %bb.1: ; %Flow
6696; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
6697; GFX90A-NEXT:    s_cbranch_execnz .LBB32_4
6698; GFX90A-NEXT:  .LBB32_2: ; %atomicrmw.phi
6699; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
6700; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6701; GFX90A-NEXT:  .LBB32_3: ; %atomicrmw.global
6702; GFX90A-NEXT:    flat_atomic_add_f64 v[0:1], v[4:5], v[2:3] glc
6703; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6704; GFX90A-NEXT:    buffer_wbinvl1
6705; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
6706; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
6707; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
6708; GFX90A-NEXT:    s_cbranch_execz .LBB32_2
6709; GFX90A-NEXT:  .LBB32_4: ; %atomicrmw.private
6710; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
6711; GFX90A-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
6712; GFX90A-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
6713; GFX90A-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
6714; GFX90A-NEXT:    s_waitcnt vmcnt(0)
6715; GFX90A-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
6716; GFX90A-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
6717; GFX90A-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
6718; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
6719; GFX90A-NEXT:    s_waitcnt vmcnt(0)
6720; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6721;
6722; GFX908-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
6723; GFX908:       ; %bb.0:
6724; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6725; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
6726; GFX908-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
6727; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
6728; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
6729; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
6730; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
6731; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
6732; GFX908-NEXT:    s_cbranch_execnz .LBB32_3
6733; GFX908-NEXT:  ; %bb.1: ; %Flow3
6734; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
6735; GFX908-NEXT:    s_cbranch_execnz .LBB32_6
6736; GFX908-NEXT:  .LBB32_2: ; %atomicrmw.phi
6737; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
6738; GFX908-NEXT:    s_setpc_b64 s[30:31]
6739; GFX908-NEXT:  .LBB32_3: ; %atomicrmw.global
6740; GFX908-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
6741; GFX908-NEXT:    s_mov_b64 s[6:7], 0
6742; GFX908-NEXT:  .LBB32_4: ; %atomicrmw.start
6743; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
6744; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6745; GFX908-NEXT:    v_mov_b32_e32 v9, v1
6746; GFX908-NEXT:    v_mov_b32_e32 v8, v0
6747; GFX908-NEXT:    v_add_f64 v[6:7], v[8:9], v[2:3]
6748; GFX908-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
6749; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6750; GFX908-NEXT:    buffer_wbinvl1
6751; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
6752; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
6753; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
6754; GFX908-NEXT:    s_cbranch_execnz .LBB32_4
6755; GFX908-NEXT:  ; %bb.5: ; %Flow
6756; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
6757; GFX908-NEXT:    ; implicit-def: $vgpr4_vgpr5
6758; GFX908-NEXT:    ; implicit-def: $vgpr2_vgpr3
6759; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
6760; GFX908-NEXT:    s_cbranch_execz .LBB32_2
6761; GFX908-NEXT:  .LBB32_6: ; %atomicrmw.private
6762; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
6763; GFX908-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
6764; GFX908-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
6765; GFX908-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
6766; GFX908-NEXT:    s_waitcnt vmcnt(0)
6767; GFX908-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
6768; GFX908-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
6769; GFX908-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
6770; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
6771; GFX908-NEXT:    s_waitcnt vmcnt(0)
6772; GFX908-NEXT:    s_setpc_b64 s[30:31]
6773;
6774; GFX8-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
6775; GFX8:       ; %bb.0:
6776; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6777; GFX8-NEXT:    s_mov_b64 s[4:5], 0xc0
6778; GFX8-NEXT:    s_load_dword s4, s[4:5], 0x0
6779; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xfffff800, v0
6780; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, -1, v1, vcc
6781; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6782; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v5
6783; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
6784; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
6785; GFX8-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
6786; GFX8-NEXT:    s_cbranch_execnz .LBB32_3
6787; GFX8-NEXT:  ; %bb.1: ; %Flow3
6788; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
6789; GFX8-NEXT:    s_cbranch_execnz .LBB32_6
6790; GFX8-NEXT:  .LBB32_2: ; %atomicrmw.phi
6791; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
6792; GFX8-NEXT:    s_setpc_b64 s[30:31]
6793; GFX8-NEXT:  .LBB32_3: ; %atomicrmw.global
6794; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v4
6795; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
6796; GFX8-NEXT:    flat_load_dword v1, v[0:1]
6797; GFX8-NEXT:    flat_load_dword v0, v[4:5]
6798; GFX8-NEXT:    s_mov_b64 s[6:7], 0
6799; GFX8-NEXT:  .LBB32_4: ; %atomicrmw.start
6800; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
6801; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6802; GFX8-NEXT:    v_mov_b32_e32 v9, v1
6803; GFX8-NEXT:    v_mov_b32_e32 v8, v0
6804; GFX8-NEXT:    v_add_f64 v[6:7], v[8:9], v[2:3]
6805; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
6806; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6807; GFX8-NEXT:    buffer_wbinvl1
6808; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
6809; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
6810; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
6811; GFX8-NEXT:    s_cbranch_execnz .LBB32_4
6812; GFX8-NEXT:  ; %bb.5: ; %Flow
6813; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
6814; GFX8-NEXT:    ; implicit-def: $vgpr4_vgpr5
6815; GFX8-NEXT:    ; implicit-def: $vgpr2_vgpr3
6816; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
6817; GFX8-NEXT:    s_cbranch_execz .LBB32_2
6818; GFX8-NEXT:  .LBB32_6: ; %atomicrmw.private
6819; GFX8-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
6820; GFX8-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
6821; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 4, v4
6822; GFX8-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
6823; GFX8-NEXT:    buffer_load_dword v1, v5, s[0:3], 0 offen
6824; GFX8-NEXT:    s_waitcnt vmcnt(0)
6825; GFX8-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
6826; GFX8-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
6827; GFX8-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
6828; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
6829; GFX8-NEXT:    s_waitcnt vmcnt(0)
6830; GFX8-NEXT:    s_setpc_b64 s[30:31]
6831;
6832; GFX7-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
6833; GFX7:       ; %bb.0:
6834; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6835; GFX7-NEXT:    s_mov_b64 s[4:5], 0xc0
6836; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
6837; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
6838; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, -1, v1, vcc
6839; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6840; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v5
6841; GFX7-NEXT:    ; implicit-def: $vgpr0_vgpr1
6842; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], vcc
6843; GFX7-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
6844; GFX7-NEXT:    s_cbranch_execnz .LBB32_3
6845; GFX7-NEXT:  ; %bb.1: ; %Flow3
6846; GFX7-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
6847; GFX7-NEXT:    s_cbranch_execnz .LBB32_6
6848; GFX7-NEXT:  .LBB32_2: ; %atomicrmw.phi
6849; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
6850; GFX7-NEXT:    s_setpc_b64 s[30:31]
6851; GFX7-NEXT:  .LBB32_3: ; %atomicrmw.global
6852; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 4, v4
6853; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
6854; GFX7-NEXT:    flat_load_dword v1, v[0:1]
6855; GFX7-NEXT:    flat_load_dword v0, v[4:5]
6856; GFX7-NEXT:    s_mov_b64 s[6:7], 0
6857; GFX7-NEXT:  .LBB32_4: ; %atomicrmw.start
6858; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
6859; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6860; GFX7-NEXT:    v_mov_b32_e32 v9, v1
6861; GFX7-NEXT:    v_mov_b32_e32 v8, v0
6862; GFX7-NEXT:    v_add_f64 v[6:7], v[8:9], v[2:3]
6863; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
6864; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6865; GFX7-NEXT:    buffer_wbinvl1
6866; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
6867; GFX7-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
6868; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
6869; GFX7-NEXT:    s_cbranch_execnz .LBB32_4
6870; GFX7-NEXT:  ; %bb.5: ; %Flow
6871; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
6872; GFX7-NEXT:    ; implicit-def: $vgpr4_vgpr5
6873; GFX7-NEXT:    ; implicit-def: $vgpr2_vgpr3
6874; GFX7-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
6875; GFX7-NEXT:    s_cbranch_execz .LBB32_2
6876; GFX7-NEXT:  .LBB32_6: ; %atomicrmw.private
6877; GFX7-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
6878; GFX7-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
6879; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 4, v4
6880; GFX7-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
6881; GFX7-NEXT:    buffer_load_dword v1, v5, s[0:3], 0 offen
6882; GFX7-NEXT:    s_waitcnt vmcnt(0)
6883; GFX7-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
6884; GFX7-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
6885; GFX7-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
6886; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
6887; GFX7-NEXT:    s_waitcnt vmcnt(0)
6888; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; Element index -256 of an 8-byte double puts the access 2048 bytes below
  ; %ptr; the value returned is the atomicrmw result.
6889  %gep = getelementptr double, ptr %ptr, i64 -256
6890  %result = atomicrmw fadd ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
6891  ret double %result
6892}
6893
6894define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr %ptr, double %val) #0 {
6895; GFX12-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory:
6896; GFX12:       ; %bb.0:
6897; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
6898; GFX12-NEXT:    s_wait_expcnt 0x0
6899; GFX12-NEXT:    s_wait_samplecnt 0x0
6900; GFX12-NEXT:    s_wait_bvhcnt 0x0
6901; GFX12-NEXT:    s_wait_kmcnt 0x0
6902; GFX12-NEXT:    s_mov_b64 s[0:1], src_private_base
6903; GFX12-NEXT:    s_mov_b32 s0, exec_lo
6904; GFX12-NEXT:    s_wait_alu 0xfffe
6905; GFX12-NEXT:    v_cmpx_ne_u32_e64 s1, v1
6906; GFX12-NEXT:    s_xor_b32 s0, exec_lo, s0
6907; GFX12-NEXT:    s_cbranch_execnz .LBB33_3
6908; GFX12-NEXT:  ; %bb.1: ; %Flow3
6909; GFX12-NEXT:    s_wait_alu 0xfffe
6910; GFX12-NEXT:    s_and_not1_saveexec_b32 s0, s0
6911; GFX12-NEXT:    s_cbranch_execnz .LBB33_6
6912; GFX12-NEXT:  .LBB33_2: ; %atomicrmw.phi
6913; GFX12-NEXT:    s_wait_alu 0xfffe
6914; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
6915; GFX12-NEXT:    s_wait_alu 0xfffe
6916; GFX12-NEXT:    s_setpc_b64 s[30:31]
6917; GFX12-NEXT:  .LBB33_3: ; %atomicrmw.global
6918; GFX12-NEXT:    flat_load_b64 v[6:7], v[0:1]
6919; GFX12-NEXT:    s_mov_b32 s1, 0
6920; GFX12-NEXT:  .LBB33_4: ; %atomicrmw.start
6921; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
6922; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
6923; GFX12-NEXT:    v_add_f64_e32 v[4:5], v[6:7], v[2:3]
6924; GFX12-NEXT:    s_wait_storecnt 0x0
6925; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
6926; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
6927; GFX12-NEXT:    global_inv scope:SCOPE_DEV
6928; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
6929; GFX12-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
6930; GFX12-NEXT:    s_wait_alu 0xfffe
6931; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
6932; GFX12-NEXT:    s_wait_alu 0xfffe
6933; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
6934; GFX12-NEXT:    s_cbranch_execnz .LBB33_4
6935; GFX12-NEXT:  ; %bb.5: ; %Flow
6936; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
6937; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
6938; GFX12-NEXT:    ; implicit-def: $vgpr2_vgpr3
6939; GFX12-NEXT:    s_and_not1_saveexec_b32 s0, s0
6940; GFX12-NEXT:    s_cbranch_execz .LBB33_2
6941; GFX12-NEXT:  .LBB33_6: ; %atomicrmw.private
6942; GFX12-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
6943; GFX12-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
6944; GFX12-NEXT:    scratch_load_b64 v[0:1], v4, off
6945; GFX12-NEXT:    s_wait_loadcnt 0x0
6946; GFX12-NEXT:    v_add_f64_e32 v[0:1], v[0:1], v[2:3]
6947; GFX12-NEXT:    scratch_store_b64 v4, v[0:1], off
6948; GFX12-NEXT:    s_wait_alu 0xfffe
6949; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
6950; GFX12-NEXT:    s_wait_alu 0xfffe
6951; GFX12-NEXT:    s_setpc_b64 s[30:31]
6952;
6953; GFX940-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory:
6954; GFX940:       ; %bb.0:
6955; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6956; GFX940-NEXT:    s_mov_b64 s[0:1], src_private_base
6957; GFX940-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
6958; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], vcc
6959; GFX940-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
6960; GFX940-NEXT:    s_cbranch_execnz .LBB33_3
6961; GFX940-NEXT:  ; %bb.1: ; %Flow
6962; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
6963; GFX940-NEXT:    s_cbranch_execnz .LBB33_4
6964; GFX940-NEXT:  .LBB33_2: ; %atomicrmw.phi
6965; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
6966; GFX940-NEXT:    s_setpc_b64 s[30:31]
6967; GFX940-NEXT:  .LBB33_3: ; %atomicrmw.global
6968; GFX940-NEXT:    buffer_wbl2 sc1
6969; GFX940-NEXT:    flat_atomic_add_f64 v[0:1], v[2:3]
6970; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6971; GFX940-NEXT:    buffer_inv sc1
6972; GFX940-NEXT:    ; implicit-def: $vgpr0_vgpr1
6973; GFX940-NEXT:    ; implicit-def: $vgpr2_vgpr3
6974; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
6975; GFX940-NEXT:    s_cbranch_execz .LBB33_2
6976; GFX940-NEXT:  .LBB33_4: ; %atomicrmw.private
6977; GFX940-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
6978; GFX940-NEXT:    s_nop 1
6979; GFX940-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
6980; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v4, off
6981; GFX940-NEXT:    s_waitcnt vmcnt(0)
6982; GFX940-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
6983; GFX940-NEXT:    scratch_store_dwordx2 v4, v[0:1], off sc0 sc1
6984; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
6985; GFX940-NEXT:    s_waitcnt vmcnt(0)
6986; GFX940-NEXT:    s_setpc_b64 s[30:31]
6987;
6988; GFX11-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory:
6989; GFX11:       ; %bb.0:
6990; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6991; GFX11-NEXT:    s_mov_b64 s[0:1], src_private_base
6992; GFX11-NEXT:    s_mov_b32 s0, exec_lo
6993; GFX11-NEXT:    v_cmpx_ne_u32_e64 s1, v1
6994; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
6995; GFX11-NEXT:    s_cbranch_execnz .LBB33_3
6996; GFX11-NEXT:  ; %bb.1: ; %Flow3
6997; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
6998; GFX11-NEXT:    s_cbranch_execnz .LBB33_6
6999; GFX11-NEXT:  .LBB33_2: ; %atomicrmw.phi
7000; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
7001; GFX11-NEXT:    s_setpc_b64 s[30:31]
7002; GFX11-NEXT:  .LBB33_3: ; %atomicrmw.global
7003; GFX11-NEXT:    flat_load_b64 v[6:7], v[0:1]
7004; GFX11-NEXT:    s_mov_b32 s1, 0
7005; GFX11-NEXT:  .LBB33_4: ; %atomicrmw.start
7006; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
7007; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7008; GFX11-NEXT:    v_add_f64 v[4:5], v[6:7], v[2:3]
7009; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
7010; GFX11-NEXT:    flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc
7011; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7012; GFX11-NEXT:    buffer_gl1_inv
7013; GFX11-NEXT:    buffer_gl0_inv
7014; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
7015; GFX11-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
7016; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
7017; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
7018; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
7019; GFX11-NEXT:    s_cbranch_execnz .LBB33_4
7020; GFX11-NEXT:  ; %bb.5: ; %Flow
7021; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
7022; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1
7023; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3
7024; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
7025; GFX11-NEXT:    s_cbranch_execz .LBB33_2
7026; GFX11-NEXT:  .LBB33_6: ; %atomicrmw.private
7027; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
7028; GFX11-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
7029; GFX11-NEXT:    scratch_load_b64 v[0:1], v4, off
7030; GFX11-NEXT:    s_waitcnt vmcnt(0)
7031; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
7032; GFX11-NEXT:    scratch_store_b64 v4, v[0:1], off
7033; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
7034; GFX11-NEXT:    s_setpc_b64 s[30:31]
7035;
7036; GFX10-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory:
7037; GFX10:       ; %bb.0:
7038; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7039; GFX10-NEXT:    s_mov_b64 s[4:5], src_private_base
7040; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v1
7041; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
7042; GFX10-NEXT:    s_xor_b32 s4, exec_lo, s4
7043; GFX10-NEXT:    s_cbranch_execnz .LBB33_3
7044; GFX10-NEXT:  ; %bb.1: ; %Flow3
7045; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
7046; GFX10-NEXT:    s_cbranch_execnz .LBB33_6
7047; GFX10-NEXT:  .LBB33_2: ; %atomicrmw.phi
7048; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
7049; GFX10-NEXT:    s_setpc_b64 s[30:31]
7050; GFX10-NEXT:  .LBB33_3: ; %atomicrmw.global
7051; GFX10-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
7052; GFX10-NEXT:    s_mov_b32 s5, 0
7053; GFX10-NEXT:  .LBB33_4: ; %atomicrmw.start
7054; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
7055; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7056; GFX10-NEXT:    v_add_f64 v[4:5], v[6:7], v[2:3]
7057; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
7058; GFX10-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
7059; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7060; GFX10-NEXT:    buffer_gl1_inv
7061; GFX10-NEXT:    buffer_gl0_inv
7062; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
7063; GFX10-NEXT:    v_mov_b32_e32 v7, v5
7064; GFX10-NEXT:    v_mov_b32_e32 v6, v4
7065; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
7066; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
7067; GFX10-NEXT:    s_cbranch_execnz .LBB33_4
7068; GFX10-NEXT:  ; %bb.5: ; %Flow
7069; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
7070; GFX10-NEXT:    ; implicit-def: $vgpr0_vgpr1
7071; GFX10-NEXT:    ; implicit-def: $vgpr2_vgpr3
7072; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
7073; GFX10-NEXT:    s_cbranch_execz .LBB33_2
7074; GFX10-NEXT:  .LBB33_6: ; %atomicrmw.private
7075; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
7076; GFX10-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
7077; GFX10-NEXT:    s_clause 0x1
7078; GFX10-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
7079; GFX10-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
7080; GFX10-NEXT:    s_waitcnt vmcnt(0)
7081; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
7082; GFX10-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
7083; GFX10-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
7084; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
7085; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
7086; GFX10-NEXT:    s_setpc_b64 s[30:31]
7087;
7088; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory:
7089; GFX90A:       ; %bb.0:
7090; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7091; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
7092; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
7093; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
7094; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
7095; GFX90A-NEXT:    s_cbranch_execnz .LBB33_3
7096; GFX90A-NEXT:  ; %bb.1: ; %Flow
7097; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
7098; GFX90A-NEXT:    s_cbranch_execnz .LBB33_4
7099; GFX90A-NEXT:  .LBB33_2: ; %atomicrmw.phi
7100; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
7101; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7102; GFX90A-NEXT:  .LBB33_3: ; %atomicrmw.global
7103; GFX90A-NEXT:    flat_atomic_add_f64 v[0:1], v[2:3]
7104; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7105; GFX90A-NEXT:    buffer_wbinvl1
7106; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
7107; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
7108; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
7109; GFX90A-NEXT:    s_cbranch_execz .LBB33_2
7110; GFX90A-NEXT:  .LBB33_4: ; %atomicrmw.private
7111; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
7112; GFX90A-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
7113; GFX90A-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
7114; GFX90A-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
7115; GFX90A-NEXT:    s_waitcnt vmcnt(0)
7116; GFX90A-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
7117; GFX90A-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
7118; GFX90A-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
7119; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
7120; GFX90A-NEXT:    s_waitcnt vmcnt(0)
7121; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7122;
7123; GFX908-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory:
7124; GFX908:       ; %bb.0:
7125; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7126; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
7127; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
7128; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
7129; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
7130; GFX908-NEXT:    s_cbranch_execnz .LBB33_3
7131; GFX908-NEXT:  ; %bb.1: ; %Flow3
7132; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
7133; GFX908-NEXT:    s_cbranch_execnz .LBB33_6
7134; GFX908-NEXT:  .LBB33_2: ; %atomicrmw.phi
7135; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
7136; GFX908-NEXT:    s_setpc_b64 s[30:31]
7137; GFX908-NEXT:  .LBB33_3: ; %atomicrmw.global
7138; GFX908-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
7139; GFX908-NEXT:    s_mov_b64 s[6:7], 0
7140; GFX908-NEXT:  .LBB33_4: ; %atomicrmw.start
7141; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
7142; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7143; GFX908-NEXT:    v_add_f64 v[4:5], v[6:7], v[2:3]
7144; GFX908-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
7145; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7146; GFX908-NEXT:    buffer_wbinvl1
7147; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
7148; GFX908-NEXT:    v_mov_b32_e32 v7, v5
7149; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
7150; GFX908-NEXT:    v_mov_b32_e32 v6, v4
7151; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
7152; GFX908-NEXT:    s_cbranch_execnz .LBB33_4
7153; GFX908-NEXT:  ; %bb.5: ; %Flow
7154; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
7155; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
7156; GFX908-NEXT:    ; implicit-def: $vgpr2_vgpr3
7157; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
7158; GFX908-NEXT:    s_cbranch_execz .LBB33_2
7159; GFX908-NEXT:  .LBB33_6: ; %atomicrmw.private
7160; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
7161; GFX908-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
7162; GFX908-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
7163; GFX908-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
7164; GFX908-NEXT:    s_waitcnt vmcnt(0)
7165; GFX908-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
7166; GFX908-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
7167; GFX908-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
7168; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
7169; GFX908-NEXT:    s_waitcnt vmcnt(0)
7170; GFX908-NEXT:    s_setpc_b64 s[30:31]
7171;
7172; GFX8-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory:
7173; GFX8:       ; %bb.0:
7174; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7175; GFX8-NEXT:    s_mov_b64 s[4:5], 0xc0
7176; GFX8-NEXT:    s_load_dword s4, s[4:5], 0x0
7177; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
7178; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v1
7179; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
7180; GFX8-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
7181; GFX8-NEXT:    s_cbranch_execnz .LBB33_3
7182; GFX8-NEXT:  ; %bb.1: ; %Flow3
7183; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
7184; GFX8-NEXT:    s_cbranch_execnz .LBB33_6
7185; GFX8-NEXT:  .LBB33_2: ; %atomicrmw.phi
7186; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
7187; GFX8-NEXT:    s_setpc_b64 s[30:31]
7188; GFX8-NEXT:  .LBB33_3: ; %atomicrmw.global
7189; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
7190; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
7191; GFX8-NEXT:    flat_load_dword v7, v[4:5]
7192; GFX8-NEXT:    flat_load_dword v6, v[0:1]
7193; GFX8-NEXT:    s_mov_b64 s[6:7], 0
7194; GFX8-NEXT:  .LBB33_4: ; %atomicrmw.start
7195; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
7196; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7197; GFX8-NEXT:    v_add_f64 v[4:5], v[6:7], v[2:3]
7198; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
7199; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7200; GFX8-NEXT:    buffer_wbinvl1
7201; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
7202; GFX8-NEXT:    v_mov_b32_e32 v7, v5
7203; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
7204; GFX8-NEXT:    v_mov_b32_e32 v6, v4
7205; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
7206; GFX8-NEXT:    s_cbranch_execnz .LBB33_4
7207; GFX8-NEXT:  ; %bb.5: ; %Flow
7208; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
7209; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
7210; GFX8-NEXT:    ; implicit-def: $vgpr2_vgpr3
7211; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
7212; GFX8-NEXT:    s_cbranch_execz .LBB33_2
7213; GFX8-NEXT:  .LBB33_6: ; %atomicrmw.private
7214; GFX8-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
7215; GFX8-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
7216; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 4, v4
7217; GFX8-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
7218; GFX8-NEXT:    buffer_load_dword v1, v5, s[0:3], 0 offen
7219; GFX8-NEXT:    s_waitcnt vmcnt(0)
7220; GFX8-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
7221; GFX8-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
7222; GFX8-NEXT:    buffer_store_dword v1, v5, s[0:3], 0 offen
7223; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
7224; GFX8-NEXT:    s_waitcnt vmcnt(0)
7225; GFX8-NEXT:    s_setpc_b64 s[30:31]
7226;
7227; GFX7-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory:
7228; GFX7:       ; %bb.0:
7229; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7230; GFX7-NEXT:    s_mov_b64 s[4:5], 0xc0
7231; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
7232; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7233; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v1
7234; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], vcc
7235; GFX7-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
7236; GFX7-NEXT:    s_cbranch_execnz .LBB33_3
7237; GFX7-NEXT:  ; %bb.1: ; %Flow3
7238; GFX7-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
7239; GFX7-NEXT:    s_cbranch_execnz .LBB33_6
7240; GFX7-NEXT:  .LBB33_2: ; %atomicrmw.phi
7241; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
7242; GFX7-NEXT:    s_setpc_b64 s[30:31]
7243; GFX7-NEXT:  .LBB33_3: ; %atomicrmw.global
7244; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 4, v0
7245; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
7246; GFX7-NEXT:    flat_load_dword v7, v[4:5]
7247; GFX7-NEXT:    flat_load_dword v6, v[0:1]
7248; GFX7-NEXT:    s_mov_b64 s[6:7], 0
7249; GFX7-NEXT:  .LBB33_4: ; %atomicrmw.start
7250; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
7251; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7252; GFX7-NEXT:    v_add_f64 v[4:5], v[6:7], v[2:3]
7253; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
7254; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7255; GFX7-NEXT:    buffer_wbinvl1
7256; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
7257; GFX7-NEXT:    v_mov_b32_e32 v7, v5
7258; GFX7-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
7259; GFX7-NEXT:    v_mov_b32_e32 v6, v4
7260; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
7261; GFX7-NEXT:    s_cbranch_execnz .LBB33_4
7262; GFX7-NEXT:  ; %bb.5: ; %Flow
7263; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
7264; GFX7-NEXT:    ; implicit-def: $vgpr0_vgpr1
7265; GFX7-NEXT:    ; implicit-def: $vgpr2_vgpr3
7266; GFX7-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
7267; GFX7-NEXT:    s_cbranch_execz .LBB33_2
7268; GFX7-NEXT:  .LBB33_6: ; %atomicrmw.private
7269; GFX7-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
7270; GFX7-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
7271; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 4, v4
7272; GFX7-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
7273; GFX7-NEXT:    buffer_load_dword v1, v5, s[0:3], 0 offen
7274; GFX7-NEXT:    s_waitcnt vmcnt(0)
7275; GFX7-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
7276; GFX7-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
7277; GFX7-NEXT:    buffer_store_dword v1, v5, s[0:3], 0 offen
7278; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
7279; GFX7-NEXT:    s_waitcnt vmcnt(0)
7280; GFX7-NEXT:    s_setpc_b64 s[30:31]
7281  %unused = atomicrmw fadd ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
7282  ret void
7283}
7284
; No-return f64 atomic fadd through a flat pointer at a positive constant
; offset. The address is %ptr + 255 doubles (255 * 8 = 0x7f8 bytes), the
; ordering is seq_cst at syncscope("agent"), and the atomicrmw carries
; !amdgpu.no.fine.grained.memory metadata; its result is never used.
;
; The autogenerated checks below show, for every RUN target, the 0x7f8 offset
; being added to the pointer with explicit add instructions, followed by a
; compare of the high address dword (v1) against either src_private_base
; (gfx908 and newer) or a dword loaded from address 0xc0 (tonga, hawaii).
; That compare selects between two blocks:
;  - %atomicrmw.global: gfx940 and gfx90a emit flat_atomic_add_f64; all other
;    targets emit a flat_atomic_cmpswap retry loop around v_add_f64.
;  - %atomicrmw.private: a plain scratch/buffer load, v_add_f64 and store.
; NOTE(review): "offset12b_pos" presumably means a positive offset that fits
; a 12-bit immediate field - confirm against the sibling offset tests.
; Do not edit the check lines by hand; regenerate them with
; utils/update_llc_test_checks.py (see the NOTE at the top of the file).
7285define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, double %val) #0 {
7286; GFX12-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
7287; GFX12:       ; %bb.0:
7288; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
7289; GFX12-NEXT:    s_wait_expcnt 0x0
7290; GFX12-NEXT:    s_wait_samplecnt 0x0
7291; GFX12-NEXT:    s_wait_bvhcnt 0x0
7292; GFX12-NEXT:    s_wait_kmcnt 0x0
7293; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7f8, v0
7294; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
7295; GFX12-NEXT:    s_mov_b64 s[0:1], src_private_base
7296; GFX12-NEXT:    s_mov_b32 s0, exec_lo
7297; GFX12-NEXT:    s_wait_alu 0xfffe
7298; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
7299; GFX12-NEXT:    v_cmpx_ne_u32_e64 s1, v1
7300; GFX12-NEXT:    s_xor_b32 s0, exec_lo, s0
7301; GFX12-NEXT:    s_cbranch_execnz .LBB34_3
7302; GFX12-NEXT:  ; %bb.1: ; %Flow3
7303; GFX12-NEXT:    s_wait_alu 0xfffe
7304; GFX12-NEXT:    s_and_not1_saveexec_b32 s0, s0
7305; GFX12-NEXT:    s_cbranch_execnz .LBB34_6
7306; GFX12-NEXT:  .LBB34_2: ; %atomicrmw.phi
7307; GFX12-NEXT:    s_wait_alu 0xfffe
7308; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
7309; GFX12-NEXT:    s_wait_alu 0xfffe
7310; GFX12-NEXT:    s_setpc_b64 s[30:31]
7311; GFX12-NEXT:  .LBB34_3: ; %atomicrmw.global
7312; GFX12-NEXT:    flat_load_b64 v[6:7], v[0:1]
7313; GFX12-NEXT:    s_mov_b32 s1, 0
7314; GFX12-NEXT:  .LBB34_4: ; %atomicrmw.start
7315; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
7316; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
7317; GFX12-NEXT:    v_add_f64_e32 v[4:5], v[6:7], v[2:3]
7318; GFX12-NEXT:    s_wait_storecnt 0x0
7319; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
7320; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
7321; GFX12-NEXT:    global_inv scope:SCOPE_DEV
7322; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
7323; GFX12-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
7324; GFX12-NEXT:    s_wait_alu 0xfffe
7325; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
7326; GFX12-NEXT:    s_wait_alu 0xfffe
7327; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
7328; GFX12-NEXT:    s_cbranch_execnz .LBB34_4
7329; GFX12-NEXT:  ; %bb.5: ; %Flow
7330; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
7331; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
7332; GFX12-NEXT:    ; implicit-def: $vgpr2_vgpr3
7333; GFX12-NEXT:    s_and_not1_saveexec_b32 s0, s0
7334; GFX12-NEXT:    s_cbranch_execz .LBB34_2
7335; GFX12-NEXT:  .LBB34_6: ; %atomicrmw.private
7336; GFX12-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
7337; GFX12-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
7338; GFX12-NEXT:    scratch_load_b64 v[0:1], v4, off
7339; GFX12-NEXT:    s_wait_loadcnt 0x0
7340; GFX12-NEXT:    v_add_f64_e32 v[0:1], v[0:1], v[2:3]
7341; GFX12-NEXT:    scratch_store_b64 v4, v[0:1], off
7342; GFX12-NEXT:    s_wait_alu 0xfffe
7343; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
7344; GFX12-NEXT:    s_wait_alu 0xfffe
7345; GFX12-NEXT:    s_setpc_b64 s[30:31]
7346;
7347; GFX940-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
7348; GFX940:       ; %bb.0:
7349; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7350; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7f8
7351; GFX940-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
7352; GFX940-NEXT:    s_mov_b64 s[0:1], src_private_base
7353; GFX940-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
7354; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], vcc
7355; GFX940-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
7356; GFX940-NEXT:    s_cbranch_execnz .LBB34_3
7357; GFX940-NEXT:  ; %bb.1: ; %Flow
7358; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
7359; GFX940-NEXT:    s_cbranch_execnz .LBB34_4
7360; GFX940-NEXT:  .LBB34_2: ; %atomicrmw.phi
7361; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
7362; GFX940-NEXT:    s_setpc_b64 s[30:31]
7363; GFX940-NEXT:  .LBB34_3: ; %atomicrmw.global
7364; GFX940-NEXT:    buffer_wbl2 sc1
7365; GFX940-NEXT:    flat_atomic_add_f64 v[0:1], v[2:3]
7366; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7367; GFX940-NEXT:    buffer_inv sc1
7368; GFX940-NEXT:    ; implicit-def: $vgpr0_vgpr1
7369; GFX940-NEXT:    ; implicit-def: $vgpr2_vgpr3
7370; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
7371; GFX940-NEXT:    s_cbranch_execz .LBB34_2
7372; GFX940-NEXT:  .LBB34_4: ; %atomicrmw.private
7373; GFX940-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
7374; GFX940-NEXT:    s_nop 1
7375; GFX940-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
7376; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v4, off
7377; GFX940-NEXT:    s_waitcnt vmcnt(0)
7378; GFX940-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
7379; GFX940-NEXT:    scratch_store_dwordx2 v4, v[0:1], off sc0 sc1
7380; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
7381; GFX940-NEXT:    s_waitcnt vmcnt(0)
7382; GFX940-NEXT:    s_setpc_b64 s[30:31]
7383;
7384; GFX11-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
7385; GFX11:       ; %bb.0:
7386; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7387; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7f8, v0
7388; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
7389; GFX11-NEXT:    s_mov_b64 s[0:1], src_private_base
7390; GFX11-NEXT:    s_mov_b32 s0, exec_lo
7391; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
7392; GFX11-NEXT:    v_cmpx_ne_u32_e64 s1, v1
7393; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
7394; GFX11-NEXT:    s_cbranch_execnz .LBB34_3
7395; GFX11-NEXT:  ; %bb.1: ; %Flow3
7396; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
7397; GFX11-NEXT:    s_cbranch_execnz .LBB34_6
7398; GFX11-NEXT:  .LBB34_2: ; %atomicrmw.phi
7399; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
7400; GFX11-NEXT:    s_setpc_b64 s[30:31]
7401; GFX11-NEXT:  .LBB34_3: ; %atomicrmw.global
7402; GFX11-NEXT:    flat_load_b64 v[6:7], v[0:1]
7403; GFX11-NEXT:    s_mov_b32 s1, 0
7404; GFX11-NEXT:  .LBB34_4: ; %atomicrmw.start
7405; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
7406; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7407; GFX11-NEXT:    v_add_f64 v[4:5], v[6:7], v[2:3]
7408; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
7409; GFX11-NEXT:    flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc
7410; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7411; GFX11-NEXT:    buffer_gl1_inv
7412; GFX11-NEXT:    buffer_gl0_inv
7413; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
7414; GFX11-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
7415; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
7416; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
7417; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
7418; GFX11-NEXT:    s_cbranch_execnz .LBB34_4
7419; GFX11-NEXT:  ; %bb.5: ; %Flow
7420; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
7421; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1
7422; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3
7423; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
7424; GFX11-NEXT:    s_cbranch_execz .LBB34_2
7425; GFX11-NEXT:  .LBB34_6: ; %atomicrmw.private
7426; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
7427; GFX11-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
7428; GFX11-NEXT:    scratch_load_b64 v[0:1], v4, off
7429; GFX11-NEXT:    s_waitcnt vmcnt(0)
7430; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
7431; GFX11-NEXT:    scratch_store_b64 v4, v[0:1], off
7432; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
7433; GFX11-NEXT:    s_setpc_b64 s[30:31]
7434;
7435; GFX10-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
7436; GFX10:       ; %bb.0:
7437; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7438; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7f8, v0
7439; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
7440; GFX10-NEXT:    s_mov_b64 s[4:5], src_private_base
7441; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v1
7442; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
7443; GFX10-NEXT:    s_xor_b32 s4, exec_lo, s4
7444; GFX10-NEXT:    s_cbranch_execnz .LBB34_3
7445; GFX10-NEXT:  ; %bb.1: ; %Flow3
7446; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
7447; GFX10-NEXT:    s_cbranch_execnz .LBB34_6
7448; GFX10-NEXT:  .LBB34_2: ; %atomicrmw.phi
7449; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
7450; GFX10-NEXT:    s_setpc_b64 s[30:31]
7451; GFX10-NEXT:  .LBB34_3: ; %atomicrmw.global
7452; GFX10-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
7453; GFX10-NEXT:    s_mov_b32 s5, 0
7454; GFX10-NEXT:  .LBB34_4: ; %atomicrmw.start
7455; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
7456; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7457; GFX10-NEXT:    v_add_f64 v[4:5], v[6:7], v[2:3]
7458; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
7459; GFX10-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
7460; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7461; GFX10-NEXT:    buffer_gl1_inv
7462; GFX10-NEXT:    buffer_gl0_inv
7463; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
7464; GFX10-NEXT:    v_mov_b32_e32 v7, v5
7465; GFX10-NEXT:    v_mov_b32_e32 v6, v4
7466; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
7467; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
7468; GFX10-NEXT:    s_cbranch_execnz .LBB34_4
7469; GFX10-NEXT:  ; %bb.5: ; %Flow
7470; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
7471; GFX10-NEXT:    ; implicit-def: $vgpr0_vgpr1
7472; GFX10-NEXT:    ; implicit-def: $vgpr2_vgpr3
7473; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
7474; GFX10-NEXT:    s_cbranch_execz .LBB34_2
7475; GFX10-NEXT:  .LBB34_6: ; %atomicrmw.private
7476; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
7477; GFX10-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
7478; GFX10-NEXT:    s_clause 0x1
7479; GFX10-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
7480; GFX10-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
7481; GFX10-NEXT:    s_waitcnt vmcnt(0)
7482; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
7483; GFX10-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
7484; GFX10-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
7485; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
7486; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
7487; GFX10-NEXT:    s_setpc_b64 s[30:31]
7488;
7489; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
7490; GFX90A:       ; %bb.0:
7491; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7492; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7f8, v0
7493; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
7494; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
7495; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
7496; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
7497; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
7498; GFX90A-NEXT:    s_cbranch_execnz .LBB34_3
7499; GFX90A-NEXT:  ; %bb.1: ; %Flow
7500; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
7501; GFX90A-NEXT:    s_cbranch_execnz .LBB34_4
7502; GFX90A-NEXT:  .LBB34_2: ; %atomicrmw.phi
7503; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
7504; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7505; GFX90A-NEXT:  .LBB34_3: ; %atomicrmw.global
7506; GFX90A-NEXT:    flat_atomic_add_f64 v[0:1], v[2:3]
7507; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7508; GFX90A-NEXT:    buffer_wbinvl1
7509; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
7510; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
7511; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
7512; GFX90A-NEXT:    s_cbranch_execz .LBB34_2
7513; GFX90A-NEXT:  .LBB34_4: ; %atomicrmw.private
7514; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
7515; GFX90A-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
7516; GFX90A-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
7517; GFX90A-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
7518; GFX90A-NEXT:    s_waitcnt vmcnt(0)
7519; GFX90A-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
7520; GFX90A-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
7521; GFX90A-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
7522; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
7523; GFX90A-NEXT:    s_waitcnt vmcnt(0)
7524; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7525;
7526; GFX908-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
7527; GFX908:       ; %bb.0:
7528; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7529; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7f8, v0
7530; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
7531; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
7532; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
7533; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
7534; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
7535; GFX908-NEXT:    s_cbranch_execnz .LBB34_3
7536; GFX908-NEXT:  ; %bb.1: ; %Flow3
7537; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
7538; GFX908-NEXT:    s_cbranch_execnz .LBB34_6
7539; GFX908-NEXT:  .LBB34_2: ; %atomicrmw.phi
7540; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
7541; GFX908-NEXT:    s_setpc_b64 s[30:31]
7542; GFX908-NEXT:  .LBB34_3: ; %atomicrmw.global
7543; GFX908-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
7544; GFX908-NEXT:    s_mov_b64 s[6:7], 0
7545; GFX908-NEXT:  .LBB34_4: ; %atomicrmw.start
7546; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
7547; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7548; GFX908-NEXT:    v_add_f64 v[4:5], v[6:7], v[2:3]
7549; GFX908-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
7550; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7551; GFX908-NEXT:    buffer_wbinvl1
7552; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
7553; GFX908-NEXT:    v_mov_b32_e32 v7, v5
7554; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
7555; GFX908-NEXT:    v_mov_b32_e32 v6, v4
7556; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
7557; GFX908-NEXT:    s_cbranch_execnz .LBB34_4
7558; GFX908-NEXT:  ; %bb.5: ; %Flow
7559; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
7560; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
7561; GFX908-NEXT:    ; implicit-def: $vgpr2_vgpr3
7562; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
7563; GFX908-NEXT:    s_cbranch_execz .LBB34_2
7564; GFX908-NEXT:  .LBB34_6: ; %atomicrmw.private
7565; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
7566; GFX908-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
7567; GFX908-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
7568; GFX908-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
7569; GFX908-NEXT:    s_waitcnt vmcnt(0)
7570; GFX908-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
7571; GFX908-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
7572; GFX908-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
7573; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
7574; GFX908-NEXT:    s_waitcnt vmcnt(0)
7575; GFX908-NEXT:    s_setpc_b64 s[30:31]
7576;
7577; GFX8-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
7578; GFX8:       ; %bb.0:
7579; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7580; GFX8-NEXT:    s_mov_b64 s[4:5], 0xc0
7581; GFX8-NEXT:    s_load_dword s4, s[4:5], 0x0
7582; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7f8, v0
7583; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
7584; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
7585; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v1
7586; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
7587; GFX8-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
7588; GFX8-NEXT:    s_cbranch_execnz .LBB34_3
7589; GFX8-NEXT:  ; %bb.1: ; %Flow3
7590; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
7591; GFX8-NEXT:    s_cbranch_execnz .LBB34_6
7592; GFX8-NEXT:  .LBB34_2: ; %atomicrmw.phi
7593; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
7594; GFX8-NEXT:    s_setpc_b64 s[30:31]
7595; GFX8-NEXT:  .LBB34_3: ; %atomicrmw.global
7596; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
7597; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
7598; GFX8-NEXT:    flat_load_dword v7, v[4:5]
7599; GFX8-NEXT:    flat_load_dword v6, v[0:1]
7600; GFX8-NEXT:    s_mov_b64 s[6:7], 0
7601; GFX8-NEXT:  .LBB34_4: ; %atomicrmw.start
7602; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
7603; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7604; GFX8-NEXT:    v_add_f64 v[4:5], v[6:7], v[2:3]
7605; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
7606; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7607; GFX8-NEXT:    buffer_wbinvl1
7608; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
7609; GFX8-NEXT:    v_mov_b32_e32 v7, v5
7610; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
7611; GFX8-NEXT:    v_mov_b32_e32 v6, v4
7612; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
7613; GFX8-NEXT:    s_cbranch_execnz .LBB34_4
7614; GFX8-NEXT:  ; %bb.5: ; %Flow
7615; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
7616; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
7617; GFX8-NEXT:    ; implicit-def: $vgpr2_vgpr3
7618; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
7619; GFX8-NEXT:    s_cbranch_execz .LBB34_2
7620; GFX8-NEXT:  .LBB34_6: ; %atomicrmw.private
7621; GFX8-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
7622; GFX8-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
7623; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 4, v4
7624; GFX8-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
7625; GFX8-NEXT:    buffer_load_dword v1, v5, s[0:3], 0 offen
7626; GFX8-NEXT:    s_waitcnt vmcnt(0)
7627; GFX8-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
7628; GFX8-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
7629; GFX8-NEXT:    buffer_store_dword v1, v5, s[0:3], 0 offen
7630; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
7631; GFX8-NEXT:    s_waitcnt vmcnt(0)
7632; GFX8-NEXT:    s_setpc_b64 s[30:31]
7633;
7634; GFX7-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
7635; GFX7:       ; %bb.0:
7636; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7637; GFX7-NEXT:    s_mov_b64 s[4:5], 0xc0
7638; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
7639; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7f8, v0
7640; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
7641; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
7642; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v1
7643; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], vcc
7644; GFX7-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
7645; GFX7-NEXT:    s_cbranch_execnz .LBB34_3
7646; GFX7-NEXT:  ; %bb.1: ; %Flow3
7647; GFX7-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
7648; GFX7-NEXT:    s_cbranch_execnz .LBB34_6
7649; GFX7-NEXT:  .LBB34_2: ; %atomicrmw.phi
7650; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
7651; GFX7-NEXT:    s_setpc_b64 s[30:31]
7652; GFX7-NEXT:  .LBB34_3: ; %atomicrmw.global
7653; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 4, v0
7654; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
7655; GFX7-NEXT:    flat_load_dword v7, v[4:5]
7656; GFX7-NEXT:    flat_load_dword v6, v[0:1]
7657; GFX7-NEXT:    s_mov_b64 s[6:7], 0
7658; GFX7-NEXT:  .LBB34_4: ; %atomicrmw.start
7659; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
7660; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7661; GFX7-NEXT:    v_add_f64 v[4:5], v[6:7], v[2:3]
7662; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
7663; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7664; GFX7-NEXT:    buffer_wbinvl1
7665; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
7666; GFX7-NEXT:    v_mov_b32_e32 v7, v5
7667; GFX7-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
7668; GFX7-NEXT:    v_mov_b32_e32 v6, v4
7669; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
7670; GFX7-NEXT:    s_cbranch_execnz .LBB34_4
7671; GFX7-NEXT:  ; %bb.5: ; %Flow
7672; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
7673; GFX7-NEXT:    ; implicit-def: $vgpr0_vgpr1
7674; GFX7-NEXT:    ; implicit-def: $vgpr2_vgpr3
7675; GFX7-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
7676; GFX7-NEXT:    s_cbranch_execz .LBB34_2
7677; GFX7-NEXT:  .LBB34_6: ; %atomicrmw.private
7678; GFX7-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
7679; GFX7-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
7680; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 4, v4
7681; GFX7-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
7682; GFX7-NEXT:    buffer_load_dword v1, v5, s[0:3], 0 offen
7683; GFX7-NEXT:    s_waitcnt vmcnt(0)
7684; GFX7-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
7685; GFX7-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
7686; GFX7-NEXT:    buffer_store_dword v1, v5, s[0:3], 0 offen
7687; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
7688; GFX7-NEXT:    s_waitcnt vmcnt(0)
7689; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; Element 255 of a double array is byte offset 255 * 8 = 2040 (0x7f8),
  ; the constant that appears in the address adds of every check block above.
7690  %gep = getelementptr double, ptr %ptr, i64 255
  ; The old value returned by the atomicrmw is bound to %unused and never read.
7691  %unused = atomicrmw fadd ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
7692  ret void
7693}
7694
7695define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, double %val) #0 {
7696; GFX12-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
7697; GFX12:       ; %bb.0:
7698; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
7699; GFX12-NEXT:    s_wait_expcnt 0x0
7700; GFX12-NEXT:    s_wait_samplecnt 0x0
7701; GFX12-NEXT:    s_wait_bvhcnt 0x0
7702; GFX12-NEXT:    s_wait_kmcnt 0x0
7703; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
7704; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
7705; GFX12-NEXT:    s_mov_b64 s[0:1], src_private_base
7706; GFX12-NEXT:    s_mov_b32 s0, exec_lo
7707; GFX12-NEXT:    s_wait_alu 0xfffe
7708; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
7709; GFX12-NEXT:    v_cmpx_ne_u32_e64 s1, v1
7710; GFX12-NEXT:    s_xor_b32 s0, exec_lo, s0
7711; GFX12-NEXT:    s_cbranch_execnz .LBB35_3
7712; GFX12-NEXT:  ; %bb.1: ; %Flow3
7713; GFX12-NEXT:    s_wait_alu 0xfffe
7714; GFX12-NEXT:    s_and_not1_saveexec_b32 s0, s0
7715; GFX12-NEXT:    s_cbranch_execnz .LBB35_6
7716; GFX12-NEXT:  .LBB35_2: ; %atomicrmw.phi
7717; GFX12-NEXT:    s_wait_alu 0xfffe
7718; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
7719; GFX12-NEXT:    s_wait_alu 0xfffe
7720; GFX12-NEXT:    s_setpc_b64 s[30:31]
7721; GFX12-NEXT:  .LBB35_3: ; %atomicrmw.global
7722; GFX12-NEXT:    flat_load_b64 v[6:7], v[0:1]
7723; GFX12-NEXT:    s_mov_b32 s1, 0
7724; GFX12-NEXT:  .LBB35_4: ; %atomicrmw.start
7725; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
7726; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
7727; GFX12-NEXT:    v_add_f64_e32 v[4:5], v[6:7], v[2:3]
7728; GFX12-NEXT:    s_wait_storecnt 0x0
7729; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
7730; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
7731; GFX12-NEXT:    global_inv scope:SCOPE_DEV
7732; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
7733; GFX12-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
7734; GFX12-NEXT:    s_wait_alu 0xfffe
7735; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
7736; GFX12-NEXT:    s_wait_alu 0xfffe
7737; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
7738; GFX12-NEXT:    s_cbranch_execnz .LBB35_4
7739; GFX12-NEXT:  ; %bb.5: ; %Flow
7740; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
7741; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
7742; GFX12-NEXT:    ; implicit-def: $vgpr2_vgpr3
7743; GFX12-NEXT:    s_and_not1_saveexec_b32 s0, s0
7744; GFX12-NEXT:    s_cbranch_execz .LBB35_2
7745; GFX12-NEXT:  .LBB35_6: ; %atomicrmw.private
7746; GFX12-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
7747; GFX12-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
7748; GFX12-NEXT:    scratch_load_b64 v[0:1], v4, off
7749; GFX12-NEXT:    s_wait_loadcnt 0x0
7750; GFX12-NEXT:    v_add_f64_e32 v[0:1], v[0:1], v[2:3]
7751; GFX12-NEXT:    scratch_store_b64 v4, v[0:1], off
7752; GFX12-NEXT:    s_wait_alu 0xfffe
7753; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
7754; GFX12-NEXT:    s_wait_alu 0xfffe
7755; GFX12-NEXT:    s_setpc_b64 s[30:31]
7756;
7757; GFX940-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
7758; GFX940:       ; %bb.0:
7759; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7760; GFX940-NEXT:    s_movk_i32 s0, 0xf800
7761; GFX940-NEXT:    s_mov_b32 s1, -1
7762; GFX940-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
7763; GFX940-NEXT:    s_mov_b64 s[0:1], src_private_base
7764; GFX940-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
7765; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], vcc
7766; GFX940-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
7767; GFX940-NEXT:    s_cbranch_execnz .LBB35_3
7768; GFX940-NEXT:  ; %bb.1: ; %Flow
7769; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
7770; GFX940-NEXT:    s_cbranch_execnz .LBB35_4
7771; GFX940-NEXT:  .LBB35_2: ; %atomicrmw.phi
7772; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
7773; GFX940-NEXT:    s_setpc_b64 s[30:31]
7774; GFX940-NEXT:  .LBB35_3: ; %atomicrmw.global
7775; GFX940-NEXT:    buffer_wbl2 sc1
7776; GFX940-NEXT:    flat_atomic_add_f64 v[0:1], v[2:3]
7777; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7778; GFX940-NEXT:    buffer_inv sc1
7779; GFX940-NEXT:    ; implicit-def: $vgpr0_vgpr1
7780; GFX940-NEXT:    ; implicit-def: $vgpr2_vgpr3
7781; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
7782; GFX940-NEXT:    s_cbranch_execz .LBB35_2
7783; GFX940-NEXT:  .LBB35_4: ; %atomicrmw.private
7784; GFX940-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
7785; GFX940-NEXT:    s_nop 1
7786; GFX940-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
7787; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v4, off
7788; GFX940-NEXT:    s_waitcnt vmcnt(0)
7789; GFX940-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
7790; GFX940-NEXT:    scratch_store_dwordx2 v4, v[0:1], off sc0 sc1
7791; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
7792; GFX940-NEXT:    s_waitcnt vmcnt(0)
7793; GFX940-NEXT:    s_setpc_b64 s[30:31]
7794;
7795; GFX11-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
7796; GFX11:       ; %bb.0:
7797; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7798; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
7799; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
7800; GFX11-NEXT:    s_mov_b64 s[0:1], src_private_base
7801; GFX11-NEXT:    s_mov_b32 s0, exec_lo
7802; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
7803; GFX11-NEXT:    v_cmpx_ne_u32_e64 s1, v1
7804; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
7805; GFX11-NEXT:    s_cbranch_execnz .LBB35_3
7806; GFX11-NEXT:  ; %bb.1: ; %Flow3
7807; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
7808; GFX11-NEXT:    s_cbranch_execnz .LBB35_6
7809; GFX11-NEXT:  .LBB35_2: ; %atomicrmw.phi
7810; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
7811; GFX11-NEXT:    s_setpc_b64 s[30:31]
7812; GFX11-NEXT:  .LBB35_3: ; %atomicrmw.global
7813; GFX11-NEXT:    flat_load_b64 v[6:7], v[0:1]
7814; GFX11-NEXT:    s_mov_b32 s1, 0
7815; GFX11-NEXT:  .LBB35_4: ; %atomicrmw.start
7816; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
7817; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7818; GFX11-NEXT:    v_add_f64 v[4:5], v[6:7], v[2:3]
7819; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
7820; GFX11-NEXT:    flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc
7821; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7822; GFX11-NEXT:    buffer_gl1_inv
7823; GFX11-NEXT:    buffer_gl0_inv
7824; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
7825; GFX11-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
7826; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
7827; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
7828; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
7829; GFX11-NEXT:    s_cbranch_execnz .LBB35_4
7830; GFX11-NEXT:  ; %bb.5: ; %Flow
7831; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
7832; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1
7833; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3
7834; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
7835; GFX11-NEXT:    s_cbranch_execz .LBB35_2
7836; GFX11-NEXT:  .LBB35_6: ; %atomicrmw.private
7837; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
7838; GFX11-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
7839; GFX11-NEXT:    scratch_load_b64 v[0:1], v4, off
7840; GFX11-NEXT:    s_waitcnt vmcnt(0)
7841; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
7842; GFX11-NEXT:    scratch_store_b64 v4, v[0:1], off
7843; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
7844; GFX11-NEXT:    s_setpc_b64 s[30:31]
7845;
7846; GFX10-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
7847; GFX10:       ; %bb.0:
7848; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7849; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
7850; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
7851; GFX10-NEXT:    s_mov_b64 s[4:5], src_private_base
7852; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v1
7853; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
7854; GFX10-NEXT:    s_xor_b32 s4, exec_lo, s4
7855; GFX10-NEXT:    s_cbranch_execnz .LBB35_3
7856; GFX10-NEXT:  ; %bb.1: ; %Flow3
7857; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
7858; GFX10-NEXT:    s_cbranch_execnz .LBB35_6
7859; GFX10-NEXT:  .LBB35_2: ; %atomicrmw.phi
7860; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
7861; GFX10-NEXT:    s_setpc_b64 s[30:31]
7862; GFX10-NEXT:  .LBB35_3: ; %atomicrmw.global
7863; GFX10-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
7864; GFX10-NEXT:    s_mov_b32 s5, 0
7865; GFX10-NEXT:  .LBB35_4: ; %atomicrmw.start
7866; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
7867; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7868; GFX10-NEXT:    v_add_f64 v[4:5], v[6:7], v[2:3]
7869; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
7870; GFX10-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
7871; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7872; GFX10-NEXT:    buffer_gl1_inv
7873; GFX10-NEXT:    buffer_gl0_inv
7874; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
7875; GFX10-NEXT:    v_mov_b32_e32 v7, v5
7876; GFX10-NEXT:    v_mov_b32_e32 v6, v4
7877; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
7878; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
7879; GFX10-NEXT:    s_cbranch_execnz .LBB35_4
7880; GFX10-NEXT:  ; %bb.5: ; %Flow
7881; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
7882; GFX10-NEXT:    ; implicit-def: $vgpr0_vgpr1
7883; GFX10-NEXT:    ; implicit-def: $vgpr2_vgpr3
7884; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
7885; GFX10-NEXT:    s_cbranch_execz .LBB35_2
7886; GFX10-NEXT:  .LBB35_6: ; %atomicrmw.private
7887; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
7888; GFX10-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
7889; GFX10-NEXT:    s_clause 0x1
7890; GFX10-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
7891; GFX10-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
7892; GFX10-NEXT:    s_waitcnt vmcnt(0)
7893; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
7894; GFX10-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
7895; GFX10-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
7896; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
7897; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
7898; GFX10-NEXT:    s_setpc_b64 s[30:31]
7899;
7900; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
7901; GFX90A:       ; %bb.0:
7902; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7903; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
7904; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
7905; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
7906; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
7907; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
7908; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
7909; GFX90A-NEXT:    s_cbranch_execnz .LBB35_3
7910; GFX90A-NEXT:  ; %bb.1: ; %Flow
7911; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
7912; GFX90A-NEXT:    s_cbranch_execnz .LBB35_4
7913; GFX90A-NEXT:  .LBB35_2: ; %atomicrmw.phi
7914; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
7915; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7916; GFX90A-NEXT:  .LBB35_3: ; %atomicrmw.global
7917; GFX90A-NEXT:    flat_atomic_add_f64 v[0:1], v[2:3]
7918; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7919; GFX90A-NEXT:    buffer_wbinvl1
7920; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
7921; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
7922; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
7923; GFX90A-NEXT:    s_cbranch_execz .LBB35_2
7924; GFX90A-NEXT:  .LBB35_4: ; %atomicrmw.private
7925; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
7926; GFX90A-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
7927; GFX90A-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
7928; GFX90A-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
7929; GFX90A-NEXT:    s_waitcnt vmcnt(0)
7930; GFX90A-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
7931; GFX90A-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
7932; GFX90A-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
7933; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
7934; GFX90A-NEXT:    s_waitcnt vmcnt(0)
7935; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7936;
7937; GFX908-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
7938; GFX908:       ; %bb.0:
7939; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7940; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
7941; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
7942; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
7943; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
7944; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
7945; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
7946; GFX908-NEXT:    s_cbranch_execnz .LBB35_3
7947; GFX908-NEXT:  ; %bb.1: ; %Flow3
7948; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
7949; GFX908-NEXT:    s_cbranch_execnz .LBB35_6
7950; GFX908-NEXT:  .LBB35_2: ; %atomicrmw.phi
7951; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
7952; GFX908-NEXT:    s_setpc_b64 s[30:31]
7953; GFX908-NEXT:  .LBB35_3: ; %atomicrmw.global
7954; GFX908-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
7955; GFX908-NEXT:    s_mov_b64 s[6:7], 0
7956; GFX908-NEXT:  .LBB35_4: ; %atomicrmw.start
7957; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
7958; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7959; GFX908-NEXT:    v_add_f64 v[4:5], v[6:7], v[2:3]
7960; GFX908-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
7961; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7962; GFX908-NEXT:    buffer_wbinvl1
7963; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
7964; GFX908-NEXT:    v_mov_b32_e32 v7, v5
7965; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
7966; GFX908-NEXT:    v_mov_b32_e32 v6, v4
7967; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
7968; GFX908-NEXT:    s_cbranch_execnz .LBB35_4
7969; GFX908-NEXT:  ; %bb.5: ; %Flow
7970; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
7971; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
7972; GFX908-NEXT:    ; implicit-def: $vgpr2_vgpr3
7973; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
7974; GFX908-NEXT:    s_cbranch_execz .LBB35_2
7975; GFX908-NEXT:  .LBB35_6: ; %atomicrmw.private
7976; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
7977; GFX908-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
7978; GFX908-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
7979; GFX908-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
7980; GFX908-NEXT:    s_waitcnt vmcnt(0)
7981; GFX908-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
7982; GFX908-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
7983; GFX908-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
7984; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
7985; GFX908-NEXT:    s_waitcnt vmcnt(0)
7986; GFX908-NEXT:    s_setpc_b64 s[30:31]
7987;
7988; GFX8-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
7989; GFX8:       ; %bb.0:
7990; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7991; GFX8-NEXT:    s_mov_b64 s[4:5], 0xc0
7992; GFX8-NEXT:    s_load_dword s4, s[4:5], 0x0
7993; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
7994; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
7995; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
7996; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v1
7997; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
7998; GFX8-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
7999; GFX8-NEXT:    s_cbranch_execnz .LBB35_3
8000; GFX8-NEXT:  ; %bb.1: ; %Flow3
8001; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
8002; GFX8-NEXT:    s_cbranch_execnz .LBB35_6
8003; GFX8-NEXT:  .LBB35_2: ; %atomicrmw.phi
8004; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
8005; GFX8-NEXT:    s_setpc_b64 s[30:31]
8006; GFX8-NEXT:  .LBB35_3: ; %atomicrmw.global
8007; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
8008; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
8009; GFX8-NEXT:    flat_load_dword v7, v[4:5]
8010; GFX8-NEXT:    flat_load_dword v6, v[0:1]
8011; GFX8-NEXT:    s_mov_b64 s[6:7], 0
8012; GFX8-NEXT:  .LBB35_4: ; %atomicrmw.start
8013; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
8014; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8015; GFX8-NEXT:    v_add_f64 v[4:5], v[6:7], v[2:3]
8016; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
8017; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8018; GFX8-NEXT:    buffer_wbinvl1
8019; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
8020; GFX8-NEXT:    v_mov_b32_e32 v7, v5
8021; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
8022; GFX8-NEXT:    v_mov_b32_e32 v6, v4
8023; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
8024; GFX8-NEXT:    s_cbranch_execnz .LBB35_4
8025; GFX8-NEXT:  ; %bb.5: ; %Flow
8026; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
8027; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
8028; GFX8-NEXT:    ; implicit-def: $vgpr2_vgpr3
8029; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
8030; GFX8-NEXT:    s_cbranch_execz .LBB35_2
8031; GFX8-NEXT:  .LBB35_6: ; %atomicrmw.private
8032; GFX8-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
8033; GFX8-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
8034; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 4, v4
8035; GFX8-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
8036; GFX8-NEXT:    buffer_load_dword v1, v5, s[0:3], 0 offen
8037; GFX8-NEXT:    s_waitcnt vmcnt(0)
8038; GFX8-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
8039; GFX8-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
8040; GFX8-NEXT:    buffer_store_dword v1, v5, s[0:3], 0 offen
8041; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
8042; GFX8-NEXT:    s_waitcnt vmcnt(0)
8043; GFX8-NEXT:    s_setpc_b64 s[30:31]
8044;
8045; GFX7-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
8046; GFX7:       ; %bb.0:
8047; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8048; GFX7-NEXT:    s_mov_b64 s[4:5], 0xc0
8049; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
8050; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0xfffff800, v0
8051; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
8052; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
8053; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v1
8054; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], vcc
8055; GFX7-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
8056; GFX7-NEXT:    s_cbranch_execnz .LBB35_3
8057; GFX7-NEXT:  ; %bb.1: ; %Flow3
8058; GFX7-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
8059; GFX7-NEXT:    s_cbranch_execnz .LBB35_6
8060; GFX7-NEXT:  .LBB35_2: ; %atomicrmw.phi
8061; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
8062; GFX7-NEXT:    s_setpc_b64 s[30:31]
8063; GFX7-NEXT:  .LBB35_3: ; %atomicrmw.global
8064; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 4, v0
8065; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
8066; GFX7-NEXT:    flat_load_dword v7, v[4:5]
8067; GFX7-NEXT:    flat_load_dword v6, v[0:1]
8068; GFX7-NEXT:    s_mov_b64 s[6:7], 0
8069; GFX7-NEXT:  .LBB35_4: ; %atomicrmw.start
8070; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
8071; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8072; GFX7-NEXT:    v_add_f64 v[4:5], v[6:7], v[2:3]
8073; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
8074; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8075; GFX7-NEXT:    buffer_wbinvl1
8076; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
8077; GFX7-NEXT:    v_mov_b32_e32 v7, v5
8078; GFX7-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
8079; GFX7-NEXT:    v_mov_b32_e32 v6, v4
8080; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
8081; GFX7-NEXT:    s_cbranch_execnz .LBB35_4
8082; GFX7-NEXT:  ; %bb.5: ; %Flow
8083; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
8084; GFX7-NEXT:    ; implicit-def: $vgpr0_vgpr1
8085; GFX7-NEXT:    ; implicit-def: $vgpr2_vgpr3
8086; GFX7-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
8087; GFX7-NEXT:    s_cbranch_execz .LBB35_2
8088; GFX7-NEXT:  .LBB35_6: ; %atomicrmw.private
8089; GFX7-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
8090; GFX7-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
8091; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 4, v4
8092; GFX7-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
8093; GFX7-NEXT:    buffer_load_dword v1, v5, s[0:3], 0 offen
8094; GFX7-NEXT:    s_waitcnt vmcnt(0)
8095; GFX7-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
8096; GFX7-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
8097; GFX7-NEXT:    buffer_store_dword v1, v5, s[0:3], 0 offen
8098; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
8099; GFX7-NEXT:    s_waitcnt vmcnt(0)
8100; GFX7-NEXT:    s_setpc_b64 s[30:31]
8101  %gep = getelementptr double, ptr %ptr, i64 -256
8102  %unused = atomicrmw fadd ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
8103  ret void
8104}
8105
8106; --------------------------------------------------------------------
8107; half
8108; --------------------------------------------------------------------
8109
; Returning `atomicrmw fadd` of a `half` through a flat pointer with no
; address offset, at syncscope("agent") with seq_cst ordering and
; !amdgpu.no.fine.grained.memory metadata. The value previously in memory
; is returned. Attribute group #0 and metadata node !0 are referenced here
; but defined elsewhere in the file.
;
; In every checked output below the operation is expanded into a
; compare-and-swap loop on the aligned 32-bit word that contains the half:
;   * the address is rounded down with `and -4`, and the low two address
;     bits multiplied by 8 give the bit shift of the half inside that word;
;   * 0xffff shifted left by that amount and then inverted is the mask that
;     keeps the remaining bits of the word;
;   * each iteration shifts the old half down, adds %val, shifts the sum
;     back up, merges it with the kept bits and issues a 32-bit flat
;     cmpswap; lanes whose returned word equals the expected word are
;     removed from exec, and the loop repeats while any lane remains;
;   * the result is the last returned word shifted right by the bit shift.
; The hawaii (GFX7) output differs in performing the add in f32, converting
; with v_cvt_f32_f16 / v_cvt_f16_f32 around it (presumably because that
; target has no f16 add; not established by this file).
;
; The check lines are autogenerated (see the first line of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing
; them by hand.
8110define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
8111; GFX12-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
8112; GFX12:       ; %bb.0:
8113; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
8114; GFX12-NEXT:    s_wait_expcnt 0x0
8115; GFX12-NEXT:    s_wait_samplecnt 0x0
8116; GFX12-NEXT:    s_wait_bvhcnt 0x0
8117; GFX12-NEXT:    s_wait_kmcnt 0x0
8118; GFX12-NEXT:    v_mov_b32_e32 v3, v0
8119; GFX12-NEXT:    s_mov_b32 s0, 0
8120; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
8121; GFX12-NEXT:    v_and_b32_e32 v0, -4, v3
8122; GFX12-NEXT:    v_and_b32_e32 v3, 3, v3
8123; GFX12-NEXT:    flat_load_b32 v5, v[0:1]
8124; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8125; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
8126; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
8127; GFX12-NEXT:    v_not_b32_e32 v4, v4
8128; GFX12-NEXT:  .LBB36_1: ; %atomicrmw.start
8129; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
8130; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
8131; GFX12-NEXT:    v_mov_b32_e32 v6, v5
8132; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8133; GFX12-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
8134; GFX12-NEXT:    v_add_f16_e32 v5, v5, v2
8135; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8136; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff, v5
8137; GFX12-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
8138; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
8139; GFX12-NEXT:    v_and_or_b32 v5, v6, v4, v5
8140; GFX12-NEXT:    s_wait_storecnt 0x0
8141; GFX12-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
8142; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
8143; GFX12-NEXT:    global_inv scope:SCOPE_DEV
8144; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
8145; GFX12-NEXT:    s_wait_alu 0xfffe
8146; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
8147; GFX12-NEXT:    s_wait_alu 0xfffe
8148; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
8149; GFX12-NEXT:    s_cbranch_execnz .LBB36_1
8150; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
8151; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
8152; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
8153; GFX12-NEXT:    s_wait_alu 0xfffe
8154; GFX12-NEXT:    s_setpc_b64 s[30:31]
8155;
8156; GFX940-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
8157; GFX940:       ; %bb.0:
8158; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8159; GFX940-NEXT:    v_mov_b32_e32 v3, v0
8160; GFX940-NEXT:    v_and_b32_e32 v0, -4, v3
8161; GFX940-NEXT:    flat_load_dword v4, v[0:1]
8162; GFX940-NEXT:    v_and_b32_e32 v3, 3, v3
8163; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8164; GFX940-NEXT:    s_mov_b32 s0, 0xffff
8165; GFX940-NEXT:    v_lshlrev_b32_e64 v5, v3, s0
8166; GFX940-NEXT:    v_not_b32_e32 v5, v5
8167; GFX940-NEXT:    s_mov_b64 s[0:1], 0
8168; GFX940-NEXT:  .LBB36_1: ; %atomicrmw.start
8169; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
8170; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8171; GFX940-NEXT:    v_mov_b32_e32 v7, v4
8172; GFX940-NEXT:    v_lshrrev_b32_e32 v4, v3, v7
8173; GFX940-NEXT:    v_add_f16_e32 v4, v4, v2
8174; GFX940-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
8175; GFX940-NEXT:    v_and_or_b32 v6, v7, v5, v4
8176; GFX940-NEXT:    buffer_wbl2 sc1
8177; GFX940-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0
8178; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8179; GFX940-NEXT:    buffer_inv sc1
8180; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
8181; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
8182; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
8183; GFX940-NEXT:    s_cbranch_execnz .LBB36_1
8184; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
8185; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
8186; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v4
8187; GFX940-NEXT:    s_setpc_b64 s[30:31]
8188;
8189; GFX11-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
8190; GFX11:       ; %bb.0:
8191; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8192; GFX11-NEXT:    v_mov_b32_e32 v3, v0
8193; GFX11-NEXT:    s_mov_b32 s0, 0
8194; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
8195; GFX11-NEXT:    v_and_b32_e32 v0, -4, v3
8196; GFX11-NEXT:    v_and_b32_e32 v3, 3, v3
8197; GFX11-NEXT:    flat_load_b32 v5, v[0:1]
8198; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8199; GFX11-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
8200; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
8201; GFX11-NEXT:    v_not_b32_e32 v4, v4
8202; GFX11-NEXT:  .LBB36_1: ; %atomicrmw.start
8203; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
8204; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8205; GFX11-NEXT:    v_mov_b32_e32 v6, v5
8206; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8207; GFX11-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
8208; GFX11-NEXT:    v_add_f16_e32 v5, v5, v2
8209; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8210; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
8211; GFX11-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
8212; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
8213; GFX11-NEXT:    v_and_or_b32 v5, v6, v4, v5
8214; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
8215; GFX11-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
8216; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8217; GFX11-NEXT:    buffer_gl1_inv
8218; GFX11-NEXT:    buffer_gl0_inv
8219; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
8220; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
8221; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
8222; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
8223; GFX11-NEXT:    s_cbranch_execnz .LBB36_1
8224; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
8225; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
8226; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
8227; GFX11-NEXT:    s_setpc_b64 s[30:31]
8228;
8229; GFX10-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
8230; GFX10:       ; %bb.0:
8231; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8232; GFX10-NEXT:    v_mov_b32_e32 v3, v0
8233; GFX10-NEXT:    s_mov_b32 s4, 0
8234; GFX10-NEXT:    v_and_b32_e32 v0, -4, v3
8235; GFX10-NEXT:    v_and_b32_e32 v3, 3, v3
8236; GFX10-NEXT:    flat_load_dword v5, v[0:1]
8237; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8238; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
8239; GFX10-NEXT:    v_not_b32_e32 v4, v4
8240; GFX10-NEXT:  .LBB36_1: ; %atomicrmw.start
8241; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
8242; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8243; GFX10-NEXT:    v_mov_b32_e32 v6, v5
8244; GFX10-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
8245; GFX10-NEXT:    v_add_f16_e32 v5, v5, v2
8246; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
8247; GFX10-NEXT:    v_and_or_b32 v5, v6, v4, v5
8248; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
8249; GFX10-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
8250; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8251; GFX10-NEXT:    buffer_gl1_inv
8252; GFX10-NEXT:    buffer_gl0_inv
8253; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
8254; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
8255; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
8256; GFX10-NEXT:    s_cbranch_execnz .LBB36_1
8257; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
8258; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
8259; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
8260; GFX10-NEXT:    s_setpc_b64 s[30:31]
8261;
8262; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
8263; GFX90A:       ; %bb.0:
8264; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8265; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
8266; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v3
8267; GFX90A-NEXT:    flat_load_dword v4, v[0:1]
8268; GFX90A-NEXT:    v_and_b32_e32 v3, 3, v3
8269; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8270; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
8271; GFX90A-NEXT:    v_lshlrev_b32_e64 v5, v3, s4
8272; GFX90A-NEXT:    v_not_b32_e32 v5, v5
8273; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
8274; GFX90A-NEXT:  .LBB36_1: ; %atomicrmw.start
8275; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
8276; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8277; GFX90A-NEXT:    v_mov_b32_e32 v7, v4
8278; GFX90A-NEXT:    v_lshrrev_b32_e32 v4, v3, v7
8279; GFX90A-NEXT:    v_add_f16_e32 v4, v4, v2
8280; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
8281; GFX90A-NEXT:    v_and_or_b32 v6, v7, v5, v4
8282; GFX90A-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
8283; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8284; GFX90A-NEXT:    buffer_wbinvl1
8285; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
8286; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8287; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8288; GFX90A-NEXT:    s_cbranch_execnz .LBB36_1
8289; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
8290; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
8291; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v4
8292; GFX90A-NEXT:    s_setpc_b64 s[30:31]
8293;
8294; GFX908-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
8295; GFX908:       ; %bb.0:
8296; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8297; GFX908-NEXT:    v_mov_b32_e32 v3, v0
8298; GFX908-NEXT:    v_and_b32_e32 v0, -4, v3
8299; GFX908-NEXT:    flat_load_dword v4, v[0:1]
8300; GFX908-NEXT:    v_and_b32_e32 v3, 3, v3
8301; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8302; GFX908-NEXT:    s_mov_b32 s4, 0xffff
8303; GFX908-NEXT:    v_lshlrev_b32_e64 v5, v3, s4
8304; GFX908-NEXT:    v_not_b32_e32 v5, v5
8305; GFX908-NEXT:    s_mov_b64 s[4:5], 0
8306; GFX908-NEXT:  .LBB36_1: ; %atomicrmw.start
8307; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
8308; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8309; GFX908-NEXT:    v_mov_b32_e32 v7, v4
8310; GFX908-NEXT:    v_lshrrev_b32_e32 v4, v3, v7
8311; GFX908-NEXT:    v_add_f16_e32 v4, v4, v2
8312; GFX908-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
8313; GFX908-NEXT:    v_and_or_b32 v6, v7, v5, v4
8314; GFX908-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
8315; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8316; GFX908-NEXT:    buffer_wbinvl1
8317; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
8318; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8319; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8320; GFX908-NEXT:    s_cbranch_execnz .LBB36_1
8321; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
8322; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
8323; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v4
8324; GFX908-NEXT:    s_setpc_b64 s[30:31]
8325;
8326; GFX8-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
8327; GFX8:       ; %bb.0:
8328; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8329; GFX8-NEXT:    v_mov_b32_e32 v3, v0
8330; GFX8-NEXT:    v_and_b32_e32 v0, -4, v3
8331; GFX8-NEXT:    flat_load_dword v5, v[0:1]
8332; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
8333; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8334; GFX8-NEXT:    s_mov_b32 s4, 0xffff
8335; GFX8-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
8336; GFX8-NEXT:    v_not_b32_e32 v4, v4
8337; GFX8-NEXT:    s_mov_b64 s[4:5], 0
8338; GFX8-NEXT:  .LBB36_1: ; %atomicrmw.start
8339; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
8340; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8341; GFX8-NEXT:    v_mov_b32_e32 v6, v5
8342; GFX8-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
8343; GFX8-NEXT:    v_add_f16_e32 v5, v5, v2
8344; GFX8-NEXT:    v_and_b32_e32 v7, v6, v4
8345; GFX8-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
8346; GFX8-NEXT:    v_or_b32_e32 v5, v7, v5
8347; GFX8-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
8348; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8349; GFX8-NEXT:    buffer_wbinvl1
8350; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
8351; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8352; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8353; GFX8-NEXT:    s_cbranch_execnz .LBB36_1
8354; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
8355; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
8356; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
8357; GFX8-NEXT:    s_setpc_b64 s[30:31]
8358;
8359; GFX7-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
8360; GFX7:       ; %bb.0:
8361; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8362; GFX7-NEXT:    v_mov_b32_e32 v3, v0
8363; GFX7-NEXT:    v_and_b32_e32 v0, -4, v3
8364; GFX7-NEXT:    flat_load_dword v5, v[0:1]
8365; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v2
8366; GFX7-NEXT:    v_and_b32_e32 v2, 3, v3
8367; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
8368; GFX7-NEXT:    s_mov_b64 s[4:5], 0
8369; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v4
8370; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
8371; GFX7-NEXT:    v_not_b32_e32 v4, v4
8372; GFX7-NEXT:  .LBB36_1: ; %atomicrmw.start
8373; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
8374; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8375; GFX7-NEXT:    v_mov_b32_e32 v6, v5
8376; GFX7-NEXT:    v_lshrrev_b32_e32 v5, v2, v6
8377; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
8378; GFX7-NEXT:    v_and_b32_e32 v7, v6, v4
8379; GFX7-NEXT:    v_add_f32_e32 v5, v5, v3
8380; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
8381; GFX7-NEXT:    v_lshlrev_b32_e32 v5, v2, v5
8382; GFX7-NEXT:    v_or_b32_e32 v5, v7, v5
8383; GFX7-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
8384; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8385; GFX7-NEXT:    buffer_wbinvl1
8386; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
8387; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8388; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8389; GFX7-NEXT:    s_cbranch_execnz .LBB36_1
8390; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
8391; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
8392; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v2, v5
8393; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
8394; GFX7-NEXT:    s_setpc_b64 s[30:31]
8395  %result = atomicrmw fadd ptr %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
8396  ret half %result
8397}
8398
; Value-returning `atomicrmw fadd` of a half through a flat pointer, agent
; syncscope, seq_cst, tagged !amdgpu.no.fine.grained.memory. The address is
; %ptr plus 1023 half elements, i.e. a byte offset of 0x7fe.
;
; What the checks below pin down for every run line:
;  * the 0x7fe offset is added to the pointer with explicit add instructions;
;    the flat load and flat cmpswap carry no offset operand;
;  * the operation is expanded into a flat_atomic_cmpswap retry loop on the
;    enclosing dword: (addr & -4) is the dword address, (addr & 3) << 3 is the
;    bit shift of the half inside that dword, and ~(0xffff << shift) clears
;    the old half before the new one is merged in;
;  * the loop exits once the value returned by cmpswap equals the expected
;    value, and the result is the returned dword shifted right by the same
;    amount.
; On the hawaii run the addition is done in f32 (v_cvt_f32_f16 / v_add_f32 /
; v_cvt_f16_f32) and the returned value is converted to f32 as well.
; NOTE(review): "offset12b_pos" presumably refers to a positive offset sized
; for a 12-bit immediate offset field; confirm against the sibling tests.
8399define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
8400; GFX12-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
8401; GFX12:       ; %bb.0:
8402; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
8403; GFX12-NEXT:    s_wait_expcnt 0x0
8404; GFX12-NEXT:    s_wait_samplecnt 0x0
8405; GFX12-NEXT:    s_wait_bvhcnt 0x0
8406; GFX12-NEXT:    s_wait_kmcnt 0x0
8407; GFX12-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
8408; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
8409; GFX12-NEXT:    s_mov_b32 s0, 0
8410; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
8411; GFX12-NEXT:    v_and_b32_e32 v0, -4, v3
8412; GFX12-NEXT:    v_and_b32_e32 v3, 3, v3
8413; GFX12-NEXT:    flat_load_b32 v5, v[0:1]
8414; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8415; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
8416; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
8417; GFX12-NEXT:    v_not_b32_e32 v4, v4
8418; GFX12-NEXT:  .LBB37_1: ; %atomicrmw.start
8419; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
8420; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
8421; GFX12-NEXT:    v_mov_b32_e32 v6, v5
8422; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8423; GFX12-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
8424; GFX12-NEXT:    v_add_f16_e32 v5, v5, v2
8425; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8426; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff, v5
8427; GFX12-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
8428; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
8429; GFX12-NEXT:    v_and_or_b32 v5, v6, v4, v5
8430; GFX12-NEXT:    s_wait_storecnt 0x0
8431; GFX12-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
8432; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
8433; GFX12-NEXT:    global_inv scope:SCOPE_DEV
8434; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
8435; GFX12-NEXT:    s_wait_alu 0xfffe
8436; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
8437; GFX12-NEXT:    s_wait_alu 0xfffe
8438; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
8439; GFX12-NEXT:    s_cbranch_execnz .LBB37_1
8440; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
8441; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
8442; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
8443; GFX12-NEXT:    s_wait_alu 0xfffe
8444; GFX12-NEXT:    s_setpc_b64 s[30:31]
8445;
8446; GFX940-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
8447; GFX940:       ; %bb.0:
8448; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8449; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
8450; GFX940-NEXT:    v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1]
8451; GFX940-NEXT:    v_and_b32_e32 v0, -4, v6
8452; GFX940-NEXT:    v_mov_b32_e32 v1, v7
8453; GFX940-NEXT:    flat_load_dword v4, v[0:1]
8454; GFX940-NEXT:    v_and_b32_e32 v3, 3, v6
8455; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8456; GFX940-NEXT:    s_mov_b32 s0, 0xffff
8457; GFX940-NEXT:    v_lshlrev_b32_e64 v5, v3, s0
8458; GFX940-NEXT:    v_not_b32_e32 v5, v5
8459; GFX940-NEXT:    s_mov_b64 s[0:1], 0
8460; GFX940-NEXT:  .LBB37_1: ; %atomicrmw.start
8461; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
8462; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8463; GFX940-NEXT:    v_mov_b32_e32 v7, v4
8464; GFX940-NEXT:    v_lshrrev_b32_e32 v4, v3, v7
8465; GFX940-NEXT:    v_add_f16_e32 v4, v4, v2
8466; GFX940-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
8467; GFX940-NEXT:    v_and_or_b32 v6, v7, v5, v4
8468; GFX940-NEXT:    buffer_wbl2 sc1
8469; GFX940-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0
8470; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8471; GFX940-NEXT:    buffer_inv sc1
8472; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
8473; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
8474; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
8475; GFX940-NEXT:    s_cbranch_execnz .LBB37_1
8476; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
8477; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
8478; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v4
8479; GFX940-NEXT:    s_setpc_b64 s[30:31]
8480;
8481; GFX11-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
8482; GFX11:       ; %bb.0:
8483; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8484; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
8485; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
8486; GFX11-NEXT:    s_mov_b32 s0, 0
8487; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
8488; GFX11-NEXT:    v_and_b32_e32 v0, -4, v3
8489; GFX11-NEXT:    v_and_b32_e32 v3, 3, v3
8490; GFX11-NEXT:    flat_load_b32 v5, v[0:1]
8491; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8492; GFX11-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
8493; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
8494; GFX11-NEXT:    v_not_b32_e32 v4, v4
8495; GFX11-NEXT:  .LBB37_1: ; %atomicrmw.start
8496; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
8497; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8498; GFX11-NEXT:    v_mov_b32_e32 v6, v5
8499; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8500; GFX11-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
8501; GFX11-NEXT:    v_add_f16_e32 v5, v5, v2
8502; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8503; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
8504; GFX11-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
8505; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
8506; GFX11-NEXT:    v_and_or_b32 v5, v6, v4, v5
8507; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
8508; GFX11-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
8509; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8510; GFX11-NEXT:    buffer_gl1_inv
8511; GFX11-NEXT:    buffer_gl0_inv
8512; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
8513; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
8514; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
8515; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
8516; GFX11-NEXT:    s_cbranch_execnz .LBB37_1
8517; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
8518; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
8519; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
8520; GFX11-NEXT:    s_setpc_b64 s[30:31]
8521;
8522; GFX10-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
8523; GFX10:       ; %bb.0:
8524; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8525; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
8526; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
8527; GFX10-NEXT:    s_mov_b32 s4, 0
8528; GFX10-NEXT:    v_and_b32_e32 v0, -4, v3
8529; GFX10-NEXT:    v_and_b32_e32 v3, 3, v3
8530; GFX10-NEXT:    flat_load_dword v5, v[0:1]
8531; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8532; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
8533; GFX10-NEXT:    v_not_b32_e32 v4, v4
8534; GFX10-NEXT:  .LBB37_1: ; %atomicrmw.start
8535; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
8536; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8537; GFX10-NEXT:    v_mov_b32_e32 v6, v5
8538; GFX10-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
8539; GFX10-NEXT:    v_add_f16_e32 v5, v5, v2
8540; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
8541; GFX10-NEXT:    v_and_or_b32 v5, v6, v4, v5
8542; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
8543; GFX10-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
8544; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8545; GFX10-NEXT:    buffer_gl1_inv
8546; GFX10-NEXT:    buffer_gl0_inv
8547; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
8548; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
8549; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
8550; GFX10-NEXT:    s_cbranch_execnz .LBB37_1
8551; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
8552; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
8553; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
8554; GFX10-NEXT:    s_setpc_b64 s[30:31]
8555;
8556; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
8557; GFX90A:       ; %bb.0:
8558; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8559; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
8560; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
8561; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v3
8562; GFX90A-NEXT:    flat_load_dword v4, v[0:1]
8563; GFX90A-NEXT:    v_and_b32_e32 v3, 3, v3
8564; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8565; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
8566; GFX90A-NEXT:    v_lshlrev_b32_e64 v5, v3, s4
8567; GFX90A-NEXT:    v_not_b32_e32 v5, v5
8568; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
8569; GFX90A-NEXT:  .LBB37_1: ; %atomicrmw.start
8570; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
8571; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8572; GFX90A-NEXT:    v_mov_b32_e32 v7, v4
8573; GFX90A-NEXT:    v_lshrrev_b32_e32 v4, v3, v7
8574; GFX90A-NEXT:    v_add_f16_e32 v4, v4, v2
8575; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
8576; GFX90A-NEXT:    v_and_or_b32 v6, v7, v5, v4
8577; GFX90A-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
8578; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8579; GFX90A-NEXT:    buffer_wbinvl1
8580; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
8581; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8582; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8583; GFX90A-NEXT:    s_cbranch_execnz .LBB37_1
8584; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
8585; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
8586; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v4
8587; GFX90A-NEXT:    s_setpc_b64 s[30:31]
8588;
8589; GFX908-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
8590; GFX908:       ; %bb.0:
8591; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8592; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
8593; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
8594; GFX908-NEXT:    v_and_b32_e32 v0, -4, v3
8595; GFX908-NEXT:    flat_load_dword v4, v[0:1]
8596; GFX908-NEXT:    v_and_b32_e32 v3, 3, v3
8597; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8598; GFX908-NEXT:    s_mov_b32 s4, 0xffff
8599; GFX908-NEXT:    v_lshlrev_b32_e64 v5, v3, s4
8600; GFX908-NEXT:    v_not_b32_e32 v5, v5
8601; GFX908-NEXT:    s_mov_b64 s[4:5], 0
8602; GFX908-NEXT:  .LBB37_1: ; %atomicrmw.start
8603; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
8604; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8605; GFX908-NEXT:    v_mov_b32_e32 v7, v4
8606; GFX908-NEXT:    v_lshrrev_b32_e32 v4, v3, v7
8607; GFX908-NEXT:    v_add_f16_e32 v4, v4, v2
8608; GFX908-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
8609; GFX908-NEXT:    v_and_or_b32 v6, v7, v5, v4
8610; GFX908-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
8611; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8612; GFX908-NEXT:    buffer_wbinvl1
8613; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
8614; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8615; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8616; GFX908-NEXT:    s_cbranch_execnz .LBB37_1
8617; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
8618; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
8619; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v4
8620; GFX908-NEXT:    s_setpc_b64 s[30:31]
8621;
8622; GFX8-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
8623; GFX8:       ; %bb.0:
8624; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8625; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
8626; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
8627; GFX8-NEXT:    v_and_b32_e32 v0, -4, v3
8628; GFX8-NEXT:    flat_load_dword v5, v[0:1]
8629; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
8630; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8631; GFX8-NEXT:    s_mov_b32 s4, 0xffff
8632; GFX8-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
8633; GFX8-NEXT:    v_not_b32_e32 v4, v4
8634; GFX8-NEXT:    s_mov_b64 s[4:5], 0
8635; GFX8-NEXT:  .LBB37_1: ; %atomicrmw.start
8636; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
8637; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8638; GFX8-NEXT:    v_mov_b32_e32 v6, v5
8639; GFX8-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
8640; GFX8-NEXT:    v_add_f16_e32 v5, v5, v2
8641; GFX8-NEXT:    v_and_b32_e32 v7, v6, v4
8642; GFX8-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
8643; GFX8-NEXT:    v_or_b32_e32 v5, v7, v5
8644; GFX8-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
8645; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8646; GFX8-NEXT:    buffer_wbinvl1
8647; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
8648; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8649; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8650; GFX8-NEXT:    s_cbranch_execnz .LBB37_1
8651; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
8652; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
8653; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
8654; GFX8-NEXT:    s_setpc_b64 s[30:31]
8655;
8656; GFX7-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
8657; GFX7:       ; %bb.0:
8658; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8659; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 0x7fe, v0
8660; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
8661; GFX7-NEXT:    v_and_b32_e32 v0, -4, v3
8662; GFX7-NEXT:    flat_load_dword v5, v[0:1]
8663; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v2
8664; GFX7-NEXT:    v_and_b32_e32 v2, 3, v3
8665; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
8666; GFX7-NEXT:    s_mov_b64 s[4:5], 0
8667; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v4
8668; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
8669; GFX7-NEXT:    v_not_b32_e32 v4, v4
8670; GFX7-NEXT:  .LBB37_1: ; %atomicrmw.start
8671; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
8672; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8673; GFX7-NEXT:    v_mov_b32_e32 v6, v5
8674; GFX7-NEXT:    v_lshrrev_b32_e32 v5, v2, v6
8675; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
8676; GFX7-NEXT:    v_and_b32_e32 v7, v6, v4
8677; GFX7-NEXT:    v_add_f32_e32 v5, v5, v3
8678; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
8679; GFX7-NEXT:    v_lshlrev_b32_e32 v5, v2, v5
8680; GFX7-NEXT:    v_or_b32_e32 v5, v7, v5
8681; GFX7-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
8682; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8683; GFX7-NEXT:    buffer_wbinvl1
8684; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
8685; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8686; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8687; GFX7-NEXT:    s_cbranch_execnz .LBB37_1
8688; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
8689; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
8690; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v2, v5
8691; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
8692; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; 1023 half elements * 2 bytes each = byte offset 2046 (0x7fe).
8693  %gep = getelementptr half, ptr %ptr, i64 1023
8694  %result = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
8695  ret half %result
8696}
8697
; Value-returning `atomicrmw fadd` of a half through a flat pointer, agent
; syncscope, seq_cst, tagged !amdgpu.no.fine.grained.memory. The address is
; %ptr minus 1024 half elements, i.e. a byte offset of -2048.
;
; What the checks below pin down for every run line:
;  * the negative offset is applied with explicit 64-bit address arithmetic
;    (0xfffff800 for the low dword and -1 for the high dword); the flat load
;    and flat cmpswap carry no offset operand;
;  * the operation is expanded into a flat_atomic_cmpswap retry loop on the
;    enclosing dword: (addr & -4) is the dword address, (addr & 3) << 3 is the
;    bit shift of the half inside that dword, and ~(0xffff << shift) clears
;    the old half before the new one is merged in;
;  * the loop exits once the value returned by cmpswap equals the expected
;    value, and the result is the returned dword shifted right by the same
;    amount.
; On the hawaii run the addition is done in f32 (v_cvt_f32_f16 / v_add_f32 /
; v_cvt_f16_f32) and the returned value is converted to f32 as well.
; NOTE(review): "offset12b_neg" presumably refers to a negative offset sized
; for a 12-bit immediate offset field; confirm against the sibling tests.
8698define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
8699; GFX12-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
8700; GFX12:       ; %bb.0:
8701; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
8702; GFX12-NEXT:    s_wait_expcnt 0x0
8703; GFX12-NEXT:    s_wait_samplecnt 0x0
8704; GFX12-NEXT:    s_wait_bvhcnt 0x0
8705; GFX12-NEXT:    s_wait_kmcnt 0x0
8706; GFX12-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
8707; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
8708; GFX12-NEXT:    s_mov_b32 s0, 0
8709; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
8710; GFX12-NEXT:    v_and_b32_e32 v0, -4, v3
8711; GFX12-NEXT:    v_and_b32_e32 v3, 3, v3
8712; GFX12-NEXT:    flat_load_b32 v5, v[0:1]
8713; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8714; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
8715; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
8716; GFX12-NEXT:    v_not_b32_e32 v4, v4
8717; GFX12-NEXT:  .LBB38_1: ; %atomicrmw.start
8718; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
8719; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
8720; GFX12-NEXT:    v_mov_b32_e32 v6, v5
8721; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8722; GFX12-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
8723; GFX12-NEXT:    v_add_f16_e32 v5, v5, v2
8724; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8725; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff, v5
8726; GFX12-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
8727; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
8728; GFX12-NEXT:    v_and_or_b32 v5, v6, v4, v5
8729; GFX12-NEXT:    s_wait_storecnt 0x0
8730; GFX12-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
8731; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
8732; GFX12-NEXT:    global_inv scope:SCOPE_DEV
8733; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
8734; GFX12-NEXT:    s_wait_alu 0xfffe
8735; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
8736; GFX12-NEXT:    s_wait_alu 0xfffe
8737; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
8738; GFX12-NEXT:    s_cbranch_execnz .LBB38_1
8739; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
8740; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
8741; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
8742; GFX12-NEXT:    s_wait_alu 0xfffe
8743; GFX12-NEXT:    s_setpc_b64 s[30:31]
8744;
8745; GFX940-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
8746; GFX940:       ; %bb.0:
8747; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8748; GFX940-NEXT:    s_movk_i32 s0, 0xf800
8749; GFX940-NEXT:    s_mov_b32 s1, -1
8750; GFX940-NEXT:    v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1]
8751; GFX940-NEXT:    v_and_b32_e32 v0, -4, v6
8752; GFX940-NEXT:    v_mov_b32_e32 v1, v7
8753; GFX940-NEXT:    flat_load_dword v4, v[0:1]
8754; GFX940-NEXT:    v_and_b32_e32 v3, 3, v6
8755; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8756; GFX940-NEXT:    s_mov_b32 s0, 0xffff
8757; GFX940-NEXT:    v_lshlrev_b32_e64 v5, v3, s0
8758; GFX940-NEXT:    v_not_b32_e32 v5, v5
8759; GFX940-NEXT:    s_mov_b64 s[0:1], 0
8760; GFX940-NEXT:  .LBB38_1: ; %atomicrmw.start
8761; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
8762; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8763; GFX940-NEXT:    v_mov_b32_e32 v7, v4
8764; GFX940-NEXT:    v_lshrrev_b32_e32 v4, v3, v7
8765; GFX940-NEXT:    v_add_f16_e32 v4, v4, v2
8766; GFX940-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
8767; GFX940-NEXT:    v_and_or_b32 v6, v7, v5, v4
8768; GFX940-NEXT:    buffer_wbl2 sc1
8769; GFX940-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0
8770; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8771; GFX940-NEXT:    buffer_inv sc1
8772; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
8773; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
8774; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
8775; GFX940-NEXT:    s_cbranch_execnz .LBB38_1
8776; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
8777; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
8778; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v4
8779; GFX940-NEXT:    s_setpc_b64 s[30:31]
8780;
8781; GFX11-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
8782; GFX11:       ; %bb.0:
8783; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8784; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
8785; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
8786; GFX11-NEXT:    s_mov_b32 s0, 0
8787; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
8788; GFX11-NEXT:    v_and_b32_e32 v0, -4, v3
8789; GFX11-NEXT:    v_and_b32_e32 v3, 3, v3
8790; GFX11-NEXT:    flat_load_b32 v5, v[0:1]
8791; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8792; GFX11-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
8793; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
8794; GFX11-NEXT:    v_not_b32_e32 v4, v4
8795; GFX11-NEXT:  .LBB38_1: ; %atomicrmw.start
8796; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
8797; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8798; GFX11-NEXT:    v_mov_b32_e32 v6, v5
8799; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8800; GFX11-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
8801; GFX11-NEXT:    v_add_f16_e32 v5, v5, v2
8802; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8803; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
8804; GFX11-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
8805; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
8806; GFX11-NEXT:    v_and_or_b32 v5, v6, v4, v5
8807; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
8808; GFX11-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
8809; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8810; GFX11-NEXT:    buffer_gl1_inv
8811; GFX11-NEXT:    buffer_gl0_inv
8812; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
8813; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
8814; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
8815; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
8816; GFX11-NEXT:    s_cbranch_execnz .LBB38_1
8817; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
8818; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
8819; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
8820; GFX11-NEXT:    s_setpc_b64 s[30:31]
8821;
8822; GFX10-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
8823; GFX10:       ; %bb.0:
8824; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8825; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
8826; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
8827; GFX10-NEXT:    s_mov_b32 s4, 0
8828; GFX10-NEXT:    v_and_b32_e32 v0, -4, v3
8829; GFX10-NEXT:    v_and_b32_e32 v3, 3, v3
8830; GFX10-NEXT:    flat_load_dword v5, v[0:1]
8831; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8832; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
8833; GFX10-NEXT:    v_not_b32_e32 v4, v4
8834; GFX10-NEXT:  .LBB38_1: ; %atomicrmw.start
8835; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
8836; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8837; GFX10-NEXT:    v_mov_b32_e32 v6, v5
8838; GFX10-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
8839; GFX10-NEXT:    v_add_f16_e32 v5, v5, v2
8840; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
8841; GFX10-NEXT:    v_and_or_b32 v5, v6, v4, v5
8842; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
8843; GFX10-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
8844; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8845; GFX10-NEXT:    buffer_gl1_inv
8846; GFX10-NEXT:    buffer_gl0_inv
8847; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
8848; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
8849; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
8850; GFX10-NEXT:    s_cbranch_execnz .LBB38_1
8851; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
8852; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
8853; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
8854; GFX10-NEXT:    s_setpc_b64 s[30:31]
8855;
8856; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
8857; GFX90A:       ; %bb.0:
8858; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8859; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
8860; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
8861; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v3
8862; GFX90A-NEXT:    flat_load_dword v4, v[0:1]
8863; GFX90A-NEXT:    v_and_b32_e32 v3, 3, v3
8864; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8865; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
8866; GFX90A-NEXT:    v_lshlrev_b32_e64 v5, v3, s4
8867; GFX90A-NEXT:    v_not_b32_e32 v5, v5
8868; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
8869; GFX90A-NEXT:  .LBB38_1: ; %atomicrmw.start
8870; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
8871; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8872; GFX90A-NEXT:    v_mov_b32_e32 v7, v4
8873; GFX90A-NEXT:    v_lshrrev_b32_e32 v4, v3, v7
8874; GFX90A-NEXT:    v_add_f16_e32 v4, v4, v2
8875; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
8876; GFX90A-NEXT:    v_and_or_b32 v6, v7, v5, v4
8877; GFX90A-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
8878; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8879; GFX90A-NEXT:    buffer_wbinvl1
8880; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
8881; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8882; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8883; GFX90A-NEXT:    s_cbranch_execnz .LBB38_1
8884; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
8885; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
8886; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v4
8887; GFX90A-NEXT:    s_setpc_b64 s[30:31]
8888;
8889; GFX908-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
8890; GFX908:       ; %bb.0:
8891; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8892; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
8893; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
8894; GFX908-NEXT:    v_and_b32_e32 v0, -4, v3
8895; GFX908-NEXT:    flat_load_dword v4, v[0:1]
8896; GFX908-NEXT:    v_and_b32_e32 v3, 3, v3
8897; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8898; GFX908-NEXT:    s_mov_b32 s4, 0xffff
8899; GFX908-NEXT:    v_lshlrev_b32_e64 v5, v3, s4
8900; GFX908-NEXT:    v_not_b32_e32 v5, v5
8901; GFX908-NEXT:    s_mov_b64 s[4:5], 0
8902; GFX908-NEXT:  .LBB38_1: ; %atomicrmw.start
8903; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
8904; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8905; GFX908-NEXT:    v_mov_b32_e32 v7, v4
8906; GFX908-NEXT:    v_lshrrev_b32_e32 v4, v3, v7
8907; GFX908-NEXT:    v_add_f16_e32 v4, v4, v2
8908; GFX908-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
8909; GFX908-NEXT:    v_and_or_b32 v6, v7, v5, v4
8910; GFX908-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
8911; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8912; GFX908-NEXT:    buffer_wbinvl1
8913; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
8914; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8915; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8916; GFX908-NEXT:    s_cbranch_execnz .LBB38_1
8917; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
8918; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
8919; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v4
8920; GFX908-NEXT:    s_setpc_b64 s[30:31]
8921;
8922; GFX8-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
8923; GFX8:       ; %bb.0:
8924; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8925; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
8926; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
8927; GFX8-NEXT:    v_and_b32_e32 v0, -4, v3
8928; GFX8-NEXT:    flat_load_dword v5, v[0:1]
8929; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
8930; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8931; GFX8-NEXT:    s_mov_b32 s4, 0xffff
8932; GFX8-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
8933; GFX8-NEXT:    v_not_b32_e32 v4, v4
8934; GFX8-NEXT:    s_mov_b64 s[4:5], 0
8935; GFX8-NEXT:  .LBB38_1: ; %atomicrmw.start
8936; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
8937; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8938; GFX8-NEXT:    v_mov_b32_e32 v6, v5
8939; GFX8-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
8940; GFX8-NEXT:    v_add_f16_e32 v5, v5, v2
8941; GFX8-NEXT:    v_and_b32_e32 v7, v6, v4
8942; GFX8-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
8943; GFX8-NEXT:    v_or_b32_e32 v5, v7, v5
8944; GFX8-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
8945; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8946; GFX8-NEXT:    buffer_wbinvl1
8947; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
8948; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8949; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8950; GFX8-NEXT:    s_cbranch_execnz .LBB38_1
8951; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
8952; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
8953; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
8954; GFX8-NEXT:    s_setpc_b64 s[30:31]
8955;
8956; GFX7-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
8957; GFX7:       ; %bb.0:
8958; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8959; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 0xfffff800, v0
8960; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
8961; GFX7-NEXT:    v_and_b32_e32 v0, -4, v3
8962; GFX7-NEXT:    flat_load_dword v5, v[0:1]
8963; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v2
8964; GFX7-NEXT:    v_and_b32_e32 v2, 3, v3
8965; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
8966; GFX7-NEXT:    s_mov_b64 s[4:5], 0
8967; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v4
8968; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
8969; GFX7-NEXT:    v_not_b32_e32 v4, v4
8970; GFX7-NEXT:  .LBB38_1: ; %atomicrmw.start
8971; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
8972; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8973; GFX7-NEXT:    v_mov_b32_e32 v6, v5
8974; GFX7-NEXT:    v_lshrrev_b32_e32 v5, v2, v6
8975; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
8976; GFX7-NEXT:    v_and_b32_e32 v7, v6, v4
8977; GFX7-NEXT:    v_add_f32_e32 v5, v5, v3
8978; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
8979; GFX7-NEXT:    v_lshlrev_b32_e32 v5, v2, v5
8980; GFX7-NEXT:    v_or_b32_e32 v5, v7, v5
8981; GFX7-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
8982; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8983; GFX7-NEXT:    buffer_wbinvl1
8984; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
8985; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8986; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8987; GFX7-NEXT:    s_cbranch_execnz .LBB38_1
8988; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
8989; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
8990; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v2, v5
8991; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
8992; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; -1024 half elements * 2 bytes each = byte offset -2048 (low dword 0xfffff800).
8993  %gep = getelementptr half, ptr %ptr, i64 -1024
8994  %result = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
8995  ret half %result
8996 }
8997
8998define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
8999; GFX12-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
9000; GFX12:       ; %bb.0:
9001; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
9002; GFX12-NEXT:    s_wait_expcnt 0x0
9003; GFX12-NEXT:    s_wait_samplecnt 0x0
9004; GFX12-NEXT:    s_wait_bvhcnt 0x0
9005; GFX12-NEXT:    s_wait_kmcnt 0x0
9006; GFX12-NEXT:    v_mov_b32_e32 v3, v0
9007; GFX12-NEXT:    s_mov_b32 s0, 0
9008; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
9009; GFX12-NEXT:    v_and_b32_e32 v0, -4, v3
9010; GFX12-NEXT:    v_and_b32_e32 v3, 3, v3
9011; GFX12-NEXT:    flat_load_b32 v4, v[0:1]
9012; GFX12-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
9013; GFX12-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
9014; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
9015; GFX12-NEXT:    v_not_b32_e32 v6, v3
9016; GFX12-NEXT:  .LBB39_1: ; %atomicrmw.start
9017; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
9018; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
9019; GFX12-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
9020; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9021; GFX12-NEXT:    v_add_f16_e32 v3, v3, v2
9022; GFX12-NEXT:    v_and_b32_e32 v3, 0xffff, v3
9023; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9024; GFX12-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
9025; GFX12-NEXT:    v_and_or_b32 v3, v4, v6, v3
9026; GFX12-NEXT:    s_wait_storecnt 0x0
9027; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
9028; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
9029; GFX12-NEXT:    global_inv scope:SCOPE_DEV
9030; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
9031; GFX12-NEXT:    v_mov_b32_e32 v4, v3
9032; GFX12-NEXT:    s_wait_alu 0xfffe
9033; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
9034; GFX12-NEXT:    s_wait_alu 0xfffe
9035; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
9036; GFX12-NEXT:    s_cbranch_execnz .LBB39_1
9037; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
9038; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
9039; GFX12-NEXT:    s_wait_alu 0xfffe
9040; GFX12-NEXT:    s_setpc_b64 s[30:31]
9041;
9042; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
9043; GFX940:       ; %bb.0:
9044; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9045; GFX940-NEXT:    v_mov_b32_e32 v3, v0
9046; GFX940-NEXT:    v_and_b32_e32 v0, -4, v3
9047; GFX940-NEXT:    flat_load_dword v5, v[0:1]
9048; GFX940-NEXT:    v_and_b32_e32 v3, 3, v3
9049; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9050; GFX940-NEXT:    s_mov_b32 s0, 0xffff
9051; GFX940-NEXT:    v_lshlrev_b32_e64 v4, v3, s0
9052; GFX940-NEXT:    v_not_b32_e32 v6, v4
9053; GFX940-NEXT:    s_mov_b64 s[0:1], 0
9054; GFX940-NEXT:  .LBB39_1: ; %atomicrmw.start
9055; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
9056; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9057; GFX940-NEXT:    v_lshrrev_b32_e32 v4, v3, v5
9058; GFX940-NEXT:    v_add_f16_e32 v4, v4, v2
9059; GFX940-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
9060; GFX940-NEXT:    v_and_or_b32 v4, v5, v6, v4
9061; GFX940-NEXT:    buffer_wbl2 sc1
9062; GFX940-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0
9063; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9064; GFX940-NEXT:    buffer_inv sc1
9065; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
9066; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
9067; GFX940-NEXT:    v_mov_b32_e32 v5, v4
9068; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
9069; GFX940-NEXT:    s_cbranch_execnz .LBB39_1
9070; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
9071; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
9072; GFX940-NEXT:    s_setpc_b64 s[30:31]
9073;
9074; GFX11-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
9075; GFX11:       ; %bb.0:
9076; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9077; GFX11-NEXT:    v_mov_b32_e32 v3, v0
9078; GFX11-NEXT:    s_mov_b32 s0, 0
9079; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
9080; GFX11-NEXT:    v_and_b32_e32 v0, -4, v3
9081; GFX11-NEXT:    v_and_b32_e32 v3, 3, v3
9082; GFX11-NEXT:    flat_load_b32 v4, v[0:1]
9083; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
9084; GFX11-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
9085; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
9086; GFX11-NEXT:    v_not_b32_e32 v6, v3
9087; GFX11-NEXT:  .LBB39_1: ; %atomicrmw.start
9088; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
9089; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9090; GFX11-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
9091; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9092; GFX11-NEXT:    v_add_f16_e32 v3, v3, v2
9093; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
9094; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9095; GFX11-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
9096; GFX11-NEXT:    v_and_or_b32 v3, v4, v6, v3
9097; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
9098; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
9099; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9100; GFX11-NEXT:    buffer_gl1_inv
9101; GFX11-NEXT:    buffer_gl0_inv
9102; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
9103; GFX11-NEXT:    v_mov_b32_e32 v4, v3
9104; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
9105; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
9106; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
9107; GFX11-NEXT:    s_cbranch_execnz .LBB39_1
9108; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
9109; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
9110; GFX11-NEXT:    s_setpc_b64 s[30:31]
9111;
9112; GFX10-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
9113; GFX10:       ; %bb.0:
9114; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9115; GFX10-NEXT:    v_mov_b32_e32 v3, v0
9116; GFX10-NEXT:    s_mov_b32 s4, 0
9117; GFX10-NEXT:    v_and_b32_e32 v0, -4, v3
9118; GFX10-NEXT:    v_and_b32_e32 v3, 3, v3
9119; GFX10-NEXT:    flat_load_dword v4, v[0:1]
9120; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
9121; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
9122; GFX10-NEXT:    v_not_b32_e32 v6, v3
9123; GFX10-NEXT:  .LBB39_1: ; %atomicrmw.start
9124; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
9125; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9126; GFX10-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
9127; GFX10-NEXT:    v_add_f16_e32 v3, v3, v2
9128; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
9129; GFX10-NEXT:    v_and_or_b32 v3, v4, v6, v3
9130; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
9131; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
9132; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9133; GFX10-NEXT:    buffer_gl1_inv
9134; GFX10-NEXT:    buffer_gl0_inv
9135; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
9136; GFX10-NEXT:    v_mov_b32_e32 v4, v3
9137; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
9138; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
9139; GFX10-NEXT:    s_cbranch_execnz .LBB39_1
9140; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
9141; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
9142; GFX10-NEXT:    s_setpc_b64 s[30:31]
9143;
9144; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
9145; GFX90A:       ; %bb.0:
9146; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9147; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
9148; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v3
9149; GFX90A-NEXT:    flat_load_dword v5, v[0:1]
9150; GFX90A-NEXT:    v_and_b32_e32 v3, 3, v3
9151; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9152; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
9153; GFX90A-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
9154; GFX90A-NEXT:    v_not_b32_e32 v6, v4
9155; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
9156; GFX90A-NEXT:  .LBB39_1: ; %atomicrmw.start
9157; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
9158; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9159; GFX90A-NEXT:    v_lshrrev_b32_e32 v4, v3, v5
9160; GFX90A-NEXT:    v_add_f16_e32 v4, v4, v2
9161; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
9162; GFX90A-NEXT:    v_and_or_b32 v4, v5, v6, v4
9163; GFX90A-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
9164; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9165; GFX90A-NEXT:    buffer_wbinvl1
9166; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
9167; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
9168; GFX90A-NEXT:    v_mov_b32_e32 v5, v4
9169; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
9170; GFX90A-NEXT:    s_cbranch_execnz .LBB39_1
9171; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
9172; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
9173; GFX90A-NEXT:    s_setpc_b64 s[30:31]
9174;
9175; GFX908-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
9176; GFX908:       ; %bb.0:
9177; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9178; GFX908-NEXT:    v_mov_b32_e32 v3, v0
9179; GFX908-NEXT:    v_and_b32_e32 v0, -4, v3
9180; GFX908-NEXT:    flat_load_dword v4, v[0:1]
9181; GFX908-NEXT:    v_and_b32_e32 v3, 3, v3
9182; GFX908-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
9183; GFX908-NEXT:    s_mov_b32 s4, 0xffff
9184; GFX908-NEXT:    v_lshlrev_b32_e64 v3, v5, s4
9185; GFX908-NEXT:    v_not_b32_e32 v6, v3
9186; GFX908-NEXT:    s_mov_b64 s[4:5], 0
9187; GFX908-NEXT:  .LBB39_1: ; %atomicrmw.start
9188; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
9189; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9190; GFX908-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
9191; GFX908-NEXT:    v_add_f16_e32 v3, v3, v2
9192; GFX908-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
9193; GFX908-NEXT:    v_and_or_b32 v3, v4, v6, v3
9194; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
9195; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9196; GFX908-NEXT:    buffer_wbinvl1
9197; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
9198; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
9199; GFX908-NEXT:    v_mov_b32_e32 v4, v3
9200; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
9201; GFX908-NEXT:    s_cbranch_execnz .LBB39_1
9202; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
9203; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
9204; GFX908-NEXT:    s_setpc_b64 s[30:31]
9205;
9206; GFX8-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
9207; GFX8:       ; %bb.0:
9208; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9209; GFX8-NEXT:    v_mov_b32_e32 v3, v0
9210; GFX8-NEXT:    v_and_b32_e32 v0, -4, v3
9211; GFX8-NEXT:    flat_load_dword v4, v[0:1]
9212; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
9213; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
9214; GFX8-NEXT:    s_mov_b32 s4, 0xffff
9215; GFX8-NEXT:    v_lshlrev_b32_e64 v3, v5, s4
9216; GFX8-NEXT:    v_not_b32_e32 v6, v3
9217; GFX8-NEXT:    s_mov_b64 s[4:5], 0
9218; GFX8-NEXT:  .LBB39_1: ; %atomicrmw.start
9219; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
9220; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9221; GFX8-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
9222; GFX8-NEXT:    v_add_f16_e32 v3, v3, v2
9223; GFX8-NEXT:    v_and_b32_e32 v7, v4, v6
9224; GFX8-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
9225; GFX8-NEXT:    v_or_b32_e32 v3, v7, v3
9226; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
9227; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9228; GFX8-NEXT:    buffer_wbinvl1
9229; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
9230; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
9231; GFX8-NEXT:    v_mov_b32_e32 v4, v3
9232; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
9233; GFX8-NEXT:    s_cbranch_execnz .LBB39_1
9234; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
9235; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
9236; GFX8-NEXT:    s_setpc_b64 s[30:31]
9237;
9238; GFX7-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
9239; GFX7:       ; %bb.0:
9240; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9241; GFX7-NEXT:    v_mov_b32_e32 v3, v0
9242; GFX7-NEXT:    v_and_b32_e32 v0, -4, v3
9243; GFX7-NEXT:    flat_load_dword v4, v[0:1]
9244; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v2
9245; GFX7-NEXT:    v_and_b32_e32 v2, 3, v3
9246; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
9247; GFX7-NEXT:    v_lshl_b32_e32 v3, 0xffff, v2
9248; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
9249; GFX7-NEXT:    v_not_b32_e32 v6, v3
9250; GFX7-NEXT:    s_mov_b64 s[4:5], 0
9251; GFX7-NEXT:  .LBB39_1: ; %atomicrmw.start
9252; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
9253; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9254; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v2, v4
9255; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
9256; GFX7-NEXT:    v_and_b32_e32 v7, v4, v6
9257; GFX7-NEXT:    v_add_f32_e32 v3, v3, v5
9258; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
9259; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
9260; GFX7-NEXT:    v_or_b32_e32 v3, v7, v3
9261; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
9262; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9263; GFX7-NEXT:    buffer_wbinvl1
9264; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
9265; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
9266; GFX7-NEXT:    v_mov_b32_e32 v4, v3
9267; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
9268; GFX7-NEXT:    s_cbranch_execnz .LBB39_1
9269; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
9270; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
9271; GFX7-NEXT:    s_setpc_b64 s[30:31]
9272  %unused = atomicrmw fadd ptr %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
9273  ret void
9274}
9275
; Test: seq_cst, agent-scope `atomicrmw fadd` of a half through a flat pointer
; at a positive constant offset, with the result unused (no-return form) and
; !amdgpu.no.fine.grained.memory attached.
;
; In the expected output below, every run line expands the operation into a
; compare-and-swap loop on the 4-byte-aligned dword that contains the half:
; the address is masked with -4, the low two address bits are scaled by 8 to
; get a bit shift, and a 0xffff mask shifted by that amount selects the lanes
; to replace before flat_atomic_cmpswap is retried until it succeeds.
; The hawaii run has no f16 add in its output; it converts to f32, adds, and
; converts back inside the loop.
;
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing by hand.
9276define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
9277; GFX12-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
9278; GFX12:       ; %bb.0:
9279; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
9280; GFX12-NEXT:    s_wait_expcnt 0x0
9281; GFX12-NEXT:    s_wait_samplecnt 0x0
9282; GFX12-NEXT:    s_wait_bvhcnt 0x0
9283; GFX12-NEXT:    s_wait_kmcnt 0x0
9284; GFX12-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
9285; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
9286; GFX12-NEXT:    s_mov_b32 s0, 0
9287; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
9288; GFX12-NEXT:    v_and_b32_e32 v0, -4, v3
9289; GFX12-NEXT:    v_and_b32_e32 v3, 3, v3
9290; GFX12-NEXT:    flat_load_b32 v4, v[0:1]
9291; GFX12-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
9292; GFX12-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
9293; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
9294; GFX12-NEXT:    v_not_b32_e32 v6, v3
9295; GFX12-NEXT:  .LBB40_1: ; %atomicrmw.start
9296; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
9297; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
9298; GFX12-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
9299; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9300; GFX12-NEXT:    v_add_f16_e32 v3, v3, v2
9301; GFX12-NEXT:    v_and_b32_e32 v3, 0xffff, v3
9302; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9303; GFX12-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
9304; GFX12-NEXT:    v_and_or_b32 v3, v4, v6, v3
9305; GFX12-NEXT:    s_wait_storecnt 0x0
9306; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
9307; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
9308; GFX12-NEXT:    global_inv scope:SCOPE_DEV
9309; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
9310; GFX12-NEXT:    v_mov_b32_e32 v4, v3
9311; GFX12-NEXT:    s_wait_alu 0xfffe
9312; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
9313; GFX12-NEXT:    s_wait_alu 0xfffe
9314; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
9315; GFX12-NEXT:    s_cbranch_execnz .LBB40_1
9316; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
9317; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
9318; GFX12-NEXT:    s_wait_alu 0xfffe
9319; GFX12-NEXT:    s_setpc_b64 s[30:31]
9320;
9321; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
9322; GFX940:       ; %bb.0:
9323; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9324; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
9325; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
9326; GFX940-NEXT:    v_and_b32_e32 v0, -4, v4
9327; GFX940-NEXT:    v_mov_b32_e32 v1, v5
9328; GFX940-NEXT:    flat_load_dword v5, v[0:1]
9329; GFX940-NEXT:    v_and_b32_e32 v3, 3, v4
9330; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9331; GFX940-NEXT:    s_mov_b32 s0, 0xffff
9332; GFX940-NEXT:    v_lshlrev_b32_e64 v4, v3, s0
9333; GFX940-NEXT:    v_not_b32_e32 v6, v4
9334; GFX940-NEXT:    s_mov_b64 s[0:1], 0
9335; GFX940-NEXT:  .LBB40_1: ; %atomicrmw.start
9336; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
9337; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9338; GFX940-NEXT:    v_lshrrev_b32_e32 v4, v3, v5
9339; GFX940-NEXT:    v_add_f16_e32 v4, v4, v2
9340; GFX940-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
9341; GFX940-NEXT:    v_and_or_b32 v4, v5, v6, v4
9342; GFX940-NEXT:    buffer_wbl2 sc1
9343; GFX940-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0
9344; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9345; GFX940-NEXT:    buffer_inv sc1
9346; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
9347; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
9348; GFX940-NEXT:    v_mov_b32_e32 v5, v4
9349; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
9350; GFX940-NEXT:    s_cbranch_execnz .LBB40_1
9351; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
9352; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
9353; GFX940-NEXT:    s_setpc_b64 s[30:31]
9354;
9355; GFX11-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
9356; GFX11:       ; %bb.0:
9357; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9358; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
9359; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
9360; GFX11-NEXT:    s_mov_b32 s0, 0
9361; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
9362; GFX11-NEXT:    v_and_b32_e32 v0, -4, v3
9363; GFX11-NEXT:    v_and_b32_e32 v3, 3, v3
9364; GFX11-NEXT:    flat_load_b32 v4, v[0:1]
9365; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
9366; GFX11-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
9367; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
9368; GFX11-NEXT:    v_not_b32_e32 v6, v3
9369; GFX11-NEXT:  .LBB40_1: ; %atomicrmw.start
9370; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
9371; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9372; GFX11-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
9373; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9374; GFX11-NEXT:    v_add_f16_e32 v3, v3, v2
9375; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
9376; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9377; GFX11-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
9378; GFX11-NEXT:    v_and_or_b32 v3, v4, v6, v3
9379; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
9380; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
9381; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9382; GFX11-NEXT:    buffer_gl1_inv
9383; GFX11-NEXT:    buffer_gl0_inv
9384; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
9385; GFX11-NEXT:    v_mov_b32_e32 v4, v3
9386; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
9387; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
9388; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
9389; GFX11-NEXT:    s_cbranch_execnz .LBB40_1
9390; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
9391; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
9392; GFX11-NEXT:    s_setpc_b64 s[30:31]
9393;
9394; GFX10-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
9395; GFX10:       ; %bb.0:
9396; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9397; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
9398; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
9399; GFX10-NEXT:    s_mov_b32 s4, 0
9400; GFX10-NEXT:    v_and_b32_e32 v0, -4, v3
9401; GFX10-NEXT:    v_and_b32_e32 v3, 3, v3
9402; GFX10-NEXT:    flat_load_dword v4, v[0:1]
9403; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
9404; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
9405; GFX10-NEXT:    v_not_b32_e32 v6, v3
9406; GFX10-NEXT:  .LBB40_1: ; %atomicrmw.start
9407; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
9408; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9409; GFX10-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
9410; GFX10-NEXT:    v_add_f16_e32 v3, v3, v2
9411; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
9412; GFX10-NEXT:    v_and_or_b32 v3, v4, v6, v3
9413; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
9414; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
9415; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9416; GFX10-NEXT:    buffer_gl1_inv
9417; GFX10-NEXT:    buffer_gl0_inv
9418; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
9419; GFX10-NEXT:    v_mov_b32_e32 v4, v3
9420; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
9421; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
9422; GFX10-NEXT:    s_cbranch_execnz .LBB40_1
9423; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
9424; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
9425; GFX10-NEXT:    s_setpc_b64 s[30:31]
9426;
9427; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
9428; GFX90A:       ; %bb.0:
9429; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9430; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
9431; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
9432; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v3
9433; GFX90A-NEXT:    flat_load_dword v5, v[0:1]
9434; GFX90A-NEXT:    v_and_b32_e32 v3, 3, v3
9435; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9436; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
9437; GFX90A-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
9438; GFX90A-NEXT:    v_not_b32_e32 v6, v4
9439; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
9440; GFX90A-NEXT:  .LBB40_1: ; %atomicrmw.start
9441; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
9442; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9443; GFX90A-NEXT:    v_lshrrev_b32_e32 v4, v3, v5
9444; GFX90A-NEXT:    v_add_f16_e32 v4, v4, v2
9445; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
9446; GFX90A-NEXT:    v_and_or_b32 v4, v5, v6, v4
9447; GFX90A-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
9448; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9449; GFX90A-NEXT:    buffer_wbinvl1
9450; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
9451; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
9452; GFX90A-NEXT:    v_mov_b32_e32 v5, v4
9453; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
9454; GFX90A-NEXT:    s_cbranch_execnz .LBB40_1
9455; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
9456; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
9457; GFX90A-NEXT:    s_setpc_b64 s[30:31]
9458;
9459; GFX908-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
9460; GFX908:       ; %bb.0:
9461; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9462; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
9463; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
9464; GFX908-NEXT:    v_and_b32_e32 v0, -4, v3
9465; GFX908-NEXT:    flat_load_dword v4, v[0:1]
9466; GFX908-NEXT:    v_and_b32_e32 v3, 3, v3
9467; GFX908-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
9468; GFX908-NEXT:    s_mov_b32 s4, 0xffff
9469; GFX908-NEXT:    v_lshlrev_b32_e64 v3, v5, s4
9470; GFX908-NEXT:    v_not_b32_e32 v6, v3
9471; GFX908-NEXT:    s_mov_b64 s[4:5], 0
9472; GFX908-NEXT:  .LBB40_1: ; %atomicrmw.start
9473; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
9474; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9475; GFX908-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
9476; GFX908-NEXT:    v_add_f16_e32 v3, v3, v2
9477; GFX908-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
9478; GFX908-NEXT:    v_and_or_b32 v3, v4, v6, v3
9479; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
9480; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9481; GFX908-NEXT:    buffer_wbinvl1
9482; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
9483; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
9484; GFX908-NEXT:    v_mov_b32_e32 v4, v3
9485; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
9486; GFX908-NEXT:    s_cbranch_execnz .LBB40_1
9487; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
9488; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
9489; GFX908-NEXT:    s_setpc_b64 s[30:31]
9490;
9491; GFX8-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
9492; GFX8:       ; %bb.0:
9493; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9494; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
9495; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
9496; GFX8-NEXT:    v_and_b32_e32 v0, -4, v3
9497; GFX8-NEXT:    flat_load_dword v4, v[0:1]
9498; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
9499; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
9500; GFX8-NEXT:    s_mov_b32 s4, 0xffff
9501; GFX8-NEXT:    v_lshlrev_b32_e64 v3, v5, s4
9502; GFX8-NEXT:    v_not_b32_e32 v6, v3
9503; GFX8-NEXT:    s_mov_b64 s[4:5], 0
9504; GFX8-NEXT:  .LBB40_1: ; %atomicrmw.start
9505; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
9506; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9507; GFX8-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
9508; GFX8-NEXT:    v_add_f16_e32 v3, v3, v2
9509; GFX8-NEXT:    v_and_b32_e32 v7, v4, v6
9510; GFX8-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
9511; GFX8-NEXT:    v_or_b32_e32 v3, v7, v3
9512; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
9513; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9514; GFX8-NEXT:    buffer_wbinvl1
9515; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
9516; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
9517; GFX8-NEXT:    v_mov_b32_e32 v4, v3
9518; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
9519; GFX8-NEXT:    s_cbranch_execnz .LBB40_1
9520; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
9521; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
9522; GFX8-NEXT:    s_setpc_b64 s[30:31]
9523;
9524; GFX7-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
9525; GFX7:       ; %bb.0:
9526; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9527; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
9528; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
9529; GFX7-NEXT:    v_and_b32_e32 v0, -4, v4
9530; GFX7-NEXT:    flat_load_dword v3, v[0:1]
9531; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
9532; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
9533; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
9534; GFX7-NEXT:    s_mov_b64 s[4:5], 0
9535; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v2
9536; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
9537; GFX7-NEXT:    v_not_b32_e32 v6, v2
9538; GFX7-NEXT:  .LBB40_1: ; %atomicrmw.start
9539; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
9540; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9541; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
9542; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
9543; GFX7-NEXT:    v_and_b32_e32 v7, v3, v6
9544; GFX7-NEXT:    v_add_f32_e32 v2, v2, v5
9545; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
9546; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
9547; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
9548; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9549; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9550; GFX7-NEXT:    buffer_wbinvl1
9551; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
9552; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
9553; GFX7-NEXT:    v_mov_b32_e32 v3, v2
9554; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
9555; GFX7-NEXT:    s_cbranch_execnz .LBB40_1
9556; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
9557; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
9558; GFX7-NEXT:    s_setpc_b64 s[30:31]
; Index 1023 in units of half (2 bytes) is a byte offset of 2046 = 0x7fe, which
; is the constant the expected output above adds to the incoming pointer.
9559  %gep = getelementptr half, ptr %ptr, i64 1023
; The loaded value is deliberately left unused so the no-return lowering is tested.
9560  %unused = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
9561  ret void
9562}
9563
9564define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
9565; GFX12-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
9566; GFX12:       ; %bb.0:
9567; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
9568; GFX12-NEXT:    s_wait_expcnt 0x0
9569; GFX12-NEXT:    s_wait_samplecnt 0x0
9570; GFX12-NEXT:    s_wait_bvhcnt 0x0
9571; GFX12-NEXT:    s_wait_kmcnt 0x0
9572; GFX12-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
9573; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
9574; GFX12-NEXT:    s_mov_b32 s0, 0
9575; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
9576; GFX12-NEXT:    v_and_b32_e32 v0, -4, v3
9577; GFX12-NEXT:    v_and_b32_e32 v3, 3, v3
9578; GFX12-NEXT:    flat_load_b32 v4, v[0:1]
9579; GFX12-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
9580; GFX12-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
9581; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
9582; GFX12-NEXT:    v_not_b32_e32 v6, v3
9583; GFX12-NEXT:  .LBB41_1: ; %atomicrmw.start
9584; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
9585; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
9586; GFX12-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
9587; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9588; GFX12-NEXT:    v_add_f16_e32 v3, v3, v2
9589; GFX12-NEXT:    v_and_b32_e32 v3, 0xffff, v3
9590; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9591; GFX12-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
9592; GFX12-NEXT:    v_and_or_b32 v3, v4, v6, v3
9593; GFX12-NEXT:    s_wait_storecnt 0x0
9594; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
9595; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
9596; GFX12-NEXT:    global_inv scope:SCOPE_DEV
9597; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
9598; GFX12-NEXT:    v_mov_b32_e32 v4, v3
9599; GFX12-NEXT:    s_wait_alu 0xfffe
9600; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
9601; GFX12-NEXT:    s_wait_alu 0xfffe
9602; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
9603; GFX12-NEXT:    s_cbranch_execnz .LBB41_1
9604; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
9605; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
9606; GFX12-NEXT:    s_wait_alu 0xfffe
9607; GFX12-NEXT:    s_setpc_b64 s[30:31]
9608;
9609; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
9610; GFX940:       ; %bb.0:
9611; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9612; GFX940-NEXT:    s_movk_i32 s0, 0xf800
9613; GFX940-NEXT:    s_mov_b32 s1, -1
9614; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
9615; GFX940-NEXT:    v_and_b32_e32 v0, -4, v4
9616; GFX940-NEXT:    v_mov_b32_e32 v1, v5
9617; GFX940-NEXT:    flat_load_dword v5, v[0:1]
9618; GFX940-NEXT:    v_and_b32_e32 v3, 3, v4
9619; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9620; GFX940-NEXT:    s_mov_b32 s0, 0xffff
9621; GFX940-NEXT:    v_lshlrev_b32_e64 v4, v3, s0
9622; GFX940-NEXT:    v_not_b32_e32 v6, v4
9623; GFX940-NEXT:    s_mov_b64 s[0:1], 0
9624; GFX940-NEXT:  .LBB41_1: ; %atomicrmw.start
9625; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
9626; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9627; GFX940-NEXT:    v_lshrrev_b32_e32 v4, v3, v5
9628; GFX940-NEXT:    v_add_f16_e32 v4, v4, v2
9629; GFX940-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
9630; GFX940-NEXT:    v_and_or_b32 v4, v5, v6, v4
9631; GFX940-NEXT:    buffer_wbl2 sc1
9632; GFX940-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0
9633; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9634; GFX940-NEXT:    buffer_inv sc1
9635; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
9636; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
9637; GFX940-NEXT:    v_mov_b32_e32 v5, v4
9638; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
9639; GFX940-NEXT:    s_cbranch_execnz .LBB41_1
9640; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
9641; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
9642; GFX940-NEXT:    s_setpc_b64 s[30:31]
9643;
9644; GFX11-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
9645; GFX11:       ; %bb.0:
9646; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9647; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
9648; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
9649; GFX11-NEXT:    s_mov_b32 s0, 0
9650; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
9651; GFX11-NEXT:    v_and_b32_e32 v0, -4, v3
9652; GFX11-NEXT:    v_and_b32_e32 v3, 3, v3
9653; GFX11-NEXT:    flat_load_b32 v4, v[0:1]
9654; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
9655; GFX11-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
9656; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
9657; GFX11-NEXT:    v_not_b32_e32 v6, v3
9658; GFX11-NEXT:  .LBB41_1: ; %atomicrmw.start
9659; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
9660; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9661; GFX11-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
9662; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9663; GFX11-NEXT:    v_add_f16_e32 v3, v3, v2
9664; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
9665; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9666; GFX11-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
9667; GFX11-NEXT:    v_and_or_b32 v3, v4, v6, v3
9668; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
9669; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
9670; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9671; GFX11-NEXT:    buffer_gl1_inv
9672; GFX11-NEXT:    buffer_gl0_inv
9673; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
9674; GFX11-NEXT:    v_mov_b32_e32 v4, v3
9675; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
9676; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
9677; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
9678; GFX11-NEXT:    s_cbranch_execnz .LBB41_1
9679; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
9680; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
9681; GFX11-NEXT:    s_setpc_b64 s[30:31]
9682;
9683; GFX10-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
9684; GFX10:       ; %bb.0:
9685; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9686; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
9687; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
9688; GFX10-NEXT:    s_mov_b32 s4, 0
9689; GFX10-NEXT:    v_and_b32_e32 v0, -4, v3
9690; GFX10-NEXT:    v_and_b32_e32 v3, 3, v3
9691; GFX10-NEXT:    flat_load_dword v4, v[0:1]
9692; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
9693; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
9694; GFX10-NEXT:    v_not_b32_e32 v6, v3
9695; GFX10-NEXT:  .LBB41_1: ; %atomicrmw.start
9696; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
9697; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9698; GFX10-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
9699; GFX10-NEXT:    v_add_f16_e32 v3, v3, v2
9700; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
9701; GFX10-NEXT:    v_and_or_b32 v3, v4, v6, v3
9702; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
9703; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
9704; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9705; GFX10-NEXT:    buffer_gl1_inv
9706; GFX10-NEXT:    buffer_gl0_inv
9707; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
9708; GFX10-NEXT:    v_mov_b32_e32 v4, v3
9709; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
9710; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
9711; GFX10-NEXT:    s_cbranch_execnz .LBB41_1
9712; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
9713; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
9714; GFX10-NEXT:    s_setpc_b64 s[30:31]
9715;
9716; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
9717; GFX90A:       ; %bb.0:
9718; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9719; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
9720; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
9721; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v3
9722; GFX90A-NEXT:    flat_load_dword v5, v[0:1]
9723; GFX90A-NEXT:    v_and_b32_e32 v3, 3, v3
9724; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9725; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
9726; GFX90A-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
9727; GFX90A-NEXT:    v_not_b32_e32 v6, v4
9728; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
9729; GFX90A-NEXT:  .LBB41_1: ; %atomicrmw.start
9730; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
9731; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9732; GFX90A-NEXT:    v_lshrrev_b32_e32 v4, v3, v5
9733; GFX90A-NEXT:    v_add_f16_e32 v4, v4, v2
9734; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
9735; GFX90A-NEXT:    v_and_or_b32 v4, v5, v6, v4
9736; GFX90A-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
9737; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9738; GFX90A-NEXT:    buffer_wbinvl1
9739; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
9740; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
9741; GFX90A-NEXT:    v_mov_b32_e32 v5, v4
9742; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
9743; GFX90A-NEXT:    s_cbranch_execnz .LBB41_1
9744; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
9745; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
9746; GFX90A-NEXT:    s_setpc_b64 s[30:31]
9747;
9748; GFX908-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
9749; GFX908:       ; %bb.0:
9750; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9751; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
9752; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
9753; GFX908-NEXT:    v_and_b32_e32 v0, -4, v3
9754; GFX908-NEXT:    flat_load_dword v4, v[0:1]
9755; GFX908-NEXT:    v_and_b32_e32 v3, 3, v3
9756; GFX908-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
9757; GFX908-NEXT:    s_mov_b32 s4, 0xffff
9758; GFX908-NEXT:    v_lshlrev_b32_e64 v3, v5, s4
9759; GFX908-NEXT:    v_not_b32_e32 v6, v3
9760; GFX908-NEXT:    s_mov_b64 s[4:5], 0
9761; GFX908-NEXT:  .LBB41_1: ; %atomicrmw.start
9762; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
9763; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9764; GFX908-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
9765; GFX908-NEXT:    v_add_f16_e32 v3, v3, v2
9766; GFX908-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
9767; GFX908-NEXT:    v_and_or_b32 v3, v4, v6, v3
9768; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
9769; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9770; GFX908-NEXT:    buffer_wbinvl1
9771; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
9772; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
9773; GFX908-NEXT:    v_mov_b32_e32 v4, v3
9774; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
9775; GFX908-NEXT:    s_cbranch_execnz .LBB41_1
9776; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
9777; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
9778; GFX908-NEXT:    s_setpc_b64 s[30:31]
9779;
9780; GFX8-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
9781; GFX8:       ; %bb.0:
9782; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9783; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
9784; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
9785; GFX8-NEXT:    v_and_b32_e32 v0, -4, v3
9786; GFX8-NEXT:    flat_load_dword v4, v[0:1]
9787; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
9788; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
9789; GFX8-NEXT:    s_mov_b32 s4, 0xffff
9790; GFX8-NEXT:    v_lshlrev_b32_e64 v3, v5, s4
9791; GFX8-NEXT:    v_not_b32_e32 v6, v3
9792; GFX8-NEXT:    s_mov_b64 s[4:5], 0
9793; GFX8-NEXT:  .LBB41_1: ; %atomicrmw.start
9794; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
9795; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9796; GFX8-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
9797; GFX8-NEXT:    v_add_f16_e32 v3, v3, v2
9798; GFX8-NEXT:    v_and_b32_e32 v7, v4, v6
9799; GFX8-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
9800; GFX8-NEXT:    v_or_b32_e32 v3, v7, v3
9801; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
9802; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9803; GFX8-NEXT:    buffer_wbinvl1
9804; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
9805; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
9806; GFX8-NEXT:    v_mov_b32_e32 v4, v3
9807; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
9808; GFX8-NEXT:    s_cbranch_execnz .LBB41_1
9809; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
9810; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
9811; GFX8-NEXT:    s_setpc_b64 s[30:31]
9812;
9813; GFX7-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
9814; GFX7:       ; %bb.0:
9815; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9816; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
9817; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
9818; GFX7-NEXT:    v_and_b32_e32 v0, -4, v4
9819; GFX7-NEXT:    flat_load_dword v3, v[0:1]
9820; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
9821; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
9822; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
9823; GFX7-NEXT:    s_mov_b64 s[4:5], 0
9824; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v2
9825; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
9826; GFX7-NEXT:    v_not_b32_e32 v6, v2
9827; GFX7-NEXT:  .LBB41_1: ; %atomicrmw.start
9828; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
9829; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9830; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
9831; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
9832; GFX7-NEXT:    v_and_b32_e32 v7, v3, v6
9833; GFX7-NEXT:    v_add_f32_e32 v2, v2, v5
9834; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
9835; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
9836; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
9837; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9838; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9839; GFX7-NEXT:    buffer_wbinvl1
9840; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
9841; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
9842; GFX7-NEXT:    v_mov_b32_e32 v3, v2
9843; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
9844; GFX7-NEXT:    s_cbranch_execnz .LBB41_1
9845; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
9846; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
9847; GFX7-NEXT:    s_setpc_b64 s[30:31]
9848  %gep = getelementptr half, ptr %ptr, i64 -1024
9849  %unused = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
9850  ret void
9851}
9852
; No-return f16 `atomicrmw fadd` (syncscope "agent", seq_cst) on a flat pointer
; at %ptr + 1023 halves, i.e. byte offset 2046 (0x7fe), with `align 4` on the
; atomic and !amdgpu.no.fine.grained.memory attached.
;
; What the expected output below shows:
;  * Every target expands the operation into a 32-bit compare-and-swap retry
;    loop (flat_atomic_cmpswap / flat_atomic_cmpswap_b32) that exits once the
;    returned dword equals the expected one.
;  * The new dword is built by keeping the upper 16 bits of the loaded value
;    (mask 0xffff0000) and OR-ing in the f16 sum; no address masking or
;    per-lane shift amount is computed.
;  * GFX12, GFX940, GFX11, GFX90A and GFX908 fold the offset into the memory
;    instructions (offset:2046); GFX10, GFX8 and GFX7 add 0x7fe to the 64-bit
;    pointer first.
;  * GFX7 performs the addition in f32, converting with v_cvt_f32_f16 and
;    v_cvt_f16_f32 around v_add_f32.
9853define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
9854; GFX12-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
9855; GFX12:       ; %bb.0:
9856; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
9857; GFX12-NEXT:    s_wait_expcnt 0x0
9858; GFX12-NEXT:    s_wait_samplecnt 0x0
9859; GFX12-NEXT:    s_wait_bvhcnt 0x0
9860; GFX12-NEXT:    s_wait_kmcnt 0x0
9861; GFX12-NEXT:    flat_load_b32 v4, v[0:1] offset:2046
9862; GFX12-NEXT:    s_mov_b32 s0, 0
9863; GFX12-NEXT:  .LBB42_1: ; %atomicrmw.start
9864; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
9865; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
9866; GFX12-NEXT:    v_add_f16_e32 v3, v4, v2
9867; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9868; GFX12-NEXT:    v_and_b32_e32 v3, 0xffff, v3
9869; GFX12-NEXT:    v_and_or_b32 v3, 0xffff0000, v4, v3
9870; GFX12-NEXT:    s_wait_storecnt 0x0
9871; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
9872; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
9873; GFX12-NEXT:    global_inv scope:SCOPE_DEV
9874; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
9875; GFX12-NEXT:    v_mov_b32_e32 v4, v3
9876; GFX12-NEXT:    s_wait_alu 0xfffe
9877; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
9878; GFX12-NEXT:    s_wait_alu 0xfffe
9879; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
9880; GFX12-NEXT:    s_cbranch_execnz .LBB42_1
9881; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
9882; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
9883; GFX12-NEXT:    s_wait_alu 0xfffe
9884; GFX12-NEXT:    s_setpc_b64 s[30:31]
9885;
9886; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
9887; GFX940:       ; %bb.0:
9888; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9889; GFX940-NEXT:    flat_load_dword v5, v[0:1] offset:2046
9890; GFX940-NEXT:    s_mov_b64 s[0:1], 0
9891; GFX940-NEXT:    s_mov_b32 s2, 0xffff0000
9892; GFX940-NEXT:  .LBB42_1: ; %atomicrmw.start
9893; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
9894; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9895; GFX940-NEXT:    v_add_f16_e32 v3, v5, v2
9896; GFX940-NEXT:    v_and_or_b32 v4, v5, s2, v3
9897; GFX940-NEXT:    buffer_wbl2 sc1
9898; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0
9899; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9900; GFX940-NEXT:    buffer_inv sc1
9901; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
9902; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
9903; GFX940-NEXT:    v_mov_b32_e32 v5, v3
9904; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
9905; GFX940-NEXT:    s_cbranch_execnz .LBB42_1
9906; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
9907; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
9908; GFX940-NEXT:    s_setpc_b64 s[30:31]
9909;
9910; GFX11-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
9911; GFX11:       ; %bb.0:
9912; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9913; GFX11-NEXT:    flat_load_b32 v4, v[0:1] offset:2046
9914; GFX11-NEXT:    s_mov_b32 s0, 0
9915; GFX11-NEXT:  .LBB42_1: ; %atomicrmw.start
9916; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
9917; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9918; GFX11-NEXT:    v_add_f16_e32 v3, v4, v2
9919; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9920; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
9921; GFX11-NEXT:    v_and_or_b32 v3, 0xffff0000, v4, v3
9922; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
9923; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
9924; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9925; GFX11-NEXT:    buffer_gl1_inv
9926; GFX11-NEXT:    buffer_gl0_inv
9927; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
9928; GFX11-NEXT:    v_mov_b32_e32 v4, v3
9929; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
9930; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
9931; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
9932; GFX11-NEXT:    s_cbranch_execnz .LBB42_1
9933; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
9934; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
9935; GFX11-NEXT:    s_setpc_b64 s[30:31]
9936;
9937; GFX10-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
9938; GFX10:       ; %bb.0:
9939; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9940; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fe, v0
9941; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
9942; GFX10-NEXT:    s_mov_b32 s4, 0
9943; GFX10-NEXT:    flat_load_dword v4, v[0:1]
9944; GFX10-NEXT:  .LBB42_1: ; %atomicrmw.start
9945; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
9946; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9947; GFX10-NEXT:    v_add_f16_e32 v3, v4, v2
9948; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff, v3
9949; GFX10-NEXT:    v_and_or_b32 v3, 0xffff0000, v4, v3
9950; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
9951; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
9952; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9953; GFX10-NEXT:    buffer_gl1_inv
9954; GFX10-NEXT:    buffer_gl0_inv
9955; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
9956; GFX10-NEXT:    v_mov_b32_e32 v4, v3
9957; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
9958; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
9959; GFX10-NEXT:    s_cbranch_execnz .LBB42_1
9960; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
9961; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
9962; GFX10-NEXT:    s_setpc_b64 s[30:31]
9963;
9964; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
9965; GFX90A:       ; %bb.0:
9966; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9967; GFX90A-NEXT:    flat_load_dword v5, v[0:1] offset:2046
9968; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
9969; GFX90A-NEXT:    s_mov_b32 s6, 0xffff0000
9970; GFX90A-NEXT:  .LBB42_1: ; %atomicrmw.start
9971; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
9972; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9973; GFX90A-NEXT:    v_add_f16_e32 v3, v5, v2
9974; GFX90A-NEXT:    v_and_or_b32 v4, v5, s6, v3
9975; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc
9976; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9977; GFX90A-NEXT:    buffer_wbinvl1
9978; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
9979; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
9980; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
9981; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
9982; GFX90A-NEXT:    s_cbranch_execnz .LBB42_1
9983; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
9984; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
9985; GFX90A-NEXT:    s_setpc_b64 s[30:31]
9986;
9987; GFX908-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
9988; GFX908:       ; %bb.0:
9989; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9990; GFX908-NEXT:    flat_load_dword v4, v[0:1] offset:2046
9991; GFX908-NEXT:    s_mov_b64 s[4:5], 0
9992; GFX908-NEXT:    s_mov_b32 s6, 0xffff0000
9993; GFX908-NEXT:  .LBB42_1: ; %atomicrmw.start
9994; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
9995; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9996; GFX908-NEXT:    v_add_f16_e32 v3, v4, v2
9997; GFX908-NEXT:    v_and_or_b32 v3, v4, s6, v3
9998; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc
9999; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10000; GFX908-NEXT:    buffer_wbinvl1
10001; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
10002; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10003; GFX908-NEXT:    v_mov_b32_e32 v4, v3
10004; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10005; GFX908-NEXT:    s_cbranch_execnz .LBB42_1
10006; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
10007; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
10008; GFX908-NEXT:    s_setpc_b64 s[30:31]
10009;
10010; GFX8-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
10011; GFX8:       ; %bb.0:
10012; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10013; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fe, v0
10014; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
10015; GFX8-NEXT:    flat_load_dword v4, v[0:1]
10016; GFX8-NEXT:    s_mov_b64 s[4:5], 0
10017; GFX8-NEXT:  .LBB42_1: ; %atomicrmw.start
10018; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
10019; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10020; GFX8-NEXT:    v_add_f16_e32 v3, v4, v2
10021; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
10022; GFX8-NEXT:    v_or_b32_e32 v3, v5, v3
10023; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
10024; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10025; GFX8-NEXT:    buffer_wbinvl1
10026; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
10027; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10028; GFX8-NEXT:    v_mov_b32_e32 v4, v3
10029; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10030; GFX8-NEXT:    s_cbranch_execnz .LBB42_1
10031; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
10032; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
10033; GFX8-NEXT:    s_setpc_b64 s[30:31]
10034;
10035; GFX7-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
10036; GFX7:       ; %bb.0:
10037; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10038; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fe, v0
10039; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
10040; GFX7-NEXT:    flat_load_dword v3, v[0:1]
10041; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
10042; GFX7-NEXT:    s_mov_b64 s[4:5], 0
10043; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v2
10044; GFX7-NEXT:  .LBB42_1: ; %atomicrmw.start
10045; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
10046; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10047; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v3
10048; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
10049; GFX7-NEXT:    v_add_f32_e32 v2, v2, v4
10050; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
10051; GFX7-NEXT:    v_or_b32_e32 v2, v5, v2
10052; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10053; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10054; GFX7-NEXT:    buffer_wbinvl1
10055; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
10056; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10057; GFX7-NEXT:    v_mov_b32_e32 v3, v2
10058; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10059; GFX7-NEXT:    s_cbranch_execnz .LBB42_1
10060; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
10061; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
10062; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; 1023 half elements * 2 bytes = 2046 bytes, the offset seen in the checks above.
10063  %gep = getelementptr half, ptr %ptr, i64 1023
  ; The atomic's result is never used and the function returns void.
10064  %unused = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
10065  ret void
10066}
10067
; Value-returning f16 `atomicrmw fadd` (syncscope "agent", seq_cst) on a flat
; pointer at %ptr + 1023 halves, i.e. byte offset 2046 (0x7fe), with `align 4`
; on the atomic and !amdgpu.no.fine.grained.memory attached. The value produced
; by the atomicrmw is returned to the caller.
;
; What the expected output below shows:
;  * Every target expands the operation into a 32-bit compare-and-swap retry
;    loop; the new dword keeps the upper 16 bits of the loaded value (mask
;    0xffff0000) and ORs in the f16 sum, with no address masking or shift.
;  * GFX12, GFX940, GFX11, GFX90A and GFX908 use the immediate offset:2046 and
;    copy the last cmpswap result into v0 after the loop.
;  * GFX10 and GFX8 add 0x7fe into a separate pointer pair v[3:4] and run the
;    loop directly on v0, so no trailing copy is emitted.
;  * GFX7 adds 0x7fe to the pointer, performs the addition in f32, and converts
;    the final loaded value with v_cvt_f32_f16 into v0 before returning.
10068define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
10069; GFX12-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
10070; GFX12:       ; %bb.0:
10071; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
10072; GFX12-NEXT:    s_wait_expcnt 0x0
10073; GFX12-NEXT:    s_wait_samplecnt 0x0
10074; GFX12-NEXT:    s_wait_bvhcnt 0x0
10075; GFX12-NEXT:    s_wait_kmcnt 0x0
10076; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2046
10077; GFX12-NEXT:    s_mov_b32 s0, 0
10078; GFX12-NEXT:  .LBB43_1: ; %atomicrmw.start
10079; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
10080; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
10081; GFX12-NEXT:    v_mov_b32_e32 v4, v3
10082; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10083; GFX12-NEXT:    v_add_f16_e32 v3, v4, v2
10084; GFX12-NEXT:    v_and_b32_e32 v3, 0xffff, v3
10085; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
10086; GFX12-NEXT:    v_and_or_b32 v3, 0xffff0000, v4, v3
10087; GFX12-NEXT:    s_wait_storecnt 0x0
10088; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
10089; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
10090; GFX12-NEXT:    global_inv scope:SCOPE_DEV
10091; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
10092; GFX12-NEXT:    s_wait_alu 0xfffe
10093; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
10094; GFX12-NEXT:    s_wait_alu 0xfffe
10095; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
10096; GFX12-NEXT:    s_cbranch_execnz .LBB43_1
10097; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
10098; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
10099; GFX12-NEXT:    v_mov_b32_e32 v0, v3
10100; GFX12-NEXT:    s_wait_alu 0xfffe
10101; GFX12-NEXT:    s_setpc_b64 s[30:31]
10102;
10103; GFX940-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
10104; GFX940:       ; %bb.0:
10105; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10106; GFX940-NEXT:    flat_load_dword v3, v[0:1] offset:2046
10107; GFX940-NEXT:    s_mov_b64 s[0:1], 0
10108; GFX940-NEXT:    s_mov_b32 s2, 0xffff0000
10109; GFX940-NEXT:  .LBB43_1: ; %atomicrmw.start
10110; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
10111; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10112; GFX940-NEXT:    v_mov_b32_e32 v5, v3
10113; GFX940-NEXT:    v_add_f16_e32 v3, v5, v2
10114; GFX940-NEXT:    v_and_or_b32 v4, v5, s2, v3
10115; GFX940-NEXT:    buffer_wbl2 sc1
10116; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0
10117; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10118; GFX940-NEXT:    buffer_inv sc1
10119; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
10120; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
10121; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
10122; GFX940-NEXT:    s_cbranch_execnz .LBB43_1
10123; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
10124; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
10125; GFX940-NEXT:    v_mov_b32_e32 v0, v3
10126; GFX940-NEXT:    s_setpc_b64 s[30:31]
10127;
10128; GFX11-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
10129; GFX11:       ; %bb.0:
10130; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10131; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2046
10132; GFX11-NEXT:    s_mov_b32 s0, 0
10133; GFX11-NEXT:  .LBB43_1: ; %atomicrmw.start
10134; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
10135; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10136; GFX11-NEXT:    v_mov_b32_e32 v4, v3
10137; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10138; GFX11-NEXT:    v_add_f16_e32 v3, v4, v2
10139; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
10140; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
10141; GFX11-NEXT:    v_and_or_b32 v3, 0xffff0000, v4, v3
10142; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
10143; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
10144; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10145; GFX11-NEXT:    buffer_gl1_inv
10146; GFX11-NEXT:    buffer_gl0_inv
10147; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
10148; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
10149; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
10150; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
10151; GFX11-NEXT:    s_cbranch_execnz .LBB43_1
10152; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
10153; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
10154; GFX11-NEXT:    v_mov_b32_e32 v0, v3
10155; GFX11-NEXT:    s_setpc_b64 s[30:31]
10156;
10157; GFX10-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
10158; GFX10:       ; %bb.0:
10159; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10160; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
10161; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
10162; GFX10-NEXT:    s_mov_b32 s4, 0
10163; GFX10-NEXT:    flat_load_dword v0, v[3:4]
10164; GFX10-NEXT:  .LBB43_1: ; %atomicrmw.start
10165; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
10166; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10167; GFX10-NEXT:    v_mov_b32_e32 v1, v0
10168; GFX10-NEXT:    v_add_f16_e32 v0, v1, v2
10169; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
10170; GFX10-NEXT:    v_and_or_b32 v0, 0xffff0000, v1, v0
10171; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
10172; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
10173; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10174; GFX10-NEXT:    buffer_gl1_inv
10175; GFX10-NEXT:    buffer_gl0_inv
10176; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
10177; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
10178; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
10179; GFX10-NEXT:    s_cbranch_execnz .LBB43_1
10180; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
10181; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
10182; GFX10-NEXT:    s_setpc_b64 s[30:31]
10183;
10184; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
10185; GFX90A:       ; %bb.0:
10186; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10187; GFX90A-NEXT:    flat_load_dword v3, v[0:1] offset:2046
10188; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
10189; GFX90A-NEXT:    s_mov_b32 s6, 0xffff0000
10190; GFX90A-NEXT:  .LBB43_1: ; %atomicrmw.start
10191; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
10192; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10193; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
10194; GFX90A-NEXT:    v_add_f16_e32 v3, v5, v2
10195; GFX90A-NEXT:    v_and_or_b32 v4, v5, s6, v3
10196; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc
10197; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10198; GFX90A-NEXT:    buffer_wbinvl1
10199; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
10200; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10201; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10202; GFX90A-NEXT:    s_cbranch_execnz .LBB43_1
10203; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
10204; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
10205; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
10206; GFX90A-NEXT:    s_setpc_b64 s[30:31]
10207;
10208; GFX908-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
10209; GFX908:       ; %bb.0:
10210; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10211; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2046
10212; GFX908-NEXT:    s_mov_b64 s[4:5], 0
10213; GFX908-NEXT:    s_mov_b32 s6, 0xffff0000
10214; GFX908-NEXT:  .LBB43_1: ; %atomicrmw.start
10215; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
10216; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10217; GFX908-NEXT:    v_mov_b32_e32 v4, v3
10218; GFX908-NEXT:    v_add_f16_e32 v3, v4, v2
10219; GFX908-NEXT:    v_and_or_b32 v3, v4, s6, v3
10220; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc
10221; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10222; GFX908-NEXT:    buffer_wbinvl1
10223; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
10224; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10225; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10226; GFX908-NEXT:    s_cbranch_execnz .LBB43_1
10227; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
10228; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
10229; GFX908-NEXT:    v_mov_b32_e32 v0, v3
10230; GFX908-NEXT:    s_setpc_b64 s[30:31]
10231;
10232; GFX8-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
10233; GFX8:       ; %bb.0:
10234; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10235; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
10236; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
10237; GFX8-NEXT:    flat_load_dword v0, v[3:4]
10238; GFX8-NEXT:    s_mov_b64 s[4:5], 0
10239; GFX8-NEXT:  .LBB43_1: ; %atomicrmw.start
10240; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
10241; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10242; GFX8-NEXT:    v_mov_b32_e32 v1, v0
10243; GFX8-NEXT:    v_add_f16_e32 v0, v1, v2
10244; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
10245; GFX8-NEXT:    v_or_b32_e32 v0, v5, v0
10246; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
10247; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10248; GFX8-NEXT:    buffer_wbinvl1
10249; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
10250; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10251; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10252; GFX8-NEXT:    s_cbranch_execnz .LBB43_1
10253; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
10254; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
10255; GFX8-NEXT:    s_setpc_b64 s[30:31]
10256;
10257; GFX7-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
10258; GFX7:       ; %bb.0:
10259; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10260; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fe, v0
10261; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
10262; GFX7-NEXT:    flat_load_dword v3, v[0:1]
10263; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
10264; GFX7-NEXT:    s_mov_b64 s[4:5], 0
10265; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
10266; GFX7-NEXT:  .LBB43_1: ; %atomicrmw.start
10267; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
10268; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10269; GFX7-NEXT:    v_mov_b32_e32 v4, v3
10270; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v4
10271; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
10272; GFX7-NEXT:    v_add_f32_e32 v3, v3, v2
10273; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
10274; GFX7-NEXT:    v_or_b32_e32 v3, v5, v3
10275; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
10276; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10277; GFX7-NEXT:    buffer_wbinvl1
10278; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
10279; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10280; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10281; GFX7-NEXT:    s_cbranch_execnz .LBB43_1
10282; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
10283; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
10284; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v3
10285; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; 1023 half elements * 2 bytes = 2046 bytes, the offset seen in the checks above.
10286  %gep = getelementptr half, ptr %ptr, i64 1023
  ; %result is the value produced by the atomicrmw; it is what the function returns.
10287  %result = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
10288  ret half %result
10289}
10290
10291define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
10292; GFX12-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
10293; GFX12:       ; %bb.0:
10294; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
10295; GFX12-NEXT:    s_wait_expcnt 0x0
10296; GFX12-NEXT:    s_wait_samplecnt 0x0
10297; GFX12-NEXT:    s_wait_bvhcnt 0x0
10298; GFX12-NEXT:    s_wait_kmcnt 0x0
10299; GFX12-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
10300; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
10301; GFX12-NEXT:    s_mov_b32 s0, 0
10302; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
10303; GFX12-NEXT:    v_and_b32_e32 v0, -4, v3
10304; GFX12-NEXT:    v_and_b32_e32 v3, 3, v3
10305; GFX12-NEXT:    flat_load_b32 v5, v[0:1]
10306; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
10307; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
10308; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
10309; GFX12-NEXT:    v_not_b32_e32 v4, v4
10310; GFX12-NEXT:  .LBB44_1: ; %atomicrmw.start
10311; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
10312; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
10313; GFX12-NEXT:    v_mov_b32_e32 v6, v5
10314; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10315; GFX12-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
10316; GFX12-NEXT:    v_add_f16_e32 v5, v5, v2
10317; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10318; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff, v5
10319; GFX12-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
10320; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
10321; GFX12-NEXT:    v_and_or_b32 v5, v6, v4, v5
10322; GFX12-NEXT:    global_wb scope:SCOPE_SYS
10323; GFX12-NEXT:    s_wait_storecnt 0x0
10324; GFX12-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
10325; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
10326; GFX12-NEXT:    global_inv scope:SCOPE_SYS
10327; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
10328; GFX12-NEXT:    s_wait_alu 0xfffe
10329; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
10330; GFX12-NEXT:    s_wait_alu 0xfffe
10331; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
10332; GFX12-NEXT:    s_cbranch_execnz .LBB44_1
10333; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
10334; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
10335; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
10336; GFX12-NEXT:    s_wait_alu 0xfffe
10337; GFX12-NEXT:    s_setpc_b64 s[30:31]
10338;
10339; GFX940-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
10340; GFX940:       ; %bb.0:
10341; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10342; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
10343; GFX940-NEXT:    v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1]
10344; GFX940-NEXT:    v_and_b32_e32 v0, -4, v6
10345; GFX940-NEXT:    v_mov_b32_e32 v1, v7
10346; GFX940-NEXT:    flat_load_dword v4, v[0:1]
10347; GFX940-NEXT:    v_and_b32_e32 v3, 3, v6
10348; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
10349; GFX940-NEXT:    s_mov_b32 s0, 0xffff
10350; GFX940-NEXT:    v_lshlrev_b32_e64 v5, v3, s0
10351; GFX940-NEXT:    v_not_b32_e32 v5, v5
10352; GFX940-NEXT:    s_mov_b64 s[0:1], 0
10353; GFX940-NEXT:  .LBB44_1: ; %atomicrmw.start
10354; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
10355; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10356; GFX940-NEXT:    v_mov_b32_e32 v7, v4
10357; GFX940-NEXT:    v_lshrrev_b32_e32 v4, v3, v7
10358; GFX940-NEXT:    v_add_f16_e32 v4, v4, v2
10359; GFX940-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
10360; GFX940-NEXT:    v_and_or_b32 v6, v7, v5, v4
10361; GFX940-NEXT:    buffer_wbl2 sc0 sc1
10362; GFX940-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 sc1
10363; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10364; GFX940-NEXT:    buffer_inv sc0 sc1
10365; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
10366; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
10367; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
10368; GFX940-NEXT:    s_cbranch_execnz .LBB44_1
10369; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
10370; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
10371; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v4
10372; GFX940-NEXT:    s_setpc_b64 s[30:31]
10373;
10374; GFX11-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
10375; GFX11:       ; %bb.0:
10376; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10377; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
10378; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
10379; GFX11-NEXT:    s_mov_b32 s0, 0
10380; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
10381; GFX11-NEXT:    v_and_b32_e32 v0, -4, v3
10382; GFX11-NEXT:    v_and_b32_e32 v3, 3, v3
10383; GFX11-NEXT:    flat_load_b32 v5, v[0:1]
10384; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
10385; GFX11-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
10386; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
10387; GFX11-NEXT:    v_not_b32_e32 v4, v4
10388; GFX11-NEXT:  .LBB44_1: ; %atomicrmw.start
10389; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
10390; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10391; GFX11-NEXT:    v_mov_b32_e32 v6, v5
10392; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10393; GFX11-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
10394; GFX11-NEXT:    v_add_f16_e32 v5, v5, v2
10395; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10396; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
10397; GFX11-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
10398; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
10399; GFX11-NEXT:    v_and_or_b32 v5, v6, v4, v5
10400; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
10401; GFX11-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
10402; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10403; GFX11-NEXT:    buffer_gl1_inv
10404; GFX11-NEXT:    buffer_gl0_inv
10405; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
10406; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
10407; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
10408; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
10409; GFX11-NEXT:    s_cbranch_execnz .LBB44_1
10410; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
10411; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
10412; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
10413; GFX11-NEXT:    s_setpc_b64 s[30:31]
10414;
10415; GFX10-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
10416; GFX10:       ; %bb.0:
10417; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10418; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
10419; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
10420; GFX10-NEXT:    s_mov_b32 s4, 0
10421; GFX10-NEXT:    v_and_b32_e32 v0, -4, v3
10422; GFX10-NEXT:    v_and_b32_e32 v3, 3, v3
10423; GFX10-NEXT:    flat_load_dword v5, v[0:1]
10424; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
10425; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
10426; GFX10-NEXT:    v_not_b32_e32 v4, v4
10427; GFX10-NEXT:  .LBB44_1: ; %atomicrmw.start
10428; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
10429; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10430; GFX10-NEXT:    v_mov_b32_e32 v6, v5
10431; GFX10-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
10432; GFX10-NEXT:    v_add_f16_e32 v5, v5, v2
10433; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
10434; GFX10-NEXT:    v_and_or_b32 v5, v6, v4, v5
10435; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
10436; GFX10-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
10437; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10438; GFX10-NEXT:    buffer_gl1_inv
10439; GFX10-NEXT:    buffer_gl0_inv
10440; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
10441; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
10442; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
10443; GFX10-NEXT:    s_cbranch_execnz .LBB44_1
10444; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
10445; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
10446; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
10447; GFX10-NEXT:    s_setpc_b64 s[30:31]
10448;
10449; GFX90A-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
10450; GFX90A:       ; %bb.0:
10451; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10452; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
10453; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
10454; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v3
10455; GFX90A-NEXT:    flat_load_dword v4, v[0:1]
10456; GFX90A-NEXT:    v_and_b32_e32 v3, 3, v3
10457; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
10458; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
10459; GFX90A-NEXT:    v_lshlrev_b32_e64 v5, v3, s4
10460; GFX90A-NEXT:    v_not_b32_e32 v5, v5
10461; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
10462; GFX90A-NEXT:  .LBB44_1: ; %atomicrmw.start
10463; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
10464; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10465; GFX90A-NEXT:    v_mov_b32_e32 v7, v4
10466; GFX90A-NEXT:    v_lshrrev_b32_e32 v4, v3, v7
10467; GFX90A-NEXT:    v_add_f16_e32 v4, v4, v2
10468; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
10469; GFX90A-NEXT:    v_and_or_b32 v6, v7, v5, v4
10470; GFX90A-NEXT:    buffer_wbl2
10471; GFX90A-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
10472; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10473; GFX90A-NEXT:    buffer_invl2
10474; GFX90A-NEXT:    buffer_wbinvl1
10475; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
10476; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10477; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10478; GFX90A-NEXT:    s_cbranch_execnz .LBB44_1
10479; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
10480; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
10481; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v4
10482; GFX90A-NEXT:    s_setpc_b64 s[30:31]
10483;
10484; GFX908-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
10485; GFX908:       ; %bb.0:
10486; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10487; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
10488; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
10489; GFX908-NEXT:    v_and_b32_e32 v0, -4, v3
10490; GFX908-NEXT:    flat_load_dword v4, v[0:1]
10491; GFX908-NEXT:    v_and_b32_e32 v3, 3, v3
10492; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
10493; GFX908-NEXT:    s_mov_b32 s4, 0xffff
10494; GFX908-NEXT:    v_lshlrev_b32_e64 v5, v3, s4
10495; GFX908-NEXT:    v_not_b32_e32 v5, v5
10496; GFX908-NEXT:    s_mov_b64 s[4:5], 0
10497; GFX908-NEXT:  .LBB44_1: ; %atomicrmw.start
10498; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
10499; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10500; GFX908-NEXT:    v_mov_b32_e32 v7, v4
10501; GFX908-NEXT:    v_lshrrev_b32_e32 v4, v3, v7
10502; GFX908-NEXT:    v_add_f16_e32 v4, v4, v2
10503; GFX908-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
10504; GFX908-NEXT:    v_and_or_b32 v6, v7, v5, v4
10505; GFX908-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
10506; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10507; GFX908-NEXT:    buffer_wbinvl1
10508; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
10509; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10510; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10511; GFX908-NEXT:    s_cbranch_execnz .LBB44_1
10512; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
10513; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
10514; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v4
10515; GFX908-NEXT:    s_setpc_b64 s[30:31]
10516;
10517; GFX8-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
10518; GFX8:       ; %bb.0:
10519; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10520; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
10521; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
10522; GFX8-NEXT:    v_and_b32_e32 v0, -4, v3
10523; GFX8-NEXT:    flat_load_dword v5, v[0:1]
10524; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
10525; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
10526; GFX8-NEXT:    s_mov_b32 s4, 0xffff
10527; GFX8-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
10528; GFX8-NEXT:    v_not_b32_e32 v4, v4
10529; GFX8-NEXT:    s_mov_b64 s[4:5], 0
10530; GFX8-NEXT:  .LBB44_1: ; %atomicrmw.start
10531; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
10532; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10533; GFX8-NEXT:    v_mov_b32_e32 v6, v5
10534; GFX8-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
10535; GFX8-NEXT:    v_add_f16_e32 v5, v5, v2
10536; GFX8-NEXT:    v_and_b32_e32 v7, v6, v4
10537; GFX8-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
10538; GFX8-NEXT:    v_or_b32_e32 v5, v7, v5
10539; GFX8-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
10540; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10541; GFX8-NEXT:    buffer_wbinvl1
10542; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
10543; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10544; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10545; GFX8-NEXT:    s_cbranch_execnz .LBB44_1
10546; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
10547; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
10548; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
10549; GFX8-NEXT:    s_setpc_b64 s[30:31]
10550;
10551; GFX7-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
10552; GFX7:       ; %bb.0:
10553; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10554; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 0x7fe, v0
10555; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
10556; GFX7-NEXT:    v_and_b32_e32 v0, -4, v3
10557; GFX7-NEXT:    flat_load_dword v5, v[0:1]
10558; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v2
10559; GFX7-NEXT:    v_and_b32_e32 v2, 3, v3
10560; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
10561; GFX7-NEXT:    s_mov_b64 s[4:5], 0
10562; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v4
10563; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
10564; GFX7-NEXT:    v_not_b32_e32 v4, v4
10565; GFX7-NEXT:  .LBB44_1: ; %atomicrmw.start
10566; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
10567; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10568; GFX7-NEXT:    v_mov_b32_e32 v6, v5
10569; GFX7-NEXT:    v_lshrrev_b32_e32 v5, v2, v6
10570; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
10571; GFX7-NEXT:    v_and_b32_e32 v7, v6, v4
10572; GFX7-NEXT:    v_add_f32_e32 v5, v5, v3
10573; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
10574; GFX7-NEXT:    v_lshlrev_b32_e32 v5, v2, v5
10575; GFX7-NEXT:    v_or_b32_e32 v5, v7, v5
10576; GFX7-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
10577; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10578; GFX7-NEXT:    buffer_wbinvl1
10579; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
10580; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10581; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10582; GFX7-NEXT:    s_cbranch_execnz .LBB44_1
10583; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
10584; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
10585; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v2, v5
10586; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
10587; GFX7-NEXT:    s_setpc_b64 s[30:31]
10588  %gep = getelementptr half, ptr %ptr, i64 1023
10589  %result = atomicrmw fadd ptr %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0
10590  ret half %result
10591}
10592
; No-return variant: a seq_cst atomicrmw fadd of a half whose result is never
; used, at element offset 1023 (byte offset 0x7fe) from the flat pointer %ptr.
; No syncscope is written on the atomicrmw, and the operation carries
; !amdgpu.no.fine.grained.memory metadata.
;
; In the expected output for every run line below, the byte offset 0x7fe is
; added to the pointer, the address is masked with -4, and a
; flat_atomic_cmpswap retry loop (%atomicrmw.start) operates on that dword.
; The hawaii run line additionally converts through f32 (v_cvt_f32_f16 /
; v_add_f32 / v_cvt_f16_f32) instead of using v_add_f16.
10593define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
10594; GFX12-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
10595; GFX12:       ; %bb.0:
10596; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
10597; GFX12-NEXT:    s_wait_expcnt 0x0
10598; GFX12-NEXT:    s_wait_samplecnt 0x0
10599; GFX12-NEXT:    s_wait_bvhcnt 0x0
10600; GFX12-NEXT:    s_wait_kmcnt 0x0
10601; GFX12-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
10602; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
10603; GFX12-NEXT:    s_mov_b32 s0, 0
10604; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
10605; GFX12-NEXT:    v_and_b32_e32 v0, -4, v3
10606; GFX12-NEXT:    v_and_b32_e32 v3, 3, v3
10607; GFX12-NEXT:    flat_load_b32 v4, v[0:1]
10608; GFX12-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
10609; GFX12-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
10610; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
10611; GFX12-NEXT:    v_not_b32_e32 v6, v3
10612; GFX12-NEXT:  .LBB45_1: ; %atomicrmw.start
10613; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
10614; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
10615; GFX12-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
10616; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10617; GFX12-NEXT:    v_add_f16_e32 v3, v3, v2
10618; GFX12-NEXT:    v_and_b32_e32 v3, 0xffff, v3
10619; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10620; GFX12-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
10621; GFX12-NEXT:    v_and_or_b32 v3, v4, v6, v3
10622; GFX12-NEXT:    global_wb scope:SCOPE_SYS
10623; GFX12-NEXT:    s_wait_storecnt 0x0
10624; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
10625; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
10626; GFX12-NEXT:    global_inv scope:SCOPE_SYS
10627; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
10628; GFX12-NEXT:    v_mov_b32_e32 v4, v3
10629; GFX12-NEXT:    s_wait_alu 0xfffe
10630; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
10631; GFX12-NEXT:    s_wait_alu 0xfffe
10632; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
10633; GFX12-NEXT:    s_cbranch_execnz .LBB45_1
10634; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
10635; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
10636; GFX12-NEXT:    s_wait_alu 0xfffe
10637; GFX12-NEXT:    s_setpc_b64 s[30:31]
10638;
10639; GFX940-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
10640; GFX940:       ; %bb.0:
10641; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10642; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
10643; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
10644; GFX940-NEXT:    v_and_b32_e32 v0, -4, v4
10645; GFX940-NEXT:    v_mov_b32_e32 v1, v5
10646; GFX940-NEXT:    flat_load_dword v5, v[0:1]
10647; GFX940-NEXT:    v_and_b32_e32 v3, 3, v4
10648; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
10649; GFX940-NEXT:    s_mov_b32 s0, 0xffff
10650; GFX940-NEXT:    v_lshlrev_b32_e64 v4, v3, s0
10651; GFX940-NEXT:    v_not_b32_e32 v6, v4
10652; GFX940-NEXT:    s_mov_b64 s[0:1], 0
10653; GFX940-NEXT:  .LBB45_1: ; %atomicrmw.start
10654; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
10655; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10656; GFX940-NEXT:    v_lshrrev_b32_e32 v4, v3, v5
10657; GFX940-NEXT:    v_add_f16_e32 v4, v4, v2
10658; GFX940-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
10659; GFX940-NEXT:    v_and_or_b32 v4, v5, v6, v4
10660; GFX940-NEXT:    buffer_wbl2 sc0 sc1
10661; GFX940-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 sc1
10662; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10663; GFX940-NEXT:    buffer_inv sc0 sc1
10664; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
10665; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
10666; GFX940-NEXT:    v_mov_b32_e32 v5, v4
10667; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
10668; GFX940-NEXT:    s_cbranch_execnz .LBB45_1
10669; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
10670; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
10671; GFX940-NEXT:    s_setpc_b64 s[30:31]
10672;
10673; GFX11-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
10674; GFX11:       ; %bb.0:
10675; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10676; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
10677; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
10678; GFX11-NEXT:    s_mov_b32 s0, 0
10679; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
10680; GFX11-NEXT:    v_and_b32_e32 v0, -4, v3
10681; GFX11-NEXT:    v_and_b32_e32 v3, 3, v3
10682; GFX11-NEXT:    flat_load_b32 v4, v[0:1]
10683; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
10684; GFX11-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
10685; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
10686; GFX11-NEXT:    v_not_b32_e32 v6, v3
10687; GFX11-NEXT:  .LBB45_1: ; %atomicrmw.start
10688; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
10689; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10690; GFX11-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
10691; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10692; GFX11-NEXT:    v_add_f16_e32 v3, v3, v2
10693; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
10694; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10695; GFX11-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
10696; GFX11-NEXT:    v_and_or_b32 v3, v4, v6, v3
10697; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
10698; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
10699; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10700; GFX11-NEXT:    buffer_gl1_inv
10701; GFX11-NEXT:    buffer_gl0_inv
10702; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
10703; GFX11-NEXT:    v_mov_b32_e32 v4, v3
10704; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
10705; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
10706; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
10707; GFX11-NEXT:    s_cbranch_execnz .LBB45_1
10708; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
10709; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
10710; GFX11-NEXT:    s_setpc_b64 s[30:31]
10711;
10712; GFX10-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
10713; GFX10:       ; %bb.0:
10714; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10715; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
10716; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
10717; GFX10-NEXT:    s_mov_b32 s4, 0
10718; GFX10-NEXT:    v_and_b32_e32 v0, -4, v3
10719; GFX10-NEXT:    v_and_b32_e32 v3, 3, v3
10720; GFX10-NEXT:    flat_load_dword v4, v[0:1]
10721; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
10722; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
10723; GFX10-NEXT:    v_not_b32_e32 v6, v3
10724; GFX10-NEXT:  .LBB45_1: ; %atomicrmw.start
10725; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
10726; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10727; GFX10-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
10728; GFX10-NEXT:    v_add_f16_e32 v3, v3, v2
10729; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
10730; GFX10-NEXT:    v_and_or_b32 v3, v4, v6, v3
10731; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
10732; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
10733; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10734; GFX10-NEXT:    buffer_gl1_inv
10735; GFX10-NEXT:    buffer_gl0_inv
10736; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
10737; GFX10-NEXT:    v_mov_b32_e32 v4, v3
10738; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
10739; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
10740; GFX10-NEXT:    s_cbranch_execnz .LBB45_1
10741; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
10742; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
10743; GFX10-NEXT:    s_setpc_b64 s[30:31]
10744;
10745; GFX90A-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
10746; GFX90A:       ; %bb.0:
10747; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10748; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
10749; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
10750; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v3
10751; GFX90A-NEXT:    flat_load_dword v5, v[0:1]
10752; GFX90A-NEXT:    v_and_b32_e32 v3, 3, v3
10753; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
10754; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
10755; GFX90A-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
10756; GFX90A-NEXT:    v_not_b32_e32 v6, v4
10757; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
10758; GFX90A-NEXT:  .LBB45_1: ; %atomicrmw.start
10759; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
10760; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10761; GFX90A-NEXT:    v_lshrrev_b32_e32 v4, v3, v5
10762; GFX90A-NEXT:    v_add_f16_e32 v4, v4, v2
10763; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
10764; GFX90A-NEXT:    v_and_or_b32 v4, v5, v6, v4
10765; GFX90A-NEXT:    buffer_wbl2
10766; GFX90A-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
10767; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10768; GFX90A-NEXT:    buffer_invl2
10769; GFX90A-NEXT:    buffer_wbinvl1
10770; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
10771; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10772; GFX90A-NEXT:    v_mov_b32_e32 v5, v4
10773; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10774; GFX90A-NEXT:    s_cbranch_execnz .LBB45_1
10775; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
10776; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
10777; GFX90A-NEXT:    s_setpc_b64 s[30:31]
10778;
10779; GFX908-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
10780; GFX908:       ; %bb.0:
10781; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10782; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
10783; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
10784; GFX908-NEXT:    v_and_b32_e32 v0, -4, v3
10785; GFX908-NEXT:    flat_load_dword v4, v[0:1]
10786; GFX908-NEXT:    v_and_b32_e32 v3, 3, v3
10787; GFX908-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
10788; GFX908-NEXT:    s_mov_b32 s4, 0xffff
10789; GFX908-NEXT:    v_lshlrev_b32_e64 v3, v5, s4
10790; GFX908-NEXT:    v_not_b32_e32 v6, v3
10791; GFX908-NEXT:    s_mov_b64 s[4:5], 0
10792; GFX908-NEXT:  .LBB45_1: ; %atomicrmw.start
10793; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
10794; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10795; GFX908-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
10796; GFX908-NEXT:    v_add_f16_e32 v3, v3, v2
10797; GFX908-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
10798; GFX908-NEXT:    v_and_or_b32 v3, v4, v6, v3
10799; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
10800; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10801; GFX908-NEXT:    buffer_wbinvl1
10802; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
10803; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10804; GFX908-NEXT:    v_mov_b32_e32 v4, v3
10805; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10806; GFX908-NEXT:    s_cbranch_execnz .LBB45_1
10807; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
10808; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
10809; GFX908-NEXT:    s_setpc_b64 s[30:31]
10810;
10811; GFX8-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
10812; GFX8:       ; %bb.0:
10813; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10814; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
10815; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
10816; GFX8-NEXT:    v_and_b32_e32 v0, -4, v3
10817; GFX8-NEXT:    flat_load_dword v4, v[0:1]
10818; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
10819; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
10820; GFX8-NEXT:    s_mov_b32 s4, 0xffff
10821; GFX8-NEXT:    v_lshlrev_b32_e64 v3, v5, s4
10822; GFX8-NEXT:    v_not_b32_e32 v6, v3
10823; GFX8-NEXT:    s_mov_b64 s[4:5], 0
10824; GFX8-NEXT:  .LBB45_1: ; %atomicrmw.start
10825; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
10826; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10827; GFX8-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
10828; GFX8-NEXT:    v_add_f16_e32 v3, v3, v2
10829; GFX8-NEXT:    v_and_b32_e32 v7, v4, v6
10830; GFX8-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
10831; GFX8-NEXT:    v_or_b32_e32 v3, v7, v3
10832; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
10833; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10834; GFX8-NEXT:    buffer_wbinvl1
10835; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
10836; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10837; GFX8-NEXT:    v_mov_b32_e32 v4, v3
10838; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10839; GFX8-NEXT:    s_cbranch_execnz .LBB45_1
10840; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
10841; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
10842; GFX8-NEXT:    s_setpc_b64 s[30:31]
10843;
10844; GFX7-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
10845; GFX7:       ; %bb.0:
10846; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10847; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
10848; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
10849; GFX7-NEXT:    v_and_b32_e32 v0, -4, v4
10850; GFX7-NEXT:    flat_load_dword v3, v[0:1]
10851; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
10852; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
10853; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
10854; GFX7-NEXT:    s_mov_b64 s[4:5], 0
10855; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v2
10856; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
10857; GFX7-NEXT:    v_not_b32_e32 v6, v2
10858; GFX7-NEXT:  .LBB45_1: ; %atomicrmw.start
10859; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
10860; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10861; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
10862; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
10863; GFX7-NEXT:    v_and_b32_e32 v7, v3, v6
10864; GFX7-NEXT:    v_add_f32_e32 v2, v2, v5
10865; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
10866; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
10867; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
10868; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10869; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10870; GFX7-NEXT:    buffer_wbinvl1
10871; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
10872; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10873; GFX7-NEXT:    v_mov_b32_e32 v3, v2
10874; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10875; GFX7-NEXT:    s_cbranch_execnz .LBB45_1
10876; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
10877; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
10878; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; 1023 half elements * 2 bytes each = 2046 = 0x7fe, the constant that the
  ; expected address computation above adds to the incoming pointer.
10879  %gep = getelementptr half, ptr %ptr, i64 1023
  ; The fadd result is deliberately left unused and the function returns void.
  ; Metadata node !0 is defined outside this section of the file.
10880  %unused = atomicrmw fadd ptr %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0
10881  ret void
10882}
10883
10884; --------------------------------------------------------------------
10885; bfloat
10886; --------------------------------------------------------------------
10887
10888define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
10889; GFX12-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
10890; GFX12:       ; %bb.0:
10891; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
10892; GFX12-NEXT:    s_wait_expcnt 0x0
10893; GFX12-NEXT:    s_wait_samplecnt 0x0
10894; GFX12-NEXT:    s_wait_bvhcnt 0x0
10895; GFX12-NEXT:    s_wait_kmcnt 0x0
10896; GFX12-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
10897; GFX12-NEXT:    s_mov_b32 s0, 0
10898; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
10899; GFX12-NEXT:    v_and_b32_e32 v0, -4, v3
10900; GFX12-NEXT:    v_and_b32_e32 v3, 3, v3
10901; GFX12-NEXT:    flat_load_b32 v5, v[0:1]
10902; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
10903; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
10904; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
10905; GFX12-NEXT:    v_not_b32_e32 v4, v4
10906; GFX12-NEXT:  .LBB46_1: ; %atomicrmw.start
10907; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
10908; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
10909; GFX12-NEXT:    v_mov_b32_e32 v6, v5
10910; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10911; GFX12-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
10912; GFX12-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
10913; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10914; GFX12-NEXT:    v_add_f32_e32 v5, v5, v2
10915; GFX12-NEXT:    v_bfe_u32 v7, v5, 16, 1
10916; GFX12-NEXT:    v_or_b32_e32 v8, 0x400000, v5
10917; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
10918; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
10919; GFX12-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
10920; GFX12-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc_lo
10921; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10922; GFX12-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
10923; GFX12-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
10924; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
10925; GFX12-NEXT:    v_and_or_b32 v5, v6, v4, v5
10926; GFX12-NEXT:    s_wait_storecnt 0x0
10927; GFX12-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
10928; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
10929; GFX12-NEXT:    global_inv scope:SCOPE_DEV
10930; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
10931; GFX12-NEXT:    s_wait_alu 0xfffe
10932; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
10933; GFX12-NEXT:    s_wait_alu 0xfffe
10934; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
10935; GFX12-NEXT:    s_cbranch_execnz .LBB46_1
10936; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
10937; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
10938; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
10939; GFX12-NEXT:    s_wait_alu 0xfffe
10940; GFX12-NEXT:    s_setpc_b64 s[30:31]
10941;
10942; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
10943; GFX940:       ; %bb.0:
10944; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10945; GFX940-NEXT:    v_mov_b32_e32 v3, v0
10946; GFX940-NEXT:    v_and_b32_e32 v0, -4, v3
10947; GFX940-NEXT:    flat_load_dword v5, v[0:1]
10948; GFX940-NEXT:    v_and_b32_e32 v3, 3, v3
10949; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
10950; GFX940-NEXT:    s_mov_b32 s0, 0xffff
10951; GFX940-NEXT:    v_lshlrev_b32_e64 v4, v3, s0
10952; GFX940-NEXT:    v_not_b32_e32 v4, v4
10953; GFX940-NEXT:    s_mov_b64 s[0:1], 0
10954; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
10955; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
10956; GFX940-NEXT:  .LBB46_1: ; %atomicrmw.start
10957; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
10958; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10959; GFX940-NEXT:    v_mov_b32_e32 v7, v5
10960; GFX940-NEXT:    v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
10961; GFX940-NEXT:    s_nop 0
10962; GFX940-NEXT:    v_add_f32_e32 v5, v5, v2
10963; GFX940-NEXT:    v_bfe_u32 v6, v5, 16, 1
10964; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v5
10965; GFX940-NEXT:    v_add3_u32 v6, v6, v5, s2
10966; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
10967; GFX940-NEXT:    s_nop 1
10968; GFX940-NEXT:    v_cndmask_b32_e32 v5, v6, v8, vcc
10969; GFX940-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
10970; GFX940-NEXT:    v_and_or_b32 v6, v7, v4, v5
10971; GFX940-NEXT:    buffer_wbl2 sc1
10972; GFX940-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0
10973; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10974; GFX940-NEXT:    buffer_inv sc1
10975; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
10976; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
10977; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
10978; GFX940-NEXT:    s_cbranch_execnz .LBB46_1
10979; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
10980; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
10981; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
10982; GFX940-NEXT:    s_setpc_b64 s[30:31]
10983;
10984; GFX11-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
10985; GFX11:       ; %bb.0:
10986; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10987; GFX11-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
10988; GFX11-NEXT:    s_mov_b32 s0, 0
10989; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
10990; GFX11-NEXT:    v_and_b32_e32 v0, -4, v3
10991; GFX11-NEXT:    v_and_b32_e32 v3, 3, v3
10992; GFX11-NEXT:    flat_load_b32 v5, v[0:1]
10993; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
10994; GFX11-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
10995; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
10996; GFX11-NEXT:    v_not_b32_e32 v4, v4
10997; GFX11-NEXT:    .p2align 6
10998; GFX11-NEXT:  .LBB46_1: ; %atomicrmw.start
10999; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
11000; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11001; GFX11-NEXT:    v_mov_b32_e32 v6, v5
11002; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11003; GFX11-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
11004; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
11005; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11006; GFX11-NEXT:    v_add_f32_e32 v5, v5, v2
11007; GFX11-NEXT:    v_bfe_u32 v7, v5, 16, 1
11008; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v5
11009; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
11010; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
11011; GFX11-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
11012; GFX11-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc_lo
11013; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11014; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
11015; GFX11-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
11016; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
11017; GFX11-NEXT:    v_and_or_b32 v5, v6, v4, v5
11018; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
11019; GFX11-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
11020; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11021; GFX11-NEXT:    buffer_gl1_inv
11022; GFX11-NEXT:    buffer_gl0_inv
11023; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
11024; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
11025; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
11026; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
11027; GFX11-NEXT:    s_cbranch_execnz .LBB46_1
11028; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
11029; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
11030; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
11031; GFX11-NEXT:    s_setpc_b64 s[30:31]
11032;
11033; GFX10-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
11034; GFX10:       ; %bb.0:
11035; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11036; GFX10-NEXT:    v_mov_b32_e32 v3, v0
11037; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
11038; GFX10-NEXT:    s_mov_b32 s4, 0
11039; GFX10-NEXT:    v_and_b32_e32 v0, -4, v3
11040; GFX10-NEXT:    v_and_b32_e32 v3, 3, v3
11041; GFX10-NEXT:    flat_load_dword v5, v[0:1]
11042; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
11043; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
11044; GFX10-NEXT:    v_not_b32_e32 v4, v4
11045; GFX10-NEXT:  .LBB46_1: ; %atomicrmw.start
11046; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
11047; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11048; GFX10-NEXT:    v_mov_b32_e32 v6, v5
11049; GFX10-NEXT:    v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
11050; GFX10-NEXT:    v_add_f32_e32 v5, v5, v2
11051; GFX10-NEXT:    v_bfe_u32 v7, v5, 16, 1
11052; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v5
11053; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
11054; GFX10-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
11055; GFX10-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc_lo
11056; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
11057; GFX10-NEXT:    v_and_or_b32 v5, v6, v4, v5
11058; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
11059; GFX10-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
11060; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11061; GFX10-NEXT:    buffer_gl1_inv
11062; GFX10-NEXT:    buffer_gl0_inv
11063; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
11064; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
11065; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
11066; GFX10-NEXT:    s_cbranch_execnz .LBB46_1
11067; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
11068; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
11069; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
11070; GFX10-NEXT:    s_setpc_b64 s[30:31]
11071;
11072; GFX90A-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
11073; GFX90A:       ; %bb.0:
11074; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11075; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
11076; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v3
11077; GFX90A-NEXT:    flat_load_dword v5, v[0:1]
11078; GFX90A-NEXT:    v_and_b32_e32 v3, 3, v3
11079; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
11080; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
11081; GFX90A-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
11082; GFX90A-NEXT:    v_not_b32_e32 v4, v4
11083; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
11084; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
11085; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
11086; GFX90A-NEXT:  .LBB46_1: ; %atomicrmw.start
11087; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
11088; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11089; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
11090; GFX90A-NEXT:    v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
11091; GFX90A-NEXT:    v_add_f32_e32 v5, v5, v2
11092; GFX90A-NEXT:    v_bfe_u32 v6, v5, 16, 1
11093; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v5
11094; GFX90A-NEXT:    v_add3_u32 v6, v6, v5, s6
11095; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
11096; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v6, v8, vcc
11097; GFX90A-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
11098; GFX90A-NEXT:    v_and_or_b32 v6, v7, v4, v5
11099; GFX90A-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
11100; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11101; GFX90A-NEXT:    buffer_wbinvl1
11102; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
11103; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11104; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11105; GFX90A-NEXT:    s_cbranch_execnz .LBB46_1
11106; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
11107; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
11108; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
11109; GFX90A-NEXT:    s_setpc_b64 s[30:31]
11110;
11111; GFX908-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
11112; GFX908:       ; %bb.0:
11113; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11114; GFX908-NEXT:    v_mov_b32_e32 v3, v0
11115; GFX908-NEXT:    v_and_b32_e32 v0, -4, v3
11116; GFX908-NEXT:    flat_load_dword v5, v[0:1]
11117; GFX908-NEXT:    v_and_b32_e32 v3, 3, v3
11118; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
11119; GFX908-NEXT:    s_mov_b32 s4, 0xffff
11120; GFX908-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
11121; GFX908-NEXT:    v_not_b32_e32 v4, v4
11122; GFX908-NEXT:    s_mov_b64 s[4:5], 0
11123; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
11124; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
11125; GFX908-NEXT:  .LBB46_1: ; %atomicrmw.start
11126; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
11127; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11128; GFX908-NEXT:    v_mov_b32_e32 v6, v5
11129; GFX908-NEXT:    v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
11130; GFX908-NEXT:    v_add_f32_e32 v5, v5, v2
11131; GFX908-NEXT:    v_bfe_u32 v7, v5, 16, 1
11132; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v5
11133; GFX908-NEXT:    v_add3_u32 v7, v7, v5, s6
11134; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
11135; GFX908-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc
11136; GFX908-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
11137; GFX908-NEXT:    v_and_or_b32 v5, v6, v4, v5
11138; GFX908-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
11139; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11140; GFX908-NEXT:    buffer_wbinvl1
11141; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
11142; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11143; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11144; GFX908-NEXT:    s_cbranch_execnz .LBB46_1
11145; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
11146; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
11147; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
11148; GFX908-NEXT:    s_setpc_b64 s[30:31]
11149;
11150; GFX8-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
11151; GFX8:       ; %bb.0:
11152; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11153; GFX8-NEXT:    v_mov_b32_e32 v3, v0
11154; GFX8-NEXT:    v_and_b32_e32 v0, -4, v3
11155; GFX8-NEXT:    flat_load_dword v5, v[0:1]
11156; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
11157; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
11158; GFX8-NEXT:    s_mov_b32 s4, 0xffff
11159; GFX8-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
11160; GFX8-NEXT:    v_not_b32_e32 v4, v4
11161; GFX8-NEXT:    s_mov_b64 s[4:5], 0
11162; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
11163; GFX8-NEXT:  .LBB46_1: ; %atomicrmw.start
11164; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
11165; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11166; GFX8-NEXT:    v_mov_b32_e32 v6, v5
11167; GFX8-NEXT:    v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
11168; GFX8-NEXT:    v_add_f32_e32 v5, v5, v2
11169; GFX8-NEXT:    v_bfe_u32 v8, v5, 16, 1
11170; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v5
11171; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x7fff, v8
11172; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v5
11173; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
11174; GFX8-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
11175; GFX8-NEXT:    v_and_b32_e32 v7, v6, v4
11176; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
11177; GFX8-NEXT:    v_or_b32_e32 v5, v7, v5
11178; GFX8-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
11179; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11180; GFX8-NEXT:    buffer_wbinvl1
11181; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
11182; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11183; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11184; GFX8-NEXT:    s_cbranch_execnz .LBB46_1
11185; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
11186; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
11187; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
11188; GFX8-NEXT:    s_setpc_b64 s[30:31]
11189;
11190; GFX7-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
11191; GFX7:       ; %bb.0:
11192; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11193; GFX7-NEXT:    v_mov_b32_e32 v3, v0
11194; GFX7-NEXT:    v_and_b32_e32 v0, -4, v3
11195; GFX7-NEXT:    flat_load_dword v5, v[0:1]
11196; GFX7-NEXT:    v_and_b32_e32 v3, 3, v3
11197; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
11198; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v3
11199; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
11200; GFX7-NEXT:    v_not_b32_e32 v4, v4
11201; GFX7-NEXT:    s_mov_b64 s[4:5], 0
11202; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
11203; GFX7-NEXT:  .LBB46_1: ; %atomicrmw.start
11204; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
11205; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11206; GFX7-NEXT:    v_mov_b32_e32 v6, v5
11207; GFX7-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
11208; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
11209; GFX7-NEXT:    v_add_f32_e32 v5, v5, v2
11210; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
11211; GFX7-NEXT:    v_and_b32_e32 v7, v6, v4
11212; GFX7-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
11213; GFX7-NEXT:    v_or_b32_e32 v5, v7, v5
11214; GFX7-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
11215; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11216; GFX7-NEXT:    buffer_wbinvl1
11217; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
11218; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11219; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11220; GFX7-NEXT:    s_cbranch_execnz .LBB46_1
11221; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
11222; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
11223; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
11224; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
11225; GFX7-NEXT:    s_setpc_b64 s[30:31]
11226  %result = atomicrmw fadd ptr %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
11227  ret bfloat %result
11228}
11229
; Returning bf16 atomicrmw fadd, syncscope("agent"), seq_cst, on a flat pointer
; advanced by 1023 bfloat elements and tagged !amdgpu.no.fine.grained.memory.
; In the checks below every target adds 0x7fe to the incoming address, masks it
; with -4 to address the containing dword, and loops on a 32-bit
; flat_atomic_cmpswap until the value it returns equals the expected value.
; The -mcpu=hawaii checks differ from the others: they contain no
; v_bfe_u32 / v_cmp_u_f32 sequence and shift the result left by 16 before
; returning.
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
11230define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
11231; GFX12-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
11232; GFX12:       ; %bb.0:
11233; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
11234; GFX12-NEXT:    s_wait_expcnt 0x0
11235; GFX12-NEXT:    s_wait_samplecnt 0x0
11236; GFX12-NEXT:    s_wait_bvhcnt 0x0
11237; GFX12-NEXT:    s_wait_kmcnt 0x0
11238; GFX12-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
11239; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
11240; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
11241; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
11242; GFX12-NEXT:    v_and_b32_e32 v0, -4, v3
11243; GFX12-NEXT:    v_and_b32_e32 v3, 3, v3
11244; GFX12-NEXT:    s_mov_b32 s0, 0
11245; GFX12-NEXT:    flat_load_b32 v5, v[0:1]
11246; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
11247; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
11248; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
11249; GFX12-NEXT:    v_not_b32_e32 v4, v4
11250; GFX12-NEXT:  .LBB47_1: ; %atomicrmw.start
11251; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
11252; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
11253; GFX12-NEXT:    v_mov_b32_e32 v6, v5
11254; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11255; GFX12-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
11256; GFX12-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
11257; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11258; GFX12-NEXT:    v_add_f32_e32 v5, v5, v2
11259; GFX12-NEXT:    v_bfe_u32 v7, v5, 16, 1
11260; GFX12-NEXT:    v_or_b32_e32 v8, 0x400000, v5
11261; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
11262; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
11263; GFX12-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
11264; GFX12-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc_lo
11265; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11266; GFX12-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
11267; GFX12-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
11268; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
11269; GFX12-NEXT:    v_and_or_b32 v5, v6, v4, v5
11270; GFX12-NEXT:    s_wait_storecnt 0x0
11271; GFX12-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
11272; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
11273; GFX12-NEXT:    global_inv scope:SCOPE_DEV
11274; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
11275; GFX12-NEXT:    s_wait_alu 0xfffe
11276; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
11277; GFX12-NEXT:    s_wait_alu 0xfffe
11278; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
11279; GFX12-NEXT:    s_cbranch_execnz .LBB47_1
11280; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
11281; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
11282; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
11283; GFX12-NEXT:    s_wait_alu 0xfffe
11284; GFX12-NEXT:    s_setpc_b64 s[30:31]
11285;
11286; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
11287; GFX940:       ; %bb.0:
11288; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11289; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
11290; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
11291; GFX940-NEXT:    v_and_b32_e32 v0, -4, v4
11292; GFX940-NEXT:    v_mov_b32_e32 v1, v5
11293; GFX940-NEXT:    flat_load_dword v5, v[0:1]
11294; GFX940-NEXT:    v_and_b32_e32 v3, 3, v4
11295; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
11296; GFX940-NEXT:    s_mov_b32 s0, 0xffff
11297; GFX940-NEXT:    v_lshlrev_b32_e64 v4, v3, s0
11298; GFX940-NEXT:    v_not_b32_e32 v4, v4
11299; GFX940-NEXT:    s_mov_b64 s[0:1], 0
11300; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
11301; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
11302; GFX940-NEXT:  .LBB47_1: ; %atomicrmw.start
11303; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
11304; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11305; GFX940-NEXT:    v_mov_b32_e32 v7, v5
11306; GFX940-NEXT:    v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
11307; GFX940-NEXT:    s_nop 0
11308; GFX940-NEXT:    v_add_f32_e32 v5, v5, v2
11309; GFX940-NEXT:    v_bfe_u32 v6, v5, 16, 1
11310; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v5
11311; GFX940-NEXT:    v_add3_u32 v6, v6, v5, s2
11312; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
11313; GFX940-NEXT:    s_nop 1
11314; GFX940-NEXT:    v_cndmask_b32_e32 v5, v6, v8, vcc
11315; GFX940-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
11316; GFX940-NEXT:    v_and_or_b32 v6, v7, v4, v5
11317; GFX940-NEXT:    buffer_wbl2 sc1
11318; GFX940-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0
11319; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11320; GFX940-NEXT:    buffer_inv sc1
11321; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
11322; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
11323; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
11324; GFX940-NEXT:    s_cbranch_execnz .LBB47_1
11325; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
11326; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
11327; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
11328; GFX940-NEXT:    s_setpc_b64 s[30:31]
11329;
11330; GFX11-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
11331; GFX11:       ; %bb.0:
11332; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11333; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
11334; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
11335; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
11336; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
11337; GFX11-NEXT:    v_and_b32_e32 v0, -4, v3
11338; GFX11-NEXT:    v_and_b32_e32 v3, 3, v3
11339; GFX11-NEXT:    s_mov_b32 s0, 0
11340; GFX11-NEXT:    flat_load_b32 v5, v[0:1]
11341; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
11342; GFX11-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
11343; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
11344; GFX11-NEXT:    v_not_b32_e32 v4, v4
11345; GFX11-NEXT:    .p2align 6
11346; GFX11-NEXT:  .LBB47_1: ; %atomicrmw.start
11347; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
11348; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11349; GFX11-NEXT:    v_mov_b32_e32 v6, v5
11350; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11351; GFX11-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
11352; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
11353; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11354; GFX11-NEXT:    v_add_f32_e32 v5, v5, v2
11355; GFX11-NEXT:    v_bfe_u32 v7, v5, 16, 1
11356; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v5
11357; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
11358; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
11359; GFX11-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
11360; GFX11-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc_lo
11361; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11362; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
11363; GFX11-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
11364; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
11365; GFX11-NEXT:    v_and_or_b32 v5, v6, v4, v5
11366; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
11367; GFX11-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
11368; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11369; GFX11-NEXT:    buffer_gl1_inv
11370; GFX11-NEXT:    buffer_gl0_inv
11371; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
11372; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
11373; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
11374; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
11375; GFX11-NEXT:    s_cbranch_execnz .LBB47_1
11376; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
11377; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
11378; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
11379; GFX11-NEXT:    s_setpc_b64 s[30:31]
11380;
11381; GFX10-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
11382; GFX10:       ; %bb.0:
11383; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11384; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
11385; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
11386; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
11387; GFX10-NEXT:    v_and_b32_e32 v0, -4, v3
11388; GFX10-NEXT:    v_and_b32_e32 v3, 3, v3
11389; GFX10-NEXT:    s_mov_b32 s4, 0
11390; GFX10-NEXT:    flat_load_dword v5, v[0:1]
11391; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
11392; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
11393; GFX10-NEXT:    v_not_b32_e32 v4, v4
11394; GFX10-NEXT:  .LBB47_1: ; %atomicrmw.start
11395; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
11396; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11397; GFX10-NEXT:    v_mov_b32_e32 v6, v5
11398; GFX10-NEXT:    v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
11399; GFX10-NEXT:    v_add_f32_e32 v5, v5, v2
11400; GFX10-NEXT:    v_bfe_u32 v7, v5, 16, 1
11401; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v5
11402; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
11403; GFX10-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
11404; GFX10-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc_lo
11405; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
11406; GFX10-NEXT:    v_and_or_b32 v5, v6, v4, v5
11407; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
11408; GFX10-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
11409; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11410; GFX10-NEXT:    buffer_gl1_inv
11411; GFX10-NEXT:    buffer_gl0_inv
11412; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
11413; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
11414; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
11415; GFX10-NEXT:    s_cbranch_execnz .LBB47_1
11416; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
11417; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
11418; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
11419; GFX10-NEXT:    s_setpc_b64 s[30:31]
11420;
11421; GFX90A-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
11422; GFX90A:       ; %bb.0:
11423; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11424; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
11425; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
11426; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v3
11427; GFX90A-NEXT:    flat_load_dword v5, v[0:1]
11428; GFX90A-NEXT:    v_and_b32_e32 v3, 3, v3
11429; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
11430; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
11431; GFX90A-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
11432; GFX90A-NEXT:    v_not_b32_e32 v4, v4
11433; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
11434; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
11435; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
11436; GFX90A-NEXT:  .LBB47_1: ; %atomicrmw.start
11437; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
11438; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11439; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
11440; GFX90A-NEXT:    v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
11441; GFX90A-NEXT:    v_add_f32_e32 v5, v5, v2
11442; GFX90A-NEXT:    v_bfe_u32 v6, v5, 16, 1
11443; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v5
11444; GFX90A-NEXT:    v_add3_u32 v6, v6, v5, s6
11445; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
11446; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v6, v8, vcc
11447; GFX90A-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
11448; GFX90A-NEXT:    v_and_or_b32 v6, v7, v4, v5
11449; GFX90A-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
11450; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11451; GFX90A-NEXT:    buffer_wbinvl1
11452; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
11453; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11454; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11455; GFX90A-NEXT:    s_cbranch_execnz .LBB47_1
11456; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
11457; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
11458; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
11459; GFX90A-NEXT:    s_setpc_b64 s[30:31]
11460;
11461; GFX908-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
11462; GFX908:       ; %bb.0:
11463; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11464; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
11465; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
11466; GFX908-NEXT:    v_and_b32_e32 v0, -4, v3
11467; GFX908-NEXT:    flat_load_dword v5, v[0:1]
11468; GFX908-NEXT:    v_and_b32_e32 v3, 3, v3
11469; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
11470; GFX908-NEXT:    s_mov_b32 s4, 0xffff
11471; GFX908-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
11472; GFX908-NEXT:    v_not_b32_e32 v4, v4
11473; GFX908-NEXT:    s_mov_b64 s[4:5], 0
11474; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
11475; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
11476; GFX908-NEXT:  .LBB47_1: ; %atomicrmw.start
11477; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
11478; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11479; GFX908-NEXT:    v_mov_b32_e32 v6, v5
11480; GFX908-NEXT:    v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
11481; GFX908-NEXT:    v_add_f32_e32 v5, v5, v2
11482; GFX908-NEXT:    v_bfe_u32 v7, v5, 16, 1
11483; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v5
11484; GFX908-NEXT:    v_add3_u32 v7, v7, v5, s6
11485; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
11486; GFX908-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc
11487; GFX908-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
11488; GFX908-NEXT:    v_and_or_b32 v5, v6, v4, v5
11489; GFX908-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
11490; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11491; GFX908-NEXT:    buffer_wbinvl1
11492; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
11493; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11494; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11495; GFX908-NEXT:    s_cbranch_execnz .LBB47_1
11496; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
11497; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
11498; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
11499; GFX908-NEXT:    s_setpc_b64 s[30:31]
11500;
11501; GFX8-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
11502; GFX8:       ; %bb.0:
11503; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11504; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
11505; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
11506; GFX8-NEXT:    v_and_b32_e32 v0, -4, v3
11507; GFX8-NEXT:    flat_load_dword v5, v[0:1]
11508; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
11509; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
11510; GFX8-NEXT:    s_mov_b32 s4, 0xffff
11511; GFX8-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
11512; GFX8-NEXT:    v_not_b32_e32 v4, v4
11513; GFX8-NEXT:    s_mov_b64 s[4:5], 0
11514; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
11515; GFX8-NEXT:  .LBB47_1: ; %atomicrmw.start
11516; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
11517; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11518; GFX8-NEXT:    v_mov_b32_e32 v6, v5
11519; GFX8-NEXT:    v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
11520; GFX8-NEXT:    v_add_f32_e32 v5, v5, v2
11521; GFX8-NEXT:    v_bfe_u32 v8, v5, 16, 1
11522; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v5
11523; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x7fff, v8
11524; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v5
11525; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
11526; GFX8-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
11527; GFX8-NEXT:    v_and_b32_e32 v7, v6, v4
11528; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
11529; GFX8-NEXT:    v_or_b32_e32 v5, v7, v5
11530; GFX8-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
11531; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11532; GFX8-NEXT:    buffer_wbinvl1
11533; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
11534; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11535; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11536; GFX8-NEXT:    s_cbranch_execnz .LBB47_1
11537; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
11538; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
11539; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
11540; GFX8-NEXT:    s_setpc_b64 s[30:31]
11541;
11542; GFX7-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
11543; GFX7:       ; %bb.0:
11544; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11545; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 0x7fe, v0
11546; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
11547; GFX7-NEXT:    v_and_b32_e32 v0, -4, v3
11548; GFX7-NEXT:    flat_load_dword v5, v[0:1]
11549; GFX7-NEXT:    v_and_b32_e32 v3, 3, v3
11550; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
11551; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v3
11552; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
11553; GFX7-NEXT:    v_not_b32_e32 v4, v4
11554; GFX7-NEXT:    s_mov_b64 s[4:5], 0
11555; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
11556; GFX7-NEXT:  .LBB47_1: ; %atomicrmw.start
11557; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
11558; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11559; GFX7-NEXT:    v_mov_b32_e32 v6, v5
11560; GFX7-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
11561; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
11562; GFX7-NEXT:    v_add_f32_e32 v5, v5, v2
11563; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
11564; GFX7-NEXT:    v_and_b32_e32 v7, v6, v4
11565; GFX7-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
11566; GFX7-NEXT:    v_or_b32_e32 v5, v7, v5
11567; GFX7-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
11568; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11569; GFX7-NEXT:    buffer_wbinvl1
11570; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
11571; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11572; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11573; GFX7-NEXT:    s_cbranch_execnz .LBB47_1
11574; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
11575; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
11576; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
11577; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
11578; GFX7-NEXT:    s_setpc_b64 s[30:31]
; 1023 x bfloat (2 bytes each) = byte offset 2046 = 0x7fe, the constant that
; appears in the address computation of every target's checks.
11579  %gep = getelementptr bfloat, ptr %ptr, i64 1023
11580  %result = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
11581  ret bfloat %result
11582}
11583
; Value-returning bfloat atomicrmw fadd through a flat pointer at agent scope,
; seq_cst, tagged !amdgpu.no.fine.grained.memory, addressed 1024 bfloat elements
; below %ptr (a -2048 byte displacement).
;
; In every run configuration checked below the operation is expanded into a
; 32-bit compare-and-swap loop on the dword that contains the bfloat: the
; address is masked with -4, the low two address bits are turned into a bit
; shift and a 16-bit lane mask, and flat_atomic_cmpswap is retried until the
; value it returns equals the value that was expected.
;
; The assertion lines between the define line and the IR body are produced by
; utils/update_llc_test_checks.py (see the note on the first line of this
; file); regenerate them with that script instead of editing them by hand.
11584define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
11585; GFX12-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
11586; GFX12:       ; %bb.0:
11587; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
11588; GFX12-NEXT:    s_wait_expcnt 0x0
11589; GFX12-NEXT:    s_wait_samplecnt 0x0
11590; GFX12-NEXT:    s_wait_bvhcnt 0x0
11591; GFX12-NEXT:    s_wait_kmcnt 0x0
11592; GFX12-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
11593; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
11594; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
11595; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
11596; GFX12-NEXT:    v_and_b32_e32 v0, -4, v3
11597; GFX12-NEXT:    v_and_b32_e32 v3, 3, v3
11598; GFX12-NEXT:    s_mov_b32 s0, 0
11599; GFX12-NEXT:    flat_load_b32 v5, v[0:1]
11600; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
11601; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
11602; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
11603; GFX12-NEXT:    v_not_b32_e32 v4, v4
11604; GFX12-NEXT:  .LBB48_1: ; %atomicrmw.start
11605; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
11606; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
11607; GFX12-NEXT:    v_mov_b32_e32 v6, v5
11608; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11609; GFX12-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
11610; GFX12-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
11611; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11612; GFX12-NEXT:    v_add_f32_e32 v5, v5, v2
11613; GFX12-NEXT:    v_bfe_u32 v7, v5, 16, 1
11614; GFX12-NEXT:    v_or_b32_e32 v8, 0x400000, v5
11615; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
11616; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
11617; GFX12-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
11618; GFX12-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc_lo
11619; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11620; GFX12-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
11621; GFX12-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
11622; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
11623; GFX12-NEXT:    v_and_or_b32 v5, v6, v4, v5
11624; GFX12-NEXT:    s_wait_storecnt 0x0
11625; GFX12-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
11626; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
11627; GFX12-NEXT:    global_inv scope:SCOPE_DEV
11628; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
11629; GFX12-NEXT:    s_wait_alu 0xfffe
11630; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
11631; GFX12-NEXT:    s_wait_alu 0xfffe
11632; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
11633; GFX12-NEXT:    s_cbranch_execnz .LBB48_1
11634; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
11635; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
11636; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
11637; GFX12-NEXT:    s_wait_alu 0xfffe
11638; GFX12-NEXT:    s_setpc_b64 s[30:31]
11639;
11640; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
11641; GFX940:       ; %bb.0:
11642; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11643; GFX940-NEXT:    s_movk_i32 s0, 0xf800
11644; GFX940-NEXT:    s_mov_b32 s1, -1
11645; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
11646; GFX940-NEXT:    v_and_b32_e32 v0, -4, v4
11647; GFX940-NEXT:    v_mov_b32_e32 v1, v5
11648; GFX940-NEXT:    flat_load_dword v5, v[0:1]
11649; GFX940-NEXT:    v_and_b32_e32 v3, 3, v4
11650; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
11651; GFX940-NEXT:    s_mov_b32 s0, 0xffff
11652; GFX940-NEXT:    v_lshlrev_b32_e64 v4, v3, s0
11653; GFX940-NEXT:    v_not_b32_e32 v4, v4
11654; GFX940-NEXT:    s_mov_b64 s[0:1], 0
11655; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
11656; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
11657; GFX940-NEXT:  .LBB48_1: ; %atomicrmw.start
11658; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
11659; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11660; GFX940-NEXT:    v_mov_b32_e32 v7, v5
11661; GFX940-NEXT:    v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
11662; GFX940-NEXT:    s_nop 0
11663; GFX940-NEXT:    v_add_f32_e32 v5, v5, v2
11664; GFX940-NEXT:    v_bfe_u32 v6, v5, 16, 1
11665; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v5
11666; GFX940-NEXT:    v_add3_u32 v6, v6, v5, s2
11667; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
11668; GFX940-NEXT:    s_nop 1
11669; GFX940-NEXT:    v_cndmask_b32_e32 v5, v6, v8, vcc
11670; GFX940-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
11671; GFX940-NEXT:    v_and_or_b32 v6, v7, v4, v5
11672; GFX940-NEXT:    buffer_wbl2 sc1
11673; GFX940-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0
11674; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11675; GFX940-NEXT:    buffer_inv sc1
11676; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
11677; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
11678; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
11679; GFX940-NEXT:    s_cbranch_execnz .LBB48_1
11680; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
11681; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
11682; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
11683; GFX940-NEXT:    s_setpc_b64 s[30:31]
11684;
11685; GFX11-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
11686; GFX11:       ; %bb.0:
11687; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11688; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
11689; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
11690; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
11691; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
11692; GFX11-NEXT:    v_and_b32_e32 v0, -4, v3
11693; GFX11-NEXT:    v_and_b32_e32 v3, 3, v3
11694; GFX11-NEXT:    s_mov_b32 s0, 0
11695; GFX11-NEXT:    flat_load_b32 v5, v[0:1]
11696; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
11697; GFX11-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
11698; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
11699; GFX11-NEXT:    v_not_b32_e32 v4, v4
11700; GFX11-NEXT:    .p2align 6
11701; GFX11-NEXT:  .LBB48_1: ; %atomicrmw.start
11702; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
11703; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11704; GFX11-NEXT:    v_mov_b32_e32 v6, v5
11705; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11706; GFX11-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
11707; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
11708; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11709; GFX11-NEXT:    v_add_f32_e32 v5, v5, v2
11710; GFX11-NEXT:    v_bfe_u32 v7, v5, 16, 1
11711; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v5
11712; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
11713; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
11714; GFX11-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
11715; GFX11-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc_lo
11716; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11717; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
11718; GFX11-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
11719; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
11720; GFX11-NEXT:    v_and_or_b32 v5, v6, v4, v5
11721; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
11722; GFX11-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
11723; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11724; GFX11-NEXT:    buffer_gl1_inv
11725; GFX11-NEXT:    buffer_gl0_inv
11726; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
11727; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
11728; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
11729; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
11730; GFX11-NEXT:    s_cbranch_execnz .LBB48_1
11731; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
11732; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
11733; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
11734; GFX11-NEXT:    s_setpc_b64 s[30:31]
11735;
11736; GFX10-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
11737; GFX10:       ; %bb.0:
11738; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11739; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
11740; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
11741; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
11742; GFX10-NEXT:    v_and_b32_e32 v0, -4, v3
11743; GFX10-NEXT:    v_and_b32_e32 v3, 3, v3
11744; GFX10-NEXT:    s_mov_b32 s4, 0
11745; GFX10-NEXT:    flat_load_dword v5, v[0:1]
11746; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
11747; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
11748; GFX10-NEXT:    v_not_b32_e32 v4, v4
11749; GFX10-NEXT:  .LBB48_1: ; %atomicrmw.start
11750; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
11751; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11752; GFX10-NEXT:    v_mov_b32_e32 v6, v5
11753; GFX10-NEXT:    v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
11754; GFX10-NEXT:    v_add_f32_e32 v5, v5, v2
11755; GFX10-NEXT:    v_bfe_u32 v7, v5, 16, 1
11756; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v5
11757; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
11758; GFX10-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
11759; GFX10-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc_lo
11760; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
11761; GFX10-NEXT:    v_and_or_b32 v5, v6, v4, v5
11762; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
11763; GFX10-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
11764; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11765; GFX10-NEXT:    buffer_gl1_inv
11766; GFX10-NEXT:    buffer_gl0_inv
11767; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
11768; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
11769; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
11770; GFX10-NEXT:    s_cbranch_execnz .LBB48_1
11771; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
11772; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
11773; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
11774; GFX10-NEXT:    s_setpc_b64 s[30:31]
11775;
11776; GFX90A-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
11777; GFX90A:       ; %bb.0:
11778; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11779; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
11780; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
11781; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v3
11782; GFX90A-NEXT:    flat_load_dword v5, v[0:1]
11783; GFX90A-NEXT:    v_and_b32_e32 v3, 3, v3
11784; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
11785; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
11786; GFX90A-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
11787; GFX90A-NEXT:    v_not_b32_e32 v4, v4
11788; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
11789; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
11790; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
11791; GFX90A-NEXT:  .LBB48_1: ; %atomicrmw.start
11792; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
11793; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11794; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
11795; GFX90A-NEXT:    v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
11796; GFX90A-NEXT:    v_add_f32_e32 v5, v5, v2
11797; GFX90A-NEXT:    v_bfe_u32 v6, v5, 16, 1
11798; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v5
11799; GFX90A-NEXT:    v_add3_u32 v6, v6, v5, s6
11800; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
11801; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v6, v8, vcc
11802; GFX90A-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
11803; GFX90A-NEXT:    v_and_or_b32 v6, v7, v4, v5
11804; GFX90A-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
11805; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11806; GFX90A-NEXT:    buffer_wbinvl1
11807; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
11808; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11809; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11810; GFX90A-NEXT:    s_cbranch_execnz .LBB48_1
11811; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
11812; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
11813; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
11814; GFX90A-NEXT:    s_setpc_b64 s[30:31]
11815;
11816; GFX908-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
11817; GFX908:       ; %bb.0:
11818; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11819; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
11820; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
11821; GFX908-NEXT:    v_and_b32_e32 v0, -4, v3
11822; GFX908-NEXT:    flat_load_dword v5, v[0:1]
11823; GFX908-NEXT:    v_and_b32_e32 v3, 3, v3
11824; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
11825; GFX908-NEXT:    s_mov_b32 s4, 0xffff
11826; GFX908-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
11827; GFX908-NEXT:    v_not_b32_e32 v4, v4
11828; GFX908-NEXT:    s_mov_b64 s[4:5], 0
11829; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
11830; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
11831; GFX908-NEXT:  .LBB48_1: ; %atomicrmw.start
11832; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
11833; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11834; GFX908-NEXT:    v_mov_b32_e32 v6, v5
11835; GFX908-NEXT:    v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
11836; GFX908-NEXT:    v_add_f32_e32 v5, v5, v2
11837; GFX908-NEXT:    v_bfe_u32 v7, v5, 16, 1
11838; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v5
11839; GFX908-NEXT:    v_add3_u32 v7, v7, v5, s6
11840; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
11841; GFX908-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc
11842; GFX908-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
11843; GFX908-NEXT:    v_and_or_b32 v5, v6, v4, v5
11844; GFX908-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
11845; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11846; GFX908-NEXT:    buffer_wbinvl1
11847; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
11848; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11849; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11850; GFX908-NEXT:    s_cbranch_execnz .LBB48_1
11851; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
11852; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
11853; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
11854; GFX908-NEXT:    s_setpc_b64 s[30:31]
11855;
11856; GFX8-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
11857; GFX8:       ; %bb.0:
11858; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11859; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
11860; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
11861; GFX8-NEXT:    v_and_b32_e32 v0, -4, v3
11862; GFX8-NEXT:    flat_load_dword v5, v[0:1]
11863; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
11864; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
11865; GFX8-NEXT:    s_mov_b32 s4, 0xffff
11866; GFX8-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
11867; GFX8-NEXT:    v_not_b32_e32 v4, v4
11868; GFX8-NEXT:    s_mov_b64 s[4:5], 0
11869; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
11870; GFX8-NEXT:  .LBB48_1: ; %atomicrmw.start
11871; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
11872; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11873; GFX8-NEXT:    v_mov_b32_e32 v6, v5
11874; GFX8-NEXT:    v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
11875; GFX8-NEXT:    v_add_f32_e32 v5, v5, v2
11876; GFX8-NEXT:    v_bfe_u32 v8, v5, 16, 1
11877; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v5
11878; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x7fff, v8
11879; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v5
11880; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
11881; GFX8-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
11882; GFX8-NEXT:    v_and_b32_e32 v7, v6, v4
11883; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
11884; GFX8-NEXT:    v_or_b32_e32 v5, v7, v5
11885; GFX8-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
11886; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11887; GFX8-NEXT:    buffer_wbinvl1
11888; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
11889; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11890; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11891; GFX8-NEXT:    s_cbranch_execnz .LBB48_1
11892; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
11893; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
11894; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
11895; GFX8-NEXT:    s_setpc_b64 s[30:31]
11896;
11897; GFX7-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
11898; GFX7:       ; %bb.0:
11899; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11900; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 0xfffff800, v0
11901; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
11902; GFX7-NEXT:    v_and_b32_e32 v0, -4, v3
11903; GFX7-NEXT:    flat_load_dword v5, v[0:1]
11904; GFX7-NEXT:    v_and_b32_e32 v3, 3, v3
11905; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
11906; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v3
11907; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
11908; GFX7-NEXT:    v_not_b32_e32 v4, v4
11909; GFX7-NEXT:    s_mov_b64 s[4:5], 0
11910; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
11911; GFX7-NEXT:  .LBB48_1: ; %atomicrmw.start
11912; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
11913; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11914; GFX7-NEXT:    v_mov_b32_e32 v6, v5
11915; GFX7-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
11916; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
11917; GFX7-NEXT:    v_add_f32_e32 v5, v5, v2
11918; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
11919; GFX7-NEXT:    v_and_b32_e32 v7, v6, v4
11920; GFX7-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
11921; GFX7-NEXT:    v_or_b32_e32 v5, v7, v5
11922; GFX7-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
11923; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11924; GFX7-NEXT:    buffer_wbinvl1
11925; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
11926; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11927; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11928; GFX7-NEXT:    s_cbranch_execnz .LBB48_1
11929; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
11930; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
11931; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
11932; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
11933; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; Step 1024 bfloat elements (2048 bytes) back from %ptr; this is the
  ; 0xfffff800 / -1 addend pair that appears in the address arithmetic of the
  ; expected output above.
11934  %gep = getelementptr bfloat, ptr %ptr, i64 -1024
  ; Agent-scope, sequentially consistent floating-point add. The metadata
  ; attachment !0 is defined outside this function.
11935  %result = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
  ; atomicrmw produces the value that was in memory before the add; that old
  ; value is what the function returns.
11936  ret bfloat %result
11937 }
11938
11939define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
11940; GFX12-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
11941; GFX12:       ; %bb.0:
11942; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
11943; GFX12-NEXT:    s_wait_expcnt 0x0
11944; GFX12-NEXT:    s_wait_samplecnt 0x0
11945; GFX12-NEXT:    s_wait_bvhcnt 0x0
11946; GFX12-NEXT:    s_wait_kmcnt 0x0
11947; GFX12-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
11948; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
11949; GFX12-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
11950; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
11951; GFX12-NEXT:    v_and_b32_e32 v0, -4, v4
11952; GFX12-NEXT:    v_and_b32_e32 v4, 3, v4
11953; GFX12-NEXT:    s_mov_b32 s0, 0
11954; GFX12-NEXT:    flat_load_b32 v3, v[0:1]
11955; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
11956; GFX12-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
11957; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
11958; GFX12-NEXT:    v_not_b32_e32 v5, v5
11959; GFX12-NEXT:  .LBB49_1: ; %atomicrmw.start
11960; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
11961; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
11962; GFX12-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
11963; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11964; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
11965; GFX12-NEXT:    v_add_f32_e32 v2, v2, v6
11966; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
11967; GFX12-NEXT:    v_bfe_u32 v7, v2, 16, 1
11968; GFX12-NEXT:    v_or_b32_e32 v8, 0x400000, v2
11969; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
11970; GFX12-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
11971; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11972; GFX12-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc_lo
11973; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
11974; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11975; GFX12-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
11976; GFX12-NEXT:    v_and_or_b32 v2, v3, v5, v2
11977; GFX12-NEXT:    s_wait_storecnt 0x0
11978; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
11979; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
11980; GFX12-NEXT:    global_inv scope:SCOPE_DEV
11981; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
11982; GFX12-NEXT:    v_mov_b32_e32 v3, v2
11983; GFX12-NEXT:    s_wait_alu 0xfffe
11984; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
11985; GFX12-NEXT:    s_wait_alu 0xfffe
11986; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
11987; GFX12-NEXT:    s_cbranch_execnz .LBB49_1
11988; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
11989; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
11990; GFX12-NEXT:    s_wait_alu 0xfffe
11991; GFX12-NEXT:    s_setpc_b64 s[30:31]
11992;
11993; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
11994; GFX940:       ; %bb.0:
11995; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11996; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
11997; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
11998; GFX940-NEXT:    v_and_b32_e32 v0, -4, v4
11999; GFX940-NEXT:    v_mov_b32_e32 v1, v5
12000; GFX940-NEXT:    flat_load_dword v3, v[0:1]
12001; GFX940-NEXT:    v_and_b32_e32 v4, 3, v4
12002; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
12003; GFX940-NEXT:    s_mov_b32 s0, 0xffff
12004; GFX940-NEXT:    v_lshlrev_b32_e64 v5, v4, s0
12005; GFX940-NEXT:    v_not_b32_e32 v5, v5
12006; GFX940-NEXT:    s_mov_b64 s[0:1], 0
12007; GFX940-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
12008; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
12009; GFX940-NEXT:  .LBB49_1: ; %atomicrmw.start
12010; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
12011; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12012; GFX940-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
12013; GFX940-NEXT:    s_nop 0
12014; GFX940-NEXT:    v_add_f32_e32 v2, v2, v6
12015; GFX940-NEXT:    v_bfe_u32 v7, v2, 16, 1
12016; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v2
12017; GFX940-NEXT:    v_add3_u32 v7, v7, v2, s2
12018; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
12019; GFX940-NEXT:    s_nop 1
12020; GFX940-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc
12021; GFX940-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
12022; GFX940-NEXT:    v_and_or_b32 v2, v3, v5, v2
12023; GFX940-NEXT:    buffer_wbl2 sc1
12024; GFX940-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
12025; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12026; GFX940-NEXT:    buffer_inv sc1
12027; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
12028; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
12029; GFX940-NEXT:    v_mov_b32_e32 v3, v2
12030; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
12031; GFX940-NEXT:    s_cbranch_execnz .LBB49_1
12032; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
12033; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
12034; GFX940-NEXT:    s_setpc_b64 s[30:31]
12035;
12036; GFX11-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
12037; GFX11:       ; %bb.0:
12038; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12039; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
12040; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
12041; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
12042; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
12043; GFX11-NEXT:    v_and_b32_e32 v0, -4, v4
12044; GFX11-NEXT:    v_and_b32_e32 v4, 3, v4
12045; GFX11-NEXT:    s_mov_b32 s0, 0
12046; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
12047; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
12048; GFX11-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
12049; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
12050; GFX11-NEXT:    v_not_b32_e32 v5, v5
12051; GFX11-NEXT:    .p2align 6
12052; GFX11-NEXT:  .LBB49_1: ; %atomicrmw.start
12053; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
12054; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12055; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
12056; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12057; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
12058; GFX11-NEXT:    v_add_f32_e32 v2, v2, v6
12059; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
12060; GFX11-NEXT:    v_bfe_u32 v7, v2, 16, 1
12061; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v2
12062; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
12063; GFX11-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
12064; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12065; GFX11-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc_lo
12066; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
12067; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12068; GFX11-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
12069; GFX11-NEXT:    v_and_or_b32 v2, v3, v5, v2
12070; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
12071; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
12072; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12073; GFX11-NEXT:    buffer_gl1_inv
12074; GFX11-NEXT:    buffer_gl0_inv
12075; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
12076; GFX11-NEXT:    v_mov_b32_e32 v3, v2
12077; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
12078; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
12079; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
12080; GFX11-NEXT:    s_cbranch_execnz .LBB49_1
12081; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
12082; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
12083; GFX11-NEXT:    s_setpc_b64 s[30:31]
12084;
12085; GFX10-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
12086; GFX10:       ; %bb.0:
12087; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12088; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
12089; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
12090; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
12091; GFX10-NEXT:    v_and_b32_e32 v0, -4, v4
12092; GFX10-NEXT:    v_and_b32_e32 v4, 3, v4
12093; GFX10-NEXT:    s_mov_b32 s4, 0
12094; GFX10-NEXT:    flat_load_dword v3, v[0:1]
12095; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
12096; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
12097; GFX10-NEXT:    v_not_b32_e32 v5, v5
12098; GFX10-NEXT:  .LBB49_1: ; %atomicrmw.start
12099; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
12100; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12101; GFX10-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
12102; GFX10-NEXT:    v_add_f32_e32 v2, v2, v6
12103; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
12104; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v2
12105; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
12106; GFX10-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
12107; GFX10-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc_lo
12108; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
12109; GFX10-NEXT:    v_and_or_b32 v2, v3, v5, v2
12110; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
12111; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
12112; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12113; GFX10-NEXT:    buffer_gl1_inv
12114; GFX10-NEXT:    buffer_gl0_inv
12115; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
12116; GFX10-NEXT:    v_mov_b32_e32 v3, v2
12117; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
12118; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
12119; GFX10-NEXT:    s_cbranch_execnz .LBB49_1
12120; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
12121; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
12122; GFX10-NEXT:    s_setpc_b64 s[30:31]
12123;
12124; GFX90A-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
12125; GFX90A:       ; %bb.0:
12126; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12127; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
12128; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
12129; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v4
12130; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
12131; GFX90A-NEXT:    v_and_b32_e32 v4, 3, v4
12132; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
12133; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
12134; GFX90A-NEXT:    v_lshlrev_b32_e64 v5, v4, s4
12135; GFX90A-NEXT:    v_not_b32_e32 v5, v5
12136; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
12137; GFX90A-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
12138; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
12139; GFX90A-NEXT:  .LBB49_1: ; %atomicrmw.start
12140; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
12141; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12142; GFX90A-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
12143; GFX90A-NEXT:    v_add_f32_e32 v2, v2, v6
12144; GFX90A-NEXT:    v_bfe_u32 v7, v2, 16, 1
12145; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v2
12146; GFX90A-NEXT:    v_add3_u32 v7, v7, v2, s6
12147; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
12148; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc
12149; GFX90A-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
12150; GFX90A-NEXT:    v_and_or_b32 v2, v3, v5, v2
12151; GFX90A-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
12152; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12153; GFX90A-NEXT:    buffer_wbinvl1
12154; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
12155; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12156; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
12157; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12158; GFX90A-NEXT:    s_cbranch_execnz .LBB49_1
12159; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
12160; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
12161; GFX90A-NEXT:    s_setpc_b64 s[30:31]
12162;
12163; GFX908-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
12164; GFX908:       ; %bb.0:
12165; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12166; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
12167; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
12168; GFX908-NEXT:    v_and_b32_e32 v0, -4, v4
12169; GFX908-NEXT:    flat_load_dword v3, v[0:1]
12170; GFX908-NEXT:    v_and_b32_e32 v4, 3, v4
12171; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
12172; GFX908-NEXT:    s_mov_b32 s4, 0xffff
12173; GFX908-NEXT:    v_lshlrev_b32_e64 v5, v4, s4
12174; GFX908-NEXT:    v_not_b32_e32 v5, v5
12175; GFX908-NEXT:    s_mov_b64 s[4:5], 0
12176; GFX908-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
12177; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
12178; GFX908-NEXT:  .LBB49_1: ; %atomicrmw.start
12179; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
12180; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12181; GFX908-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
12182; GFX908-NEXT:    v_add_f32_e32 v2, v2, v6
12183; GFX908-NEXT:    v_bfe_u32 v7, v2, 16, 1
12184; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v2
12185; GFX908-NEXT:    v_add3_u32 v7, v7, v2, s6
12186; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
12187; GFX908-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc
12188; GFX908-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
12189; GFX908-NEXT:    v_and_or_b32 v2, v3, v5, v2
12190; GFX908-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
12191; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12192; GFX908-NEXT:    buffer_wbinvl1
12193; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
12194; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12195; GFX908-NEXT:    v_mov_b32_e32 v3, v2
12196; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12197; GFX908-NEXT:    s_cbranch_execnz .LBB49_1
12198; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
12199; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
12200; GFX908-NEXT:    s_setpc_b64 s[30:31]
12201;
12202; GFX8-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
12203; GFX8:       ; %bb.0:
12204; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12205; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fe, v0
12206; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
12207; GFX8-NEXT:    v_and_b32_e32 v0, -4, v4
12208; GFX8-NEXT:    flat_load_dword v3, v[0:1]
12209; GFX8-NEXT:    v_and_b32_e32 v4, 3, v4
12210; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
12211; GFX8-NEXT:    s_mov_b32 s4, 0xffff
12212; GFX8-NEXT:    v_lshlrev_b32_e64 v5, v4, s4
12213; GFX8-NEXT:    v_not_b32_e32 v5, v5
12214; GFX8-NEXT:    s_mov_b64 s[4:5], 0
12215; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
12216; GFX8-NEXT:  .LBB49_1: ; %atomicrmw.start
12217; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
12218; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12219; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
12220; GFX8-NEXT:    v_add_f32_e32 v2, v2, v6
12221; GFX8-NEXT:    v_bfe_u32 v8, v2, 16, 1
12222; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v2
12223; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x7fff, v8
12224; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v2
12225; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
12226; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v9, vcc
12227; GFX8-NEXT:    v_and_b32_e32 v7, v3, v5
12228; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
12229; GFX8-NEXT:    v_or_b32_e32 v2, v7, v2
12230; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
12231; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12232; GFX8-NEXT:    buffer_wbinvl1
12233; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
12234; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12235; GFX8-NEXT:    v_mov_b32_e32 v3, v2
12236; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12237; GFX8-NEXT:    s_cbranch_execnz .LBB49_1
12238; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
12239; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
12240; GFX8-NEXT:    s_setpc_b64 s[30:31]
12241;
12242; GFX7-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
12243; GFX7:       ; %bb.0:
12244; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12245; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
12246; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
12247; GFX7-NEXT:    v_and_b32_e32 v0, -4, v4
12248; GFX7-NEXT:    flat_load_dword v3, v[0:1]
12249; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
12250; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
12251; GFX7-NEXT:    v_lshl_b32_e32 v5, 0xffff, v4
12252; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
12253; GFX7-NEXT:    v_not_b32_e32 v5, v5
12254; GFX7-NEXT:    s_mov_b64 s[4:5], 0
12255; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
12256; GFX7-NEXT:  .LBB49_1: ; %atomicrmw.start
12257; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
12258; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12259; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
12260; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
12261; GFX7-NEXT:    v_add_f32_e32 v2, v2, v6
12262; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
12263; GFX7-NEXT:    v_and_b32_e32 v7, v3, v5
12264; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
12265; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
12266; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
12267; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12268; GFX7-NEXT:    buffer_wbinvl1
12269; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
12270; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12271; GFX7-NEXT:    v_mov_b32_e32 v3, v2
12272; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12273; GFX7-NEXT:    s_cbranch_execnz .LBB49_1
12274; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
12275; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
12276; GFX7-NEXT:    s_setpc_b64 s[30:31]
12277  %gep = getelementptr bfloat, ptr %ptr, i64 1023
12278  %unused = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
12279  ret void
12280}
12281
; Negative-offset, no-return bfloat case: an atomicrmw fadd on the address
; %ptr + (-1024 x bfloat), with syncscope("agent") seq_cst ordering and
; !amdgpu.no.fine.grained.memory metadata. The result (%unused) is discarded.
;
; For every check prefix below, the expected code applies the offset with
; explicit address arithmetic (the flat load and the flat cmpswap carry no
; immediate offset), masks the address with -4 before the initial flat load,
; and performs the update in a flat_atomic_cmpswap retry loop (.LBB50_1) that
; merges the new 16-bit value into the loaded 32-bit word.
;
; The check lines are autogenerated by utils/update_llc_test_checks.py (see the
; NOTE at the top of this file); regenerate them instead of editing by hand.
12282define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
12283; GFX12-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
12284; GFX12:       ; %bb.0:
12285; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
12286; GFX12-NEXT:    s_wait_expcnt 0x0
12287; GFX12-NEXT:    s_wait_samplecnt 0x0
12288; GFX12-NEXT:    s_wait_bvhcnt 0x0
12289; GFX12-NEXT:    s_wait_kmcnt 0x0
12290; GFX12-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
12291; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
12292; GFX12-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
12293; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
12294; GFX12-NEXT:    v_and_b32_e32 v0, -4, v4
12295; GFX12-NEXT:    v_and_b32_e32 v4, 3, v4
12296; GFX12-NEXT:    s_mov_b32 s0, 0
12297; GFX12-NEXT:    flat_load_b32 v3, v[0:1]
12298; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
12299; GFX12-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
12300; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
12301; GFX12-NEXT:    v_not_b32_e32 v5, v5
12302; GFX12-NEXT:  .LBB50_1: ; %atomicrmw.start
12303; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
12304; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
12305; GFX12-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
12306; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12307; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
12308; GFX12-NEXT:    v_add_f32_e32 v2, v2, v6
12309; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
12310; GFX12-NEXT:    v_bfe_u32 v7, v2, 16, 1
12311; GFX12-NEXT:    v_or_b32_e32 v8, 0x400000, v2
12312; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
12313; GFX12-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
12314; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12315; GFX12-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc_lo
12316; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
12317; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12318; GFX12-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
12319; GFX12-NEXT:    v_and_or_b32 v2, v3, v5, v2
12320; GFX12-NEXT:    s_wait_storecnt 0x0
12321; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
12322; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
12323; GFX12-NEXT:    global_inv scope:SCOPE_DEV
12324; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
12325; GFX12-NEXT:    v_mov_b32_e32 v3, v2
12326; GFX12-NEXT:    s_wait_alu 0xfffe
12327; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
12328; GFX12-NEXT:    s_wait_alu 0xfffe
12329; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
12330; GFX12-NEXT:    s_cbranch_execnz .LBB50_1
12331; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
12332; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
12333; GFX12-NEXT:    s_wait_alu 0xfffe
12334; GFX12-NEXT:    s_setpc_b64 s[30:31]
12335;
12336; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
12337; GFX940:       ; %bb.0:
12338; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12339; GFX940-NEXT:    s_movk_i32 s0, 0xf800
12340; GFX940-NEXT:    s_mov_b32 s1, -1
12341; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
12342; GFX940-NEXT:    v_and_b32_e32 v0, -4, v4
12343; GFX940-NEXT:    v_mov_b32_e32 v1, v5
12344; GFX940-NEXT:    flat_load_dword v3, v[0:1]
12345; GFX940-NEXT:    v_and_b32_e32 v4, 3, v4
12346; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
12347; GFX940-NEXT:    s_mov_b32 s0, 0xffff
12348; GFX940-NEXT:    v_lshlrev_b32_e64 v5, v4, s0
12349; GFX940-NEXT:    v_not_b32_e32 v5, v5
12350; GFX940-NEXT:    s_mov_b64 s[0:1], 0
12351; GFX940-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
12352; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
12353; GFX940-NEXT:  .LBB50_1: ; %atomicrmw.start
12354; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
12355; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12356; GFX940-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
12357; GFX940-NEXT:    s_nop 0
12358; GFX940-NEXT:    v_add_f32_e32 v2, v2, v6
12359; GFX940-NEXT:    v_bfe_u32 v7, v2, 16, 1
12360; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v2
12361; GFX940-NEXT:    v_add3_u32 v7, v7, v2, s2
12362; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
12363; GFX940-NEXT:    s_nop 1
12364; GFX940-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc
12365; GFX940-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
12366; GFX940-NEXT:    v_and_or_b32 v2, v3, v5, v2
12367; GFX940-NEXT:    buffer_wbl2 sc1
12368; GFX940-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
12369; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12370; GFX940-NEXT:    buffer_inv sc1
12371; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
12372; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
12373; GFX940-NEXT:    v_mov_b32_e32 v3, v2
12374; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
12375; GFX940-NEXT:    s_cbranch_execnz .LBB50_1
12376; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
12377; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
12378; GFX940-NEXT:    s_setpc_b64 s[30:31]
12379;
12380; GFX11-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
12381; GFX11:       ; %bb.0:
12382; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12383; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
12384; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
12385; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
12386; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
12387; GFX11-NEXT:    v_and_b32_e32 v0, -4, v4
12388; GFX11-NEXT:    v_and_b32_e32 v4, 3, v4
12389; GFX11-NEXT:    s_mov_b32 s0, 0
12390; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
12391; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
12392; GFX11-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
12393; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
12394; GFX11-NEXT:    v_not_b32_e32 v5, v5
12395; GFX11-NEXT:    .p2align 6
12396; GFX11-NEXT:  .LBB50_1: ; %atomicrmw.start
12397; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
12398; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12399; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
12400; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12401; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
12402; GFX11-NEXT:    v_add_f32_e32 v2, v2, v6
12403; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
12404; GFX11-NEXT:    v_bfe_u32 v7, v2, 16, 1
12405; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v2
12406; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
12407; GFX11-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
12408; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12409; GFX11-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc_lo
12410; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
12411; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12412; GFX11-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
12413; GFX11-NEXT:    v_and_or_b32 v2, v3, v5, v2
12414; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
12415; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
12416; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12417; GFX11-NEXT:    buffer_gl1_inv
12418; GFX11-NEXT:    buffer_gl0_inv
12419; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
12420; GFX11-NEXT:    v_mov_b32_e32 v3, v2
12421; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
12422; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
12423; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
12424; GFX11-NEXT:    s_cbranch_execnz .LBB50_1
12425; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
12426; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
12427; GFX11-NEXT:    s_setpc_b64 s[30:31]
12428;
12429; GFX10-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
12430; GFX10:       ; %bb.0:
12431; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12432; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
12433; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
12434; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
12435; GFX10-NEXT:    v_and_b32_e32 v0, -4, v4
12436; GFX10-NEXT:    v_and_b32_e32 v4, 3, v4
12437; GFX10-NEXT:    s_mov_b32 s4, 0
12438; GFX10-NEXT:    flat_load_dword v3, v[0:1]
12439; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
12440; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
12441; GFX10-NEXT:    v_not_b32_e32 v5, v5
12442; GFX10-NEXT:  .LBB50_1: ; %atomicrmw.start
12443; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
12444; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12445; GFX10-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
12446; GFX10-NEXT:    v_add_f32_e32 v2, v2, v6
12447; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
12448; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v2
12449; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
12450; GFX10-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
12451; GFX10-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc_lo
12452; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
12453; GFX10-NEXT:    v_and_or_b32 v2, v3, v5, v2
12454; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
12455; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
12456; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12457; GFX10-NEXT:    buffer_gl1_inv
12458; GFX10-NEXT:    buffer_gl0_inv
12459; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
12460; GFX10-NEXT:    v_mov_b32_e32 v3, v2
12461; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
12462; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
12463; GFX10-NEXT:    s_cbranch_execnz .LBB50_1
12464; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
12465; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
12466; GFX10-NEXT:    s_setpc_b64 s[30:31]
12467;
12468; GFX90A-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
12469; GFX90A:       ; %bb.0:
12470; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12471; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
12472; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
12473; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v4
12474; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
12475; GFX90A-NEXT:    v_and_b32_e32 v4, 3, v4
12476; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
12477; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
12478; GFX90A-NEXT:    v_lshlrev_b32_e64 v5, v4, s4
12479; GFX90A-NEXT:    v_not_b32_e32 v5, v5
12480; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
12481; GFX90A-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
12482; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
12483; GFX90A-NEXT:  .LBB50_1: ; %atomicrmw.start
12484; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
12485; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12486; GFX90A-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
12487; GFX90A-NEXT:    v_add_f32_e32 v2, v2, v6
12488; GFX90A-NEXT:    v_bfe_u32 v7, v2, 16, 1
12489; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v2
12490; GFX90A-NEXT:    v_add3_u32 v7, v7, v2, s6
12491; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
12492; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc
12493; GFX90A-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
12494; GFX90A-NEXT:    v_and_or_b32 v2, v3, v5, v2
12495; GFX90A-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
12496; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12497; GFX90A-NEXT:    buffer_wbinvl1
12498; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
12499; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12500; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
12501; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12502; GFX90A-NEXT:    s_cbranch_execnz .LBB50_1
12503; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
12504; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
12505; GFX90A-NEXT:    s_setpc_b64 s[30:31]
12506;
12507; GFX908-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
12508; GFX908:       ; %bb.0:
12509; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12510; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
12511; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
12512; GFX908-NEXT:    v_and_b32_e32 v0, -4, v4
12513; GFX908-NEXT:    flat_load_dword v3, v[0:1]
12514; GFX908-NEXT:    v_and_b32_e32 v4, 3, v4
12515; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
12516; GFX908-NEXT:    s_mov_b32 s4, 0xffff
12517; GFX908-NEXT:    v_lshlrev_b32_e64 v5, v4, s4
12518; GFX908-NEXT:    v_not_b32_e32 v5, v5
12519; GFX908-NEXT:    s_mov_b64 s[4:5], 0
12520; GFX908-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
12521; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
12522; GFX908-NEXT:  .LBB50_1: ; %atomicrmw.start
12523; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
12524; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12525; GFX908-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
12526; GFX908-NEXT:    v_add_f32_e32 v2, v2, v6
12527; GFX908-NEXT:    v_bfe_u32 v7, v2, 16, 1
12528; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v2
12529; GFX908-NEXT:    v_add3_u32 v7, v7, v2, s6
12530; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
12531; GFX908-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc
12532; GFX908-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
12533; GFX908-NEXT:    v_and_or_b32 v2, v3, v5, v2
12534; GFX908-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
12535; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12536; GFX908-NEXT:    buffer_wbinvl1
12537; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
12538; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12539; GFX908-NEXT:    v_mov_b32_e32 v3, v2
12540; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12541; GFX908-NEXT:    s_cbranch_execnz .LBB50_1
12542; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
12543; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
12544; GFX908-NEXT:    s_setpc_b64 s[30:31]
12545;
12546; GFX8-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
12547; GFX8:       ; %bb.0:
12548; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12549; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xfffff800, v0
12550; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
12551; GFX8-NEXT:    v_and_b32_e32 v0, -4, v4
12552; GFX8-NEXT:    flat_load_dword v3, v[0:1]
12553; GFX8-NEXT:    v_and_b32_e32 v4, 3, v4
12554; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
12555; GFX8-NEXT:    s_mov_b32 s4, 0xffff
12556; GFX8-NEXT:    v_lshlrev_b32_e64 v5, v4, s4
12557; GFX8-NEXT:    v_not_b32_e32 v5, v5
12558; GFX8-NEXT:    s_mov_b64 s[4:5], 0
12559; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
12560; GFX8-NEXT:  .LBB50_1: ; %atomicrmw.start
12561; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
12562; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12563; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
12564; GFX8-NEXT:    v_add_f32_e32 v2, v2, v6
12565; GFX8-NEXT:    v_bfe_u32 v8, v2, 16, 1
12566; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v2
12567; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x7fff, v8
12568; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v2
12569; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
12570; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v9, vcc
12571; GFX8-NEXT:    v_and_b32_e32 v7, v3, v5
12572; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
12573; GFX8-NEXT:    v_or_b32_e32 v2, v7, v2
12574; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
12575; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12576; GFX8-NEXT:    buffer_wbinvl1
12577; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
12578; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12579; GFX8-NEXT:    v_mov_b32_e32 v3, v2
12580; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12581; GFX8-NEXT:    s_cbranch_execnz .LBB50_1
12582; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
12583; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
12584; GFX8-NEXT:    s_setpc_b64 s[30:31]
12585;
12586; GFX7-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
12587; GFX7:       ; %bb.0:
12588; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12589; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
12590; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
12591; GFX7-NEXT:    v_and_b32_e32 v0, -4, v4
12592; GFX7-NEXT:    flat_load_dword v3, v[0:1]
12593; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
12594; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
12595; GFX7-NEXT:    v_lshl_b32_e32 v5, 0xffff, v4
12596; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
12597; GFX7-NEXT:    v_not_b32_e32 v5, v5
12598; GFX7-NEXT:    s_mov_b64 s[4:5], 0
12599; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
12600; GFX7-NEXT:  .LBB50_1: ; %atomicrmw.start
12601; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
12602; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12603; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
12604; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
12605; GFX7-NEXT:    v_add_f32_e32 v2, v2, v6
12606; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
12607; GFX7-NEXT:    v_and_b32_e32 v7, v3, v5
12608; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
12609; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
12610; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
12611; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12612; GFX7-NEXT:    buffer_wbinvl1
12613; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
12614; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12615; GFX7-NEXT:    v_mov_b32_e32 v3, v2
12616; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12617; GFX7-NEXT:    s_cbranch_execnz .LBB50_1
12618; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
12619; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
12620; GFX7-NEXT:    s_setpc_b64 s[30:31]
12621  %gep = getelementptr bfloat, ptr %ptr, i64 -1024
12622  %unused = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
12623  ret void
12624}
12625
12626define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
12627; GFX12-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
12628; GFX12:       ; %bb.0:
12629; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
12630; GFX12-NEXT:    s_wait_expcnt 0x0
12631; GFX12-NEXT:    s_wait_samplecnt 0x0
12632; GFX12-NEXT:    s_wait_bvhcnt 0x0
12633; GFX12-NEXT:    s_wait_kmcnt 0x0
12634; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2046
12635; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
12636; GFX12-NEXT:    s_mov_b32 s0, 0
12637; GFX12-NEXT:  .LBB51_1: ; %atomicrmw.start
12638; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
12639; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
12640; GFX12-NEXT:    v_mov_b32_e32 v4, v3
12641; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12642; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
12643; GFX12-NEXT:    v_add_f32_e32 v3, v3, v2
12644; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
12645; GFX12-NEXT:    v_bfe_u32 v5, v3, 16, 1
12646; GFX12-NEXT:    v_or_b32_e32 v6, 0x400000, v3
12647; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
12648; GFX12-NEXT:    v_add3_u32 v5, v5, v3, 0x7fff
12649; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12650; GFX12-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc_lo
12651; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
12652; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
12653; GFX12-NEXT:    v_and_or_b32 v3, 0xffff0000, v4, v3
12654; GFX12-NEXT:    s_wait_storecnt 0x0
12655; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
12656; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
12657; GFX12-NEXT:    global_inv scope:SCOPE_DEV
12658; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
12659; GFX12-NEXT:    s_wait_alu 0xfffe
12660; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
12661; GFX12-NEXT:    s_wait_alu 0xfffe
12662; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
12663; GFX12-NEXT:    s_cbranch_execnz .LBB51_1
12664; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
12665; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
12666; GFX12-NEXT:    v_mov_b32_e32 v0, v3
12667; GFX12-NEXT:    s_wait_alu 0xfffe
12668; GFX12-NEXT:    s_setpc_b64 s[30:31]
12669;
12670; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
12671; GFX940:       ; %bb.0:
12672; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12673; GFX940-NEXT:    flat_load_dword v3, v[0:1] offset:2046
12674; GFX940-NEXT:    s_mov_b64 s[0:1], 0
12675; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
12676; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
12677; GFX940-NEXT:    s_mov_b32 s3, 0xffff0000
12678; GFX940-NEXT:  .LBB51_1: ; %atomicrmw.start
12679; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
12680; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12681; GFX940-NEXT:    v_mov_b32_e32 v5, v3
12682; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
12683; GFX940-NEXT:    v_add_f32_e32 v3, v3, v2
12684; GFX940-NEXT:    v_bfe_u32 v4, v3, 16, 1
12685; GFX940-NEXT:    v_or_b32_e32 v6, 0x400000, v3
12686; GFX940-NEXT:    v_add3_u32 v4, v4, v3, s2
12687; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
12688; GFX940-NEXT:    s_nop 1
12689; GFX940-NEXT:    v_cndmask_b32_e32 v3, v4, v6, vcc
12690; GFX940-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
12691; GFX940-NEXT:    v_and_or_b32 v4, v5, s3, v3
12692; GFX940-NEXT:    buffer_wbl2 sc1
12693; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0
12694; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12695; GFX940-NEXT:    buffer_inv sc1
12696; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
12697; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
12698; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
12699; GFX940-NEXT:    s_cbranch_execnz .LBB51_1
12700; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
12701; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
12702; GFX940-NEXT:    v_mov_b32_e32 v0, v3
12703; GFX940-NEXT:    s_setpc_b64 s[30:31]
12704;
12705; GFX11-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
12706; GFX11:       ; %bb.0:
12707; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12708; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2046
12709; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
12710; GFX11-NEXT:    s_mov_b32 s0, 0
12711; GFX11-NEXT:    .p2align 6
12712; GFX11-NEXT:  .LBB51_1: ; %atomicrmw.start
12713; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
12714; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12715; GFX11-NEXT:    v_mov_b32_e32 v4, v3
12716; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12717; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
12718; GFX11-NEXT:    v_add_f32_e32 v3, v3, v2
12719; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
12720; GFX11-NEXT:    v_bfe_u32 v5, v3, 16, 1
12721; GFX11-NEXT:    v_or_b32_e32 v6, 0x400000, v3
12722; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
12723; GFX11-NEXT:    v_add3_u32 v5, v5, v3, 0x7fff
12724; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12725; GFX11-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc_lo
12726; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
12727; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
12728; GFX11-NEXT:    v_and_or_b32 v3, 0xffff0000, v4, v3
12729; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
12730; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
12731; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12732; GFX11-NEXT:    buffer_gl1_inv
12733; GFX11-NEXT:    buffer_gl0_inv
12734; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
12735; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
12736; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
12737; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
12738; GFX11-NEXT:    s_cbranch_execnz .LBB51_1
12739; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
12740; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
12741; GFX11-NEXT:    v_mov_b32_e32 v0, v3
12742; GFX11-NEXT:    s_setpc_b64 s[30:31]
12743;
12744; GFX10-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
12745; GFX10:       ; %bb.0:
12746; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12747; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
12748; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
12749; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
12750; GFX10-NEXT:    s_mov_b32 s4, 0
12751; GFX10-NEXT:    flat_load_dword v0, v[3:4]
12752; GFX10-NEXT:  .LBB51_1: ; %atomicrmw.start
12753; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
12754; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12755; GFX10-NEXT:    v_mov_b32_e32 v6, v0
12756; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
12757; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
12758; GFX10-NEXT:    v_bfe_u32 v2, v0, 16, 1
12759; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v0
12760; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
12761; GFX10-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
12762; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc_lo
12763; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
12764; GFX10-NEXT:    v_and_or_b32 v5, 0xffff0000, v6, v0
12765; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
12766; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
12767; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12768; GFX10-NEXT:    buffer_gl1_inv
12769; GFX10-NEXT:    buffer_gl0_inv
12770; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
12771; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
12772; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
12773; GFX10-NEXT:    s_cbranch_execnz .LBB51_1
12774; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
12775; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
12776; GFX10-NEXT:    s_setpc_b64 s[30:31]
12777;
12778; GFX90A-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
12779; GFX90A:       ; %bb.0:
12780; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12781; GFX90A-NEXT:    flat_load_dword v3, v[0:1] offset:2046
12782; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
12783; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
12784; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
12785; GFX90A-NEXT:    s_mov_b32 s7, 0xffff0000
12786; GFX90A-NEXT:  .LBB51_1: ; %atomicrmw.start
12787; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
12788; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12789; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
12790; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
12791; GFX90A-NEXT:    v_add_f32_e32 v3, v3, v2
12792; GFX90A-NEXT:    v_bfe_u32 v4, v3, 16, 1
12793; GFX90A-NEXT:    v_or_b32_e32 v6, 0x400000, v3
12794; GFX90A-NEXT:    v_add3_u32 v4, v4, v3, s6
12795; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
12796; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v4, v6, vcc
12797; GFX90A-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
12798; GFX90A-NEXT:    v_and_or_b32 v4, v5, s7, v3
12799; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc
12800; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12801; GFX90A-NEXT:    buffer_wbinvl1
12802; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
12803; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12804; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12805; GFX90A-NEXT:    s_cbranch_execnz .LBB51_1
12806; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
12807; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
12808; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
12809; GFX90A-NEXT:    s_setpc_b64 s[30:31]
12810;
12811; GFX908-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
12812; GFX908:       ; %bb.0:
12813; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12814; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2046
12815; GFX908-NEXT:    s_mov_b64 s[4:5], 0
12816; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
12817; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
12818; GFX908-NEXT:    s_mov_b32 s7, 0xffff0000
12819; GFX908-NEXT:  .LBB51_1: ; %atomicrmw.start
12820; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
12821; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12822; GFX908-NEXT:    v_mov_b32_e32 v4, v3
12823; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
12824; GFX908-NEXT:    v_add_f32_e32 v3, v3, v2
12825; GFX908-NEXT:    v_bfe_u32 v5, v3, 16, 1
12826; GFX908-NEXT:    v_or_b32_e32 v6, 0x400000, v3
12827; GFX908-NEXT:    v_add3_u32 v5, v5, v3, s6
12828; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
12829; GFX908-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
12830; GFX908-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
12831; GFX908-NEXT:    v_and_or_b32 v3, v4, s7, v3
12832; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc
12833; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12834; GFX908-NEXT:    buffer_wbinvl1
12835; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
12836; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12837; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12838; GFX908-NEXT:    s_cbranch_execnz .LBB51_1
12839; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
12840; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
12841; GFX908-NEXT:    v_mov_b32_e32 v0, v3
12842; GFX908-NEXT:    s_setpc_b64 s[30:31]
12843;
12844; GFX8-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
12845; GFX8:       ; %bb.0:
12846; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12847; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
12848; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
12849; GFX8-NEXT:    flat_load_dword v0, v[3:4]
12850; GFX8-NEXT:    s_mov_b64 s[4:5], 0
12851; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
12852; GFX8-NEXT:  .LBB51_1: ; %atomicrmw.start
12853; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
12854; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12855; GFX8-NEXT:    v_mov_b32_e32 v6, v0
12856; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
12857; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
12858; GFX8-NEXT:    v_bfe_u32 v5, v0, 16, 1
12859; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v0
12860; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
12861; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v0
12862; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
12863; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
12864; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v7, vcc
12865; GFX8-NEXT:    v_or_b32_sdwa v5, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
12866; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
12867; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12868; GFX8-NEXT:    buffer_wbinvl1
12869; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
12870; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12871; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12872; GFX8-NEXT:    s_cbranch_execnz .LBB51_1
12873; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
12874; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
12875; GFX8-NEXT:    s_setpc_b64 s[30:31]
12876;
12877; GFX7-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
12878; GFX7:       ; %bb.0:
12879; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12880; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fe, v0
12881; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
12882; GFX7-NEXT:    flat_load_dword v3, v[0:1]
12883; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
12884; GFX7-NEXT:    s_mov_b64 s[4:5], 0
12885; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
12886; GFX7-NEXT:  .LBB51_1: ; %atomicrmw.start
12887; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
12888; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12889; GFX7-NEXT:    v_mov_b32_e32 v4, v3
12890; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
12891; GFX7-NEXT:    v_add_f32_e32 v3, v3, v2
12892; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
12893; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
12894; GFX7-NEXT:    v_or_b32_e32 v3, v5, v3
12895; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
12896; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12897; GFX7-NEXT:    buffer_wbinvl1
12898; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
12899; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12900; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12901; GFX7-NEXT:    s_cbranch_execnz .LBB51_1
12902; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
12903; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
12904; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
12905; GFX7-NEXT:    s_setpc_b64 s[30:31]
12906  %gep = getelementptr bfloat, ptr %ptr, i64 1023
12907  %result = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
12908  ret bfloat %result
12909}
12910
; No-return bf16 `atomicrmw fadd` through a flat pointer at agent scope
; (seq_cst). The address is %ptr + 1023 bfloat elements = 2046 bytes (0x7fe),
; and the operation is marked `align 4`.
;
; On every checked target the operation is expanded to a 32-bit load followed
; by a flat_atomic_cmpswap retry loop (%atomicrmw.start). The loop recomputes
; the low 16 bits of the loaded dword and keeps the upper 16 bits unchanged
; (mask 0xffff0000).
;
; The 2046-byte offset is folded into the flat instructions on GFX12, GFX940,
; GFX11, GFX90A and GFX908; on GFX10, GFX8 and GFX7 it is added to the pointer
; before the loop.
;
; The check lines below are autogenerated by utils/update_llc_test_checks.py
; (see the NOTE at the top of the file); regenerate them instead of editing
; them by hand.
12911define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
12912; GFX12-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
12913; GFX12:       ; %bb.0:
12914; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
12915; GFX12-NEXT:    s_wait_expcnt 0x0
12916; GFX12-NEXT:    s_wait_samplecnt 0x0
12917; GFX12-NEXT:    s_wait_bvhcnt 0x0
12918; GFX12-NEXT:    s_wait_kmcnt 0x0
12919; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2046
12920; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
12921; GFX12-NEXT:    s_mov_b32 s0, 0
12922; GFX12-NEXT:  .LBB52_1: ; %atomicrmw.start
12923; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
12924; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
12925; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
12926; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12927; GFX12-NEXT:    v_add_f32_e32 v2, v2, v4
12928; GFX12-NEXT:    v_bfe_u32 v5, v2, 16, 1
12929; GFX12-NEXT:    v_or_b32_e32 v6, 0x400000, v2
12930; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
12931; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
12932; GFX12-NEXT:    v_add3_u32 v5, v5, v2, 0x7fff
12933; GFX12-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc_lo
12934; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12935; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
12936; GFX12-NEXT:    v_and_or_b32 v2, 0xffff0000, v3, v2
12937; GFX12-NEXT:    s_wait_storecnt 0x0
12938; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
12939; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
12940; GFX12-NEXT:    global_inv scope:SCOPE_DEV
12941; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
12942; GFX12-NEXT:    v_mov_b32_e32 v3, v2
12943; GFX12-NEXT:    s_wait_alu 0xfffe
12944; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
12945; GFX12-NEXT:    s_wait_alu 0xfffe
12946; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
12947; GFX12-NEXT:    s_cbranch_execnz .LBB52_1
12948; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
12949; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
12950; GFX12-NEXT:    s_wait_alu 0xfffe
12951; GFX12-NEXT:    s_setpc_b64 s[30:31]
12952;
12953; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
12954; GFX940:       ; %bb.0:
12955; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12956; GFX940-NEXT:    flat_load_dword v3, v[0:1] offset:2046
12957; GFX940-NEXT:    s_mov_b64 s[0:1], 0
12958; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
12959; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
12960; GFX940-NEXT:    s_mov_b32 s3, 0xffff0000
12961; GFX940-NEXT:  .LBB52_1: ; %atomicrmw.start
12962; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
12963; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12964; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
12965; GFX940-NEXT:    v_add_f32_e32 v2, v2, v4
12966; GFX940-NEXT:    v_bfe_u32 v5, v2, 16, 1
12967; GFX940-NEXT:    v_or_b32_e32 v6, 0x400000, v2
12968; GFX940-NEXT:    v_add3_u32 v5, v5, v2, s2
12969; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
12970; GFX940-NEXT:    s_nop 1
12971; GFX940-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
12972; GFX940-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
12973; GFX940-NEXT:    v_and_or_b32 v2, v3, s3, v2
12974; GFX940-NEXT:    buffer_wbl2 sc1
12975; GFX940-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0
12976; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12977; GFX940-NEXT:    buffer_inv sc1
12978; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
12979; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
12980; GFX940-NEXT:    v_mov_b32_e32 v3, v2
12981; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
12982; GFX940-NEXT:    s_cbranch_execnz .LBB52_1
12983; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
12984; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
12985; GFX940-NEXT:    s_setpc_b64 s[30:31]
12986;
12987; GFX11-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
12988; GFX11:       ; %bb.0:
12989; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12990; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2046
12991; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
12992; GFX11-NEXT:    s_mov_b32 s0, 0
12993; GFX11-NEXT:    .p2align 6
12994; GFX11-NEXT:  .LBB52_1: ; %atomicrmw.start
12995; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
12996; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12997; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
12998; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12999; GFX11-NEXT:    v_add_f32_e32 v2, v2, v4
13000; GFX11-NEXT:    v_bfe_u32 v5, v2, 16, 1
13001; GFX11-NEXT:    v_or_b32_e32 v6, 0x400000, v2
13002; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
13003; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
13004; GFX11-NEXT:    v_add3_u32 v5, v5, v2, 0x7fff
13005; GFX11-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc_lo
13006; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
13007; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
13008; GFX11-NEXT:    v_and_or_b32 v2, 0xffff0000, v3, v2
13009; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
13010; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc
13011; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13012; GFX11-NEXT:    buffer_gl1_inv
13013; GFX11-NEXT:    buffer_gl0_inv
13014; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
13015; GFX11-NEXT:    v_mov_b32_e32 v3, v2
13016; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
13017; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
13018; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
13019; GFX11-NEXT:    s_cbranch_execnz .LBB52_1
13020; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
13021; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
13022; GFX11-NEXT:    s_setpc_b64 s[30:31]
13023;
13024; GFX10-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
13025; GFX10:       ; %bb.0:
13026; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13027; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fe, v0
13028; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
13029; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
13030; GFX10-NEXT:    s_mov_b32 s4, 0
13031; GFX10-NEXT:    flat_load_dword v3, v[0:1]
13032; GFX10-NEXT:  .LBB52_1: ; %atomicrmw.start
13033; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
13034; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13035; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
13036; GFX10-NEXT:    v_add_f32_e32 v2, v2, v4
13037; GFX10-NEXT:    v_bfe_u32 v5, v2, 16, 1
13038; GFX10-NEXT:    v_or_b32_e32 v6, 0x400000, v2
13039; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
13040; GFX10-NEXT:    v_add3_u32 v5, v5, v2, 0x7fff
13041; GFX10-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc_lo
13042; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
13043; GFX10-NEXT:    v_and_or_b32 v2, 0xffff0000, v3, v2
13044; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
13045; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
13046; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13047; GFX10-NEXT:    buffer_gl1_inv
13048; GFX10-NEXT:    buffer_gl0_inv
13049; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
13050; GFX10-NEXT:    v_mov_b32_e32 v3, v2
13051; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
13052; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
13053; GFX10-NEXT:    s_cbranch_execnz .LBB52_1
13054; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
13055; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
13056; GFX10-NEXT:    s_setpc_b64 s[30:31]
13057;
13058; GFX90A-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
13059; GFX90A:       ; %bb.0:
13060; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13061; GFX90A-NEXT:    flat_load_dword v3, v[0:1] offset:2046
13062; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
13063; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
13064; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
13065; GFX90A-NEXT:    s_mov_b32 s7, 0xffff0000
13066; GFX90A-NEXT:  .LBB52_1: ; %atomicrmw.start
13067; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
13068; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13069; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
13070; GFX90A-NEXT:    v_add_f32_e32 v2, v2, v4
13071; GFX90A-NEXT:    v_bfe_u32 v5, v2, 16, 1
13072; GFX90A-NEXT:    v_or_b32_e32 v6, 0x400000, v2
13073; GFX90A-NEXT:    v_add3_u32 v5, v5, v2, s6
13074; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
13075; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
13076; GFX90A-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
13077; GFX90A-NEXT:    v_and_or_b32 v2, v3, s7, v2
13078; GFX90A-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc
13079; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13080; GFX90A-NEXT:    buffer_wbinvl1
13081; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
13082; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
13083; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
13084; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
13085; GFX90A-NEXT:    s_cbranch_execnz .LBB52_1
13086; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
13087; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
13088; GFX90A-NEXT:    s_setpc_b64 s[30:31]
13089;
13090; GFX908-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
13091; GFX908:       ; %bb.0:
13092; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13093; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2046
13094; GFX908-NEXT:    s_mov_b64 s[4:5], 0
13095; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
13096; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
13097; GFX908-NEXT:    s_mov_b32 s7, 0xffff0000
13098; GFX908-NEXT:  .LBB52_1: ; %atomicrmw.start
13099; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
13100; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13101; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
13102; GFX908-NEXT:    v_add_f32_e32 v2, v2, v4
13103; GFX908-NEXT:    v_bfe_u32 v5, v2, 16, 1
13104; GFX908-NEXT:    v_or_b32_e32 v6, 0x400000, v2
13105; GFX908-NEXT:    v_add3_u32 v5, v5, v2, s6
13106; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
13107; GFX908-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
13108; GFX908-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
13109; GFX908-NEXT:    v_and_or_b32 v2, v3, s7, v2
13110; GFX908-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc
13111; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13112; GFX908-NEXT:    buffer_wbinvl1
13113; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
13114; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
13115; GFX908-NEXT:    v_mov_b32_e32 v3, v2
13116; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
13117; GFX908-NEXT:    s_cbranch_execnz .LBB52_1
13118; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
13119; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
13120; GFX908-NEXT:    s_setpc_b64 s[30:31]
13121;
13122; GFX8-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
13123; GFX8:       ; %bb.0:
13124; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13125; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fe, v0
13126; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
13127; GFX8-NEXT:    flat_load_dword v3, v[0:1]
13128; GFX8-NEXT:    s_mov_b64 s[4:5], 0
13129; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
13130; GFX8-NEXT:  .LBB52_1: ; %atomicrmw.start
13131; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
13132; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13133; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
13134; GFX8-NEXT:    v_add_f32_e32 v2, v2, v4
13135; GFX8-NEXT:    v_bfe_u32 v6, v2, 16, 1
13136; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v2
13137; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x7fff, v6
13138; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v2
13139; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
13140; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
13141; GFX8-NEXT:    v_cndmask_b32_e32 v2, v6, v7, vcc
13142; GFX8-NEXT:    v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
13143; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
13144; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13145; GFX8-NEXT:    buffer_wbinvl1
13146; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
13147; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
13148; GFX8-NEXT:    v_mov_b32_e32 v3, v2
13149; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
13150; GFX8-NEXT:    s_cbranch_execnz .LBB52_1
13151; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
13152; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
13153; GFX8-NEXT:    s_setpc_b64 s[30:31]
13154;
13155; GFX7-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
13156; GFX7:       ; %bb.0:
13157; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13158; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fe, v0
13159; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
13160; GFX7-NEXT:    flat_load_dword v3, v[0:1]
13161; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
13162; GFX7-NEXT:    s_mov_b64 s[4:5], 0
13163; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
13164; GFX7-NEXT:  .LBB52_1: ; %atomicrmw.start
13165; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
13166; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13167; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
13168; GFX7-NEXT:    v_add_f32_e32 v2, v2, v4
13169; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
13170; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
13171; GFX7-NEXT:    v_or_b32_e32 v2, v5, v2
13172; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
13173; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13174; GFX7-NEXT:    buffer_wbinvl1
13175; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
13176; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
13177; GFX7-NEXT:    v_mov_b32_e32 v3, v2
13178; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
13179; GFX7-NEXT:    s_cbranch_execnz .LBB52_1
13180; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
13181; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
13182; GFX7-NEXT:    s_setpc_b64 s[30:31]
; Element index 1023 of a bfloat array is byte offset 2046 (0x7fe), the
; displacement that appears in the checked instructions above.
13183  %gep = getelementptr bfloat, ptr %ptr, i64 1023
; The atomicrmw result is never used and the function returns void.
13184  %unused = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
13185  ret void
13186}
13187
13188define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
13189; GFX12-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
13190; GFX12:       ; %bb.0:
13191; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
13192; GFX12-NEXT:    s_wait_expcnt 0x0
13193; GFX12-NEXT:    s_wait_samplecnt 0x0
13194; GFX12-NEXT:    s_wait_bvhcnt 0x0
13195; GFX12-NEXT:    s_wait_kmcnt 0x0
13196; GFX12-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
13197; GFX12-NEXT:    s_mov_b32 s0, 0
13198; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
13199; GFX12-NEXT:    v_and_b32_e32 v0, -4, v3
13200; GFX12-NEXT:    v_and_b32_e32 v3, 3, v3
13201; GFX12-NEXT:    flat_load_b32 v4, v[0:1]
13202; GFX12-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
13203; GFX12-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
13204; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
13205; GFX12-NEXT:    v_not_b32_e32 v6, v3
13206; GFX12-NEXT:  .LBB53_1: ; %atomicrmw.start
13207; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
13208; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
13209; GFX12-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
13210; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
13211; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
13212; GFX12-NEXT:    v_add_f32_e32 v3, v3, v2
13213; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
13214; GFX12-NEXT:    v_bfe_u32 v7, v3, 16, 1
13215; GFX12-NEXT:    v_or_b32_e32 v8, 0x400000, v3
13216; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
13217; GFX12-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
13218; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
13219; GFX12-NEXT:    v_cndmask_b32_e32 v3, v7, v8, vcc_lo
13220; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
13221; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
13222; GFX12-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
13223; GFX12-NEXT:    v_and_or_b32 v3, v4, v6, v3
13224; GFX12-NEXT:    s_wait_storecnt 0x0
13225; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
13226; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
13227; GFX12-NEXT:    global_inv scope:SCOPE_DEV
13228; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
13229; GFX12-NEXT:    v_mov_b32_e32 v4, v3
13230; GFX12-NEXT:    s_wait_alu 0xfffe
13231; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
13232; GFX12-NEXT:    s_wait_alu 0xfffe
13233; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
13234; GFX12-NEXT:    s_cbranch_execnz .LBB53_1
13235; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
13236; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
13237; GFX12-NEXT:    s_wait_alu 0xfffe
13238; GFX12-NEXT:    s_setpc_b64 s[30:31]
13239;
13240; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
13241; GFX940:       ; %bb.0:
13242; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13243; GFX940-NEXT:    v_mov_b32_e32 v3, v0
13244; GFX940-NEXT:    v_and_b32_e32 v0, -4, v3
13245; GFX940-NEXT:    flat_load_dword v5, v[0:1]
13246; GFX940-NEXT:    v_and_b32_e32 v3, 3, v3
13247; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
13248; GFX940-NEXT:    s_mov_b32 s0, 0xffff
13249; GFX940-NEXT:    v_lshlrev_b32_e64 v4, v3, s0
13250; GFX940-NEXT:    v_not_b32_e32 v6, v4
13251; GFX940-NEXT:    s_mov_b64 s[0:1], 0
13252; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
13253; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
13254; GFX940-NEXT:  .LBB53_1: ; %atomicrmw.start
13255; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
13256; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13257; GFX940-NEXT:    v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
13258; GFX940-NEXT:    s_nop 0
13259; GFX940-NEXT:    v_add_f32_e32 v4, v4, v2
13260; GFX940-NEXT:    v_bfe_u32 v7, v4, 16, 1
13261; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v4
13262; GFX940-NEXT:    v_add3_u32 v7, v7, v4, s2
13263; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
13264; GFX940-NEXT:    s_nop 1
13265; GFX940-NEXT:    v_cndmask_b32_e32 v4, v7, v8, vcc
13266; GFX940-NEXT:    v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
13267; GFX940-NEXT:    v_and_or_b32 v4, v5, v6, v4
13268; GFX940-NEXT:    buffer_wbl2 sc1
13269; GFX940-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0
13270; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13271; GFX940-NEXT:    buffer_inv sc1
13272; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
13273; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
13274; GFX940-NEXT:    v_mov_b32_e32 v5, v4
13275; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
13276; GFX940-NEXT:    s_cbranch_execnz .LBB53_1
13277; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
13278; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
13279; GFX940-NEXT:    s_setpc_b64 s[30:31]
13280;
13281; GFX11-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
13282; GFX11:       ; %bb.0:
13283; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13284; GFX11-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
13285; GFX11-NEXT:    s_mov_b32 s0, 0
13286; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
13287; GFX11-NEXT:    v_and_b32_e32 v0, -4, v3
13288; GFX11-NEXT:    v_and_b32_e32 v3, 3, v3
13289; GFX11-NEXT:    flat_load_b32 v4, v[0:1]
13290; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
13291; GFX11-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
13292; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
13293; GFX11-NEXT:    v_not_b32_e32 v6, v3
13294; GFX11-NEXT:    .p2align 6
13295; GFX11-NEXT:  .LBB53_1: ; %atomicrmw.start
13296; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
13297; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13298; GFX11-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
13299; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
13300; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
13301; GFX11-NEXT:    v_add_f32_e32 v3, v3, v2
13302; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
13303; GFX11-NEXT:    v_bfe_u32 v7, v3, 16, 1
13304; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v3
13305; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
13306; GFX11-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
13307; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
13308; GFX11-NEXT:    v_cndmask_b32_e32 v3, v7, v8, vcc_lo
13309; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
13310; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
13311; GFX11-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
13312; GFX11-NEXT:    v_and_or_b32 v3, v4, v6, v3
13313; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
13314; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
13315; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13316; GFX11-NEXT:    buffer_gl1_inv
13317; GFX11-NEXT:    buffer_gl0_inv
13318; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
13319; GFX11-NEXT:    v_mov_b32_e32 v4, v3
13320; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
13321; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
13322; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
13323; GFX11-NEXT:    s_cbranch_execnz .LBB53_1
13324; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
13325; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
13326; GFX11-NEXT:    s_setpc_b64 s[30:31]
13327;
13328; GFX10-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
13329; GFX10:       ; %bb.0:
13330; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13331; GFX10-NEXT:    v_mov_b32_e32 v3, v0
13332; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
13333; GFX10-NEXT:    s_mov_b32 s4, 0
13334; GFX10-NEXT:    v_and_b32_e32 v0, -4, v3
13335; GFX10-NEXT:    v_and_b32_e32 v3, 3, v3
13336; GFX10-NEXT:    flat_load_dword v4, v[0:1]
13337; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
13338; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
13339; GFX10-NEXT:    v_not_b32_e32 v6, v3
13340; GFX10-NEXT:  .LBB53_1: ; %atomicrmw.start
13341; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
13342; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13343; GFX10-NEXT:    v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
13344; GFX10-NEXT:    v_add_f32_e32 v3, v3, v2
13345; GFX10-NEXT:    v_bfe_u32 v7, v3, 16, 1
13346; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v3
13347; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
13348; GFX10-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
13349; GFX10-NEXT:    v_cndmask_b32_e32 v3, v7, v8, vcc_lo
13350; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
13351; GFX10-NEXT:    v_and_or_b32 v3, v4, v6, v3
13352; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
13353; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
13354; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13355; GFX10-NEXT:    buffer_gl1_inv
13356; GFX10-NEXT:    buffer_gl0_inv
13357; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
13358; GFX10-NEXT:    v_mov_b32_e32 v4, v3
13359; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
13360; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
13361; GFX10-NEXT:    s_cbranch_execnz .LBB53_1
13362; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
13363; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
13364; GFX10-NEXT:    s_setpc_b64 s[30:31]
13365;
13366; GFX90A-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
13367; GFX90A:       ; %bb.0:
13368; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13369; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
13370; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v3
13371; GFX90A-NEXT:    flat_load_dword v5, v[0:1]
13372; GFX90A-NEXT:    v_and_b32_e32 v3, 3, v3
13373; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
13374; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
13375; GFX90A-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
13376; GFX90A-NEXT:    v_not_b32_e32 v6, v4
13377; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
13378; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
13379; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
13380; GFX90A-NEXT:  .LBB53_1: ; %atomicrmw.start
13381; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
13382; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13383; GFX90A-NEXT:    v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
13384; GFX90A-NEXT:    v_add_f32_e32 v4, v4, v2
13385; GFX90A-NEXT:    v_bfe_u32 v7, v4, 16, 1
13386; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v4
13387; GFX90A-NEXT:    v_add3_u32 v7, v7, v4, s6
13388; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
13389; GFX90A-NEXT:    v_cndmask_b32_e32 v4, v7, v8, vcc
13390; GFX90A-NEXT:    v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
13391; GFX90A-NEXT:    v_and_or_b32 v4, v5, v6, v4
13392; GFX90A-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
13393; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13394; GFX90A-NEXT:    buffer_wbinvl1
13395; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
13396; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
13397; GFX90A-NEXT:    v_mov_b32_e32 v5, v4
13398; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
13399; GFX90A-NEXT:    s_cbranch_execnz .LBB53_1
13400; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
13401; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
13402; GFX90A-NEXT:    s_setpc_b64 s[30:31]
13403;
13404; GFX908-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
13405; GFX908:       ; %bb.0:
13406; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13407; GFX908-NEXT:    v_mov_b32_e32 v3, v0
13408; GFX908-NEXT:    v_and_b32_e32 v0, -4, v3
13409; GFX908-NEXT:    flat_load_dword v4, v[0:1]
13410; GFX908-NEXT:    v_and_b32_e32 v3, 3, v3
13411; GFX908-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
13412; GFX908-NEXT:    s_mov_b32 s4, 0xffff
13413; GFX908-NEXT:    v_lshlrev_b32_e64 v3, v5, s4
13414; GFX908-NEXT:    v_not_b32_e32 v6, v3
13415; GFX908-NEXT:    s_mov_b64 s[4:5], 0
13416; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
13417; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
13418; GFX908-NEXT:  .LBB53_1: ; %atomicrmw.start
13419; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
13420; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13421; GFX908-NEXT:    v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
13422; GFX908-NEXT:    v_add_f32_e32 v3, v3, v2
13423; GFX908-NEXT:    v_bfe_u32 v7, v3, 16, 1
13424; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v3
13425; GFX908-NEXT:    v_add3_u32 v7, v7, v3, s6
13426; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
13427; GFX908-NEXT:    v_cndmask_b32_e32 v3, v7, v8, vcc
13428; GFX908-NEXT:    v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
13429; GFX908-NEXT:    v_and_or_b32 v3, v4, v6, v3
13430; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
13431; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13432; GFX908-NEXT:    buffer_wbinvl1
13433; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
13434; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
13435; GFX908-NEXT:    v_mov_b32_e32 v4, v3
13436; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
13437; GFX908-NEXT:    s_cbranch_execnz .LBB53_1
13438; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
13439; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
13440; GFX908-NEXT:    s_setpc_b64 s[30:31]
13441;
13442; GFX8-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
13443; GFX8:       ; %bb.0:
13444; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13445; GFX8-NEXT:    v_mov_b32_e32 v3, v0
13446; GFX8-NEXT:    v_and_b32_e32 v0, -4, v3
13447; GFX8-NEXT:    flat_load_dword v4, v[0:1]
13448; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
13449; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
13450; GFX8-NEXT:    s_mov_b32 s4, 0xffff
13451; GFX8-NEXT:    v_lshlrev_b32_e64 v3, v5, s4
13452; GFX8-NEXT:    v_not_b32_e32 v6, v3
13453; GFX8-NEXT:    s_mov_b64 s[4:5], 0
13454; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
13455; GFX8-NEXT:  .LBB53_1: ; %atomicrmw.start
13456; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
13457; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13458; GFX8-NEXT:    v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
13459; GFX8-NEXT:    v_add_f32_e32 v3, v3, v2
13460; GFX8-NEXT:    v_bfe_u32 v8, v3, 16, 1
13461; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v3
13462; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x7fff, v8
13463; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v3
13464; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
13465; GFX8-NEXT:    v_cndmask_b32_e32 v3, v8, v9, vcc
13466; GFX8-NEXT:    v_and_b32_e32 v7, v4, v6
13467; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
13468; GFX8-NEXT:    v_or_b32_e32 v3, v7, v3
13469; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
13470; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13471; GFX8-NEXT:    buffer_wbinvl1
13472; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
13473; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
13474; GFX8-NEXT:    v_mov_b32_e32 v4, v3
13475; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
13476; GFX8-NEXT:    s_cbranch_execnz .LBB53_1
13477; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
13478; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
13479; GFX8-NEXT:    s_setpc_b64 s[30:31]
13480;
13481; GFX7-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
13482; GFX7:       ; %bb.0:
13483; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13484; GFX7-NEXT:    v_mov_b32_e32 v3, v0
13485; GFX7-NEXT:    v_and_b32_e32 v0, -4, v3
13486; GFX7-NEXT:    flat_load_dword v4, v[0:1]
13487; GFX7-NEXT:    v_and_b32_e32 v3, 3, v3
13488; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
13489; GFX7-NEXT:    v_lshl_b32_e32 v3, 0xffff, v5
13490; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
13491; GFX7-NEXT:    v_not_b32_e32 v6, v3
13492; GFX7-NEXT:    s_mov_b64 s[4:5], 0
13493; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
13494; GFX7-NEXT:  .LBB53_1: ; %atomicrmw.start
13495; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
13496; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13497; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
13498; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
13499; GFX7-NEXT:    v_add_f32_e32 v3, v3, v2
13500; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
13501; GFX7-NEXT:    v_and_b32_e32 v7, v4, v6
13502; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
13503; GFX7-NEXT:    v_or_b32_e32 v3, v7, v3
13504; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
13505; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13506; GFX7-NEXT:    buffer_wbinvl1
13507; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
13508; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
13509; GFX7-NEXT:    v_mov_b32_e32 v4, v3
13510; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
13511; GFX7-NEXT:    s_cbranch_execnz .LBB53_1
13512; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
13513; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
13514; GFX7-NEXT:    s_setpc_b64 s[30:31]
13515  %unused = atomicrmw fadd ptr %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
13516  ret void
13517}
13518
; Returning bfloat atomicrmw fadd with no syncscope (i.e. system scope) and
; seq_cst ordering, on the element 1023 bfloats past %ptr, tagged with
; !amdgpu.no.fine.grained.memory. In the expected output below, every target
; adds the byte offset 0x7fe to the pointer, masks it down to the enclosing
; 4-byte-aligned dword, and runs a flat_atomic_cmpswap retry loop on that
; dword; after the loop the old dword is shifted back down to form the return
; value.
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
13519define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
13520; GFX12-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
13521; GFX12:       ; %bb.0:
13522; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
13523; GFX12-NEXT:    s_wait_expcnt 0x0
13524; GFX12-NEXT:    s_wait_samplecnt 0x0
13525; GFX12-NEXT:    s_wait_bvhcnt 0x0
13526; GFX12-NEXT:    s_wait_kmcnt 0x0
13527; GFX12-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
13528; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
13529; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
13530; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
13531; GFX12-NEXT:    v_and_b32_e32 v0, -4, v3
13532; GFX12-NEXT:    v_and_b32_e32 v3, 3, v3
13533; GFX12-NEXT:    s_mov_b32 s0, 0
13534; GFX12-NEXT:    flat_load_b32 v5, v[0:1]
13535; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
13536; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
13537; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
13538; GFX12-NEXT:    v_not_b32_e32 v4, v4
13539; GFX12-NEXT:  .LBB54_1: ; %atomicrmw.start
13540; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
13541; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
13542; GFX12-NEXT:    v_mov_b32_e32 v6, v5
13543; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
13544; GFX12-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
13545; GFX12-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
13546; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
13547; GFX12-NEXT:    v_add_f32_e32 v5, v5, v2
13548; GFX12-NEXT:    v_bfe_u32 v7, v5, 16, 1
13549; GFX12-NEXT:    v_or_b32_e32 v8, 0x400000, v5
13550; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
13551; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
13552; GFX12-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
13553; GFX12-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc_lo
13554; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
13555; GFX12-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
13556; GFX12-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
13557; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
13558; GFX12-NEXT:    v_and_or_b32 v5, v6, v4, v5
13559; GFX12-NEXT:    global_wb scope:SCOPE_SYS
13560; GFX12-NEXT:    s_wait_storecnt 0x0
13561; GFX12-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
13562; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
13563; GFX12-NEXT:    global_inv scope:SCOPE_SYS
13564; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
13565; GFX12-NEXT:    s_wait_alu 0xfffe
13566; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
13567; GFX12-NEXT:    s_wait_alu 0xfffe
13568; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
13569; GFX12-NEXT:    s_cbranch_execnz .LBB54_1
13570; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
13571; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
13572; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
13573; GFX12-NEXT:    s_wait_alu 0xfffe
13574; GFX12-NEXT:    s_setpc_b64 s[30:31]
13575;
13576; GFX940-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
13577; GFX940:       ; %bb.0:
13578; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13579; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
13580; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
13581; GFX940-NEXT:    v_and_b32_e32 v0, -4, v4
13582; GFX940-NEXT:    v_mov_b32_e32 v1, v5
13583; GFX940-NEXT:    flat_load_dword v5, v[0:1]
13584; GFX940-NEXT:    v_and_b32_e32 v3, 3, v4
13585; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
13586; GFX940-NEXT:    s_mov_b32 s0, 0xffff
13587; GFX940-NEXT:    v_lshlrev_b32_e64 v4, v3, s0
13588; GFX940-NEXT:    v_not_b32_e32 v4, v4
13589; GFX940-NEXT:    s_mov_b64 s[0:1], 0
13590; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
13591; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
13592; GFX940-NEXT:  .LBB54_1: ; %atomicrmw.start
13593; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
13594; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13595; GFX940-NEXT:    v_mov_b32_e32 v7, v5
13596; GFX940-NEXT:    v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
13597; GFX940-NEXT:    s_nop 0
13598; GFX940-NEXT:    v_add_f32_e32 v5, v5, v2
13599; GFX940-NEXT:    v_bfe_u32 v6, v5, 16, 1
13600; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v5
13601; GFX940-NEXT:    v_add3_u32 v6, v6, v5, s2
13602; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
13603; GFX940-NEXT:    s_nop 1
13604; GFX940-NEXT:    v_cndmask_b32_e32 v5, v6, v8, vcc
13605; GFX940-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
13606; GFX940-NEXT:    v_and_or_b32 v6, v7, v4, v5
13607; GFX940-NEXT:    buffer_wbl2 sc0 sc1
13608; GFX940-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 sc1
13609; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13610; GFX940-NEXT:    buffer_inv sc0 sc1
13611; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
13612; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
13613; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
13614; GFX940-NEXT:    s_cbranch_execnz .LBB54_1
13615; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
13616; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
13617; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
13618; GFX940-NEXT:    s_setpc_b64 s[30:31]
13619;
13620; GFX11-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
13621; GFX11:       ; %bb.0:
13622; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13623; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
13624; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
13625; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
13626; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
13627; GFX11-NEXT:    v_and_b32_e32 v0, -4, v3
13628; GFX11-NEXT:    v_and_b32_e32 v3, 3, v3
13629; GFX11-NEXT:    s_mov_b32 s0, 0
13630; GFX11-NEXT:    flat_load_b32 v5, v[0:1]
13631; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
13632; GFX11-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
13633; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
13634; GFX11-NEXT:    v_not_b32_e32 v4, v4
13635; GFX11-NEXT:    .p2align 6
13636; GFX11-NEXT:  .LBB54_1: ; %atomicrmw.start
13637; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
13638; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13639; GFX11-NEXT:    v_mov_b32_e32 v6, v5
13640; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
13641; GFX11-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
13642; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
13643; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
13644; GFX11-NEXT:    v_add_f32_e32 v5, v5, v2
13645; GFX11-NEXT:    v_bfe_u32 v7, v5, 16, 1
13646; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v5
13647; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
13648; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
13649; GFX11-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
13650; GFX11-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc_lo
13651; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
13652; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
13653; GFX11-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
13654; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
13655; GFX11-NEXT:    v_and_or_b32 v5, v6, v4, v5
13656; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
13657; GFX11-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
13658; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13659; GFX11-NEXT:    buffer_gl1_inv
13660; GFX11-NEXT:    buffer_gl0_inv
13661; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
13662; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
13663; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
13664; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
13665; GFX11-NEXT:    s_cbranch_execnz .LBB54_1
13666; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
13667; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
13668; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
13669; GFX11-NEXT:    s_setpc_b64 s[30:31]
13670;
13671; GFX10-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
13672; GFX10:       ; %bb.0:
13673; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13674; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
13675; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
13676; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
13677; GFX10-NEXT:    v_and_b32_e32 v0, -4, v3
13678; GFX10-NEXT:    v_and_b32_e32 v3, 3, v3
13679; GFX10-NEXT:    s_mov_b32 s4, 0
13680; GFX10-NEXT:    flat_load_dword v5, v[0:1]
13681; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
13682; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
13683; GFX10-NEXT:    v_not_b32_e32 v4, v4
13684; GFX10-NEXT:  .LBB54_1: ; %atomicrmw.start
13685; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
13686; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13687; GFX10-NEXT:    v_mov_b32_e32 v6, v5
13688; GFX10-NEXT:    v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
13689; GFX10-NEXT:    v_add_f32_e32 v5, v5, v2
13690; GFX10-NEXT:    v_bfe_u32 v7, v5, 16, 1
13691; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v5
13692; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
13693; GFX10-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
13694; GFX10-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc_lo
13695; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
13696; GFX10-NEXT:    v_and_or_b32 v5, v6, v4, v5
13697; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
13698; GFX10-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
13699; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13700; GFX10-NEXT:    buffer_gl1_inv
13701; GFX10-NEXT:    buffer_gl0_inv
13702; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
13703; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
13704; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
13705; GFX10-NEXT:    s_cbranch_execnz .LBB54_1
13706; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
13707; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
13708; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
13709; GFX10-NEXT:    s_setpc_b64 s[30:31]
13710;
13711; GFX90A-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
13712; GFX90A:       ; %bb.0:
13713; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13714; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
13715; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
13716; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v3
13717; GFX90A-NEXT:    flat_load_dword v5, v[0:1]
13718; GFX90A-NEXT:    v_and_b32_e32 v3, 3, v3
13719; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
13720; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
13721; GFX90A-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
13722; GFX90A-NEXT:    v_not_b32_e32 v4, v4
13723; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
13724; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
13725; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
13726; GFX90A-NEXT:  .LBB54_1: ; %atomicrmw.start
13727; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
13728; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13729; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
13730; GFX90A-NEXT:    v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
13731; GFX90A-NEXT:    v_add_f32_e32 v5, v5, v2
13732; GFX90A-NEXT:    v_bfe_u32 v6, v5, 16, 1
13733; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v5
13734; GFX90A-NEXT:    v_add3_u32 v6, v6, v5, s6
13735; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
13736; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v6, v8, vcc
13737; GFX90A-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
13738; GFX90A-NEXT:    v_and_or_b32 v6, v7, v4, v5
13739; GFX90A-NEXT:    buffer_wbl2
13740; GFX90A-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
13741; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13742; GFX90A-NEXT:    buffer_invl2
13743; GFX90A-NEXT:    buffer_wbinvl1
13744; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
13745; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
13746; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
13747; GFX90A-NEXT:    s_cbranch_execnz .LBB54_1
13748; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
13749; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
13750; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
13751; GFX90A-NEXT:    s_setpc_b64 s[30:31]
13752;
13753; GFX908-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
13754; GFX908:       ; %bb.0:
13755; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13756; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
13757; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
13758; GFX908-NEXT:    v_and_b32_e32 v0, -4, v3
13759; GFX908-NEXT:    flat_load_dword v5, v[0:1]
13760; GFX908-NEXT:    v_and_b32_e32 v3, 3, v3
13761; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
13762; GFX908-NEXT:    s_mov_b32 s4, 0xffff
13763; GFX908-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
13764; GFX908-NEXT:    v_not_b32_e32 v4, v4
13765; GFX908-NEXT:    s_mov_b64 s[4:5], 0
13766; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
13767; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
13768; GFX908-NEXT:  .LBB54_1: ; %atomicrmw.start
13769; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
13770; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13771; GFX908-NEXT:    v_mov_b32_e32 v6, v5
13772; GFX908-NEXT:    v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
13773; GFX908-NEXT:    v_add_f32_e32 v5, v5, v2
13774; GFX908-NEXT:    v_bfe_u32 v7, v5, 16, 1
13775; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v5
13776; GFX908-NEXT:    v_add3_u32 v7, v7, v5, s6
13777; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
13778; GFX908-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc
13779; GFX908-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
13780; GFX908-NEXT:    v_and_or_b32 v5, v6, v4, v5
13781; GFX908-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
13782; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13783; GFX908-NEXT:    buffer_wbinvl1
13784; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
13785; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
13786; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
13787; GFX908-NEXT:    s_cbranch_execnz .LBB54_1
13788; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
13789; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
13790; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
13791; GFX908-NEXT:    s_setpc_b64 s[30:31]
13792;
13793; GFX8-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
13794; GFX8:       ; %bb.0:
13795; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13796; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
13797; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
13798; GFX8-NEXT:    v_and_b32_e32 v0, -4, v3
13799; GFX8-NEXT:    flat_load_dword v5, v[0:1]
13800; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
13801; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
13802; GFX8-NEXT:    s_mov_b32 s4, 0xffff
13803; GFX8-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
13804; GFX8-NEXT:    v_not_b32_e32 v4, v4
13805; GFX8-NEXT:    s_mov_b64 s[4:5], 0
13806; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
13807; GFX8-NEXT:  .LBB54_1: ; %atomicrmw.start
13808; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
13809; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13810; GFX8-NEXT:    v_mov_b32_e32 v6, v5
13811; GFX8-NEXT:    v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
13812; GFX8-NEXT:    v_add_f32_e32 v5, v5, v2
13813; GFX8-NEXT:    v_bfe_u32 v8, v5, 16, 1
13814; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v5
13815; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x7fff, v8
13816; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v5
13817; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
13818; GFX8-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
13819; GFX8-NEXT:    v_and_b32_e32 v7, v6, v4
13820; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
13821; GFX8-NEXT:    v_or_b32_e32 v5, v7, v5
13822; GFX8-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
13823; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13824; GFX8-NEXT:    buffer_wbinvl1
13825; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
13826; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
13827; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
13828; GFX8-NEXT:    s_cbranch_execnz .LBB54_1
13829; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
13830; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
13831; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
13832; GFX8-NEXT:    s_setpc_b64 s[30:31]
13833;
13834; GFX7-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
13835; GFX7:       ; %bb.0:
13836; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13837; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 0x7fe, v0
13838; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
13839; GFX7-NEXT:    v_and_b32_e32 v0, -4, v3
13840; GFX7-NEXT:    flat_load_dword v5, v[0:1]
13841; GFX7-NEXT:    v_and_b32_e32 v3, 3, v3
13842; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
13843; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v3
13844; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
13845; GFX7-NEXT:    v_not_b32_e32 v4, v4
13846; GFX7-NEXT:    s_mov_b64 s[4:5], 0
13847; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
13848; GFX7-NEXT:  .LBB54_1: ; %atomicrmw.start
13849; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
13850; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13851; GFX7-NEXT:    v_mov_b32_e32 v6, v5
13852; GFX7-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
13853; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
13854; GFX7-NEXT:    v_add_f32_e32 v5, v5, v2
13855; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
13856; GFX7-NEXT:    v_and_b32_e32 v7, v6, v4
13857; GFX7-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
13858; GFX7-NEXT:    v_or_b32_e32 v5, v7, v5
13859; GFX7-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
13860; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13861; GFX7-NEXT:    buffer_wbinvl1
13862; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
13863; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
13864; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
13865; GFX7-NEXT:    s_cbranch_execnz .LBB54_1
13866; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
13867; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
13868; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
13869; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
13870; GFX7-NEXT:    s_setpc_b64 s[30:31]
; 1023 bfloat elements at 2 bytes each = 2046 = 0x7fe, the constant added to
; the pointer in the expected output above.
13871  %gep = getelementptr bfloat, ptr %ptr, i64 1023
13872  %result = atomicrmw fadd ptr %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0
13873  ret bfloat %result
13874}
13875
13876define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
13877; GFX12-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
13878; GFX12:       ; %bb.0:
13879; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
13880; GFX12-NEXT:    s_wait_expcnt 0x0
13881; GFX12-NEXT:    s_wait_samplecnt 0x0
13882; GFX12-NEXT:    s_wait_bvhcnt 0x0
13883; GFX12-NEXT:    s_wait_kmcnt 0x0
13884; GFX12-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
13885; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
13886; GFX12-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
13887; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
13888; GFX12-NEXT:    v_and_b32_e32 v0, -4, v4
13889; GFX12-NEXT:    v_and_b32_e32 v4, 3, v4
13890; GFX12-NEXT:    s_mov_b32 s0, 0
13891; GFX12-NEXT:    flat_load_b32 v3, v[0:1]
13892; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
13893; GFX12-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
13894; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
13895; GFX12-NEXT:    v_not_b32_e32 v5, v5
13896; GFX12-NEXT:  .LBB55_1: ; %atomicrmw.start
13897; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
13898; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
13899; GFX12-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
13900; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
13901; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
13902; GFX12-NEXT:    v_add_f32_e32 v2, v2, v6
13903; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
13904; GFX12-NEXT:    v_bfe_u32 v7, v2, 16, 1
13905; GFX12-NEXT:    v_or_b32_e32 v8, 0x400000, v2
13906; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
13907; GFX12-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
13908; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
13909; GFX12-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc_lo
13910; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
13911; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
13912; GFX12-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
13913; GFX12-NEXT:    v_and_or_b32 v2, v3, v5, v2
13914; GFX12-NEXT:    global_wb scope:SCOPE_SYS
13915; GFX12-NEXT:    s_wait_storecnt 0x0
13916; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
13917; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
13918; GFX12-NEXT:    global_inv scope:SCOPE_SYS
13919; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
13920; GFX12-NEXT:    v_mov_b32_e32 v3, v2
13921; GFX12-NEXT:    s_wait_alu 0xfffe
13922; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
13923; GFX12-NEXT:    s_wait_alu 0xfffe
13924; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
13925; GFX12-NEXT:    s_cbranch_execnz .LBB55_1
13926; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
13927; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
13928; GFX12-NEXT:    s_wait_alu 0xfffe
13929; GFX12-NEXT:    s_setpc_b64 s[30:31]
13930;
13931; GFX940-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
13932; GFX940:       ; %bb.0:
13933; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13934; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
13935; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
13936; GFX940-NEXT:    v_and_b32_e32 v0, -4, v4
13937; GFX940-NEXT:    v_mov_b32_e32 v1, v5
13938; GFX940-NEXT:    flat_load_dword v3, v[0:1]
13939; GFX940-NEXT:    v_and_b32_e32 v4, 3, v4
13940; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
13941; GFX940-NEXT:    s_mov_b32 s0, 0xffff
13942; GFX940-NEXT:    v_lshlrev_b32_e64 v5, v4, s0
13943; GFX940-NEXT:    v_not_b32_e32 v5, v5
13944; GFX940-NEXT:    s_mov_b64 s[0:1], 0
13945; GFX940-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
13946; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
13947; GFX940-NEXT:  .LBB55_1: ; %atomicrmw.start
13948; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
13949; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13950; GFX940-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
13951; GFX940-NEXT:    s_nop 0
13952; GFX940-NEXT:    v_add_f32_e32 v2, v2, v6
13953; GFX940-NEXT:    v_bfe_u32 v7, v2, 16, 1
13954; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v2
13955; GFX940-NEXT:    v_add3_u32 v7, v7, v2, s2
13956; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
13957; GFX940-NEXT:    s_nop 1
13958; GFX940-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc
13959; GFX940-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
13960; GFX940-NEXT:    v_and_or_b32 v2, v3, v5, v2
13961; GFX940-NEXT:    buffer_wbl2 sc0 sc1
13962; GFX940-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1
13963; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13964; GFX940-NEXT:    buffer_inv sc0 sc1
13965; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
13966; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
13967; GFX940-NEXT:    v_mov_b32_e32 v3, v2
13968; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
13969; GFX940-NEXT:    s_cbranch_execnz .LBB55_1
13970; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
13971; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
13972; GFX940-NEXT:    s_setpc_b64 s[30:31]
13973;
13974; GFX11-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
13975; GFX11:       ; %bb.0:
13976; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13977; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
13978; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
13979; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
13980; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
13981; GFX11-NEXT:    v_and_b32_e32 v0, -4, v4
13982; GFX11-NEXT:    v_and_b32_e32 v4, 3, v4
13983; GFX11-NEXT:    s_mov_b32 s0, 0
13984; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
13985; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
13986; GFX11-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
13987; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
13988; GFX11-NEXT:    v_not_b32_e32 v5, v5
13989; GFX11-NEXT:    .p2align 6
13990; GFX11-NEXT:  .LBB55_1: ; %atomicrmw.start
13991; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
13992; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13993; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
13994; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
13995; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
13996; GFX11-NEXT:    v_add_f32_e32 v2, v2, v6
13997; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
13998; GFX11-NEXT:    v_bfe_u32 v7, v2, 16, 1
13999; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v2
14000; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
14001; GFX11-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
14002; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
14003; GFX11-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc_lo
14004; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
14005; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
14006; GFX11-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
14007; GFX11-NEXT:    v_and_or_b32 v2, v3, v5, v2
14008; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
14009; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
14010; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14011; GFX11-NEXT:    buffer_gl1_inv
14012; GFX11-NEXT:    buffer_gl0_inv
14013; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
14014; GFX11-NEXT:    v_mov_b32_e32 v3, v2
14015; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
14016; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
14017; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
14018; GFX11-NEXT:    s_cbranch_execnz .LBB55_1
14019; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
14020; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
14021; GFX11-NEXT:    s_setpc_b64 s[30:31]
14022;
14023; GFX10-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
14024; GFX10:       ; %bb.0:
14025; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14026; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
14027; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
14028; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
14029; GFX10-NEXT:    v_and_b32_e32 v0, -4, v4
14030; GFX10-NEXT:    v_and_b32_e32 v4, 3, v4
14031; GFX10-NEXT:    s_mov_b32 s4, 0
14032; GFX10-NEXT:    flat_load_dword v3, v[0:1]
14033; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
14034; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
14035; GFX10-NEXT:    v_not_b32_e32 v5, v5
14036; GFX10-NEXT:  .LBB55_1: ; %atomicrmw.start
14037; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
14038; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14039; GFX10-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
14040; GFX10-NEXT:    v_add_f32_e32 v2, v2, v6
14041; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
14042; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v2
14043; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
14044; GFX10-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
14045; GFX10-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc_lo
14046; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
14047; GFX10-NEXT:    v_and_or_b32 v2, v3, v5, v2
14048; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
14049; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
14050; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14051; GFX10-NEXT:    buffer_gl1_inv
14052; GFX10-NEXT:    buffer_gl0_inv
14053; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
14054; GFX10-NEXT:    v_mov_b32_e32 v3, v2
14055; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
14056; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
14057; GFX10-NEXT:    s_cbranch_execnz .LBB55_1
14058; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
14059; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
14060; GFX10-NEXT:    s_setpc_b64 s[30:31]
14061;
14062; GFX90A-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
14063; GFX90A:       ; %bb.0:
14064; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14065; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
14066; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
14067; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v4
14068; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
14069; GFX90A-NEXT:    v_and_b32_e32 v4, 3, v4
14070; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
14071; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
14072; GFX90A-NEXT:    v_lshlrev_b32_e64 v5, v4, s4
14073; GFX90A-NEXT:    v_not_b32_e32 v5, v5
14074; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
14075; GFX90A-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
14076; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
14077; GFX90A-NEXT:  .LBB55_1: ; %atomicrmw.start
14078; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
14079; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14080; GFX90A-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
14081; GFX90A-NEXT:    v_add_f32_e32 v2, v2, v6
14082; GFX90A-NEXT:    v_bfe_u32 v7, v2, 16, 1
14083; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v2
14084; GFX90A-NEXT:    v_add3_u32 v7, v7, v2, s6
14085; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
14086; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc
14087; GFX90A-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
14088; GFX90A-NEXT:    v_and_or_b32 v2, v3, v5, v2
14089; GFX90A-NEXT:    buffer_wbl2
14090; GFX90A-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
14091; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14092; GFX90A-NEXT:    buffer_invl2
14093; GFX90A-NEXT:    buffer_wbinvl1
14094; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
14095; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
14096; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
14097; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
14098; GFX90A-NEXT:    s_cbranch_execnz .LBB55_1
14099; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
14100; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
14101; GFX90A-NEXT:    s_setpc_b64 s[30:31]
14102;
14103; GFX908-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
14104; GFX908:       ; %bb.0:
14105; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14106; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
14107; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
14108; GFX908-NEXT:    v_and_b32_e32 v0, -4, v4
14109; GFX908-NEXT:    flat_load_dword v3, v[0:1]
14110; GFX908-NEXT:    v_and_b32_e32 v4, 3, v4
14111; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
14112; GFX908-NEXT:    s_mov_b32 s4, 0xffff
14113; GFX908-NEXT:    v_lshlrev_b32_e64 v5, v4, s4
14114; GFX908-NEXT:    v_not_b32_e32 v5, v5
14115; GFX908-NEXT:    s_mov_b64 s[4:5], 0
14116; GFX908-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
14117; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
14118; GFX908-NEXT:  .LBB55_1: ; %atomicrmw.start
14119; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
14120; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14121; GFX908-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
14122; GFX908-NEXT:    v_add_f32_e32 v2, v2, v6
14123; GFX908-NEXT:    v_bfe_u32 v7, v2, 16, 1
14124; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v2
14125; GFX908-NEXT:    v_add3_u32 v7, v7, v2, s6
14126; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
14127; GFX908-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc
14128; GFX908-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
14129; GFX908-NEXT:    v_and_or_b32 v2, v3, v5, v2
14130; GFX908-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
14131; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14132; GFX908-NEXT:    buffer_wbinvl1
14133; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
14134; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
14135; GFX908-NEXT:    v_mov_b32_e32 v3, v2
14136; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
14137; GFX908-NEXT:    s_cbranch_execnz .LBB55_1
14138; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
14139; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
14140; GFX908-NEXT:    s_setpc_b64 s[30:31]
14141;
14142; GFX8-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
14143; GFX8:       ; %bb.0:
14144; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14145; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fe, v0
14146; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
14147; GFX8-NEXT:    v_and_b32_e32 v0, -4, v4
14148; GFX8-NEXT:    flat_load_dword v3, v[0:1]
14149; GFX8-NEXT:    v_and_b32_e32 v4, 3, v4
14150; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
14151; GFX8-NEXT:    s_mov_b32 s4, 0xffff
14152; GFX8-NEXT:    v_lshlrev_b32_e64 v5, v4, s4
14153; GFX8-NEXT:    v_not_b32_e32 v5, v5
14154; GFX8-NEXT:    s_mov_b64 s[4:5], 0
14155; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
14156; GFX8-NEXT:  .LBB55_1: ; %atomicrmw.start
14157; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
14158; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14159; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
14160; GFX8-NEXT:    v_add_f32_e32 v2, v2, v6
14161; GFX8-NEXT:    v_bfe_u32 v8, v2, 16, 1
14162; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v2
14163; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x7fff, v8
14164; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v2
14165; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
14166; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v9, vcc
14167; GFX8-NEXT:    v_and_b32_e32 v7, v3, v5
14168; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
14169; GFX8-NEXT:    v_or_b32_e32 v2, v7, v2
14170; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
14171; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14172; GFX8-NEXT:    buffer_wbinvl1
14173; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
14174; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
14175; GFX8-NEXT:    v_mov_b32_e32 v3, v2
14176; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
14177; GFX8-NEXT:    s_cbranch_execnz .LBB55_1
14178; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
14179; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
14180; GFX8-NEXT:    s_setpc_b64 s[30:31]
14181;
14182; GFX7-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
14183; GFX7:       ; %bb.0:
14184; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14185; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
14186; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
14187; GFX7-NEXT:    v_and_b32_e32 v0, -4, v4
14188; GFX7-NEXT:    flat_load_dword v3, v[0:1]
14189; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
14190; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
14191; GFX7-NEXT:    v_lshl_b32_e32 v5, 0xffff, v4
14192; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
14193; GFX7-NEXT:    v_not_b32_e32 v5, v5
14194; GFX7-NEXT:    s_mov_b64 s[4:5], 0
14195; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
14196; GFX7-NEXT:  .LBB55_1: ; %atomicrmw.start
14197; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
14198; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14199; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
14200; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
14201; GFX7-NEXT:    v_add_f32_e32 v2, v2, v6
14202; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
14203; GFX7-NEXT:    v_and_b32_e32 v7, v3, v5
14204; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
14205; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
14206; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
14207; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14208; GFX7-NEXT:    buffer_wbinvl1
14209; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
14210; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
14211; GFX7-NEXT:    v_mov_b32_e32 v3, v2
14212; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
14213; GFX7-NEXT:    s_cbranch_execnz .LBB55_1
14214; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
14215; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
14216; GFX7-NEXT:    s_setpc_b64 s[30:31]
14217  %gep = getelementptr bfloat, ptr %ptr, i64 1023
14218  %unused = atomicrmw fadd ptr %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0
14219  ret void
14220}
14221
14222; --------------------------------------------------------------------
14223; <2 x half>
14224; --------------------------------------------------------------------
14225
; Returning atomicrmw fadd of a <2 x half> through a flat pointer at agent
; scope, seq_cst, tagged !amdgpu.no.fine.grained.memory, with no address offset.
; Per the checks below, GFX12 and GFX940 emit a single returning
; flat_atomic_pk_add_f16, while every other target loops on
; flat_atomic_cmpswap until the swap succeeds. Inside that loop GFX11, GFX10,
; GFX90A and GFX908 add with v_pk_add_f16, GFX8 adds each half with v_add_f16,
; and GFX7 converts each half to f32, uses v_add_f32, and converts back.
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
14226define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x half> %val) #0 {
14227; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory:
14228; GFX12:       ; %bb.0:
14229; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
14230; GFX12-NEXT:    s_wait_expcnt 0x0
14231; GFX12-NEXT:    s_wait_samplecnt 0x0
14232; GFX12-NEXT:    s_wait_bvhcnt 0x0
14233; GFX12-NEXT:    s_wait_kmcnt 0x0
14234; GFX12-NEXT:    s_wait_storecnt 0x0
14235; GFX12-NEXT:    flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
14236; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
14237; GFX12-NEXT:    global_inv scope:SCOPE_DEV
14238; GFX12-NEXT:    s_setpc_b64 s[30:31]
14239;
14240; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory:
14241; GFX940:       ; %bb.0:
14242; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14243; GFX940-NEXT:    buffer_wbl2 sc1
14244; GFX940-NEXT:    flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0
14245; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14246; GFX940-NEXT:    buffer_inv sc1
14247; GFX940-NEXT:    s_setpc_b64 s[30:31]
14248;
14249; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory:
14250; GFX11:       ; %bb.0:
14251; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14252; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
14253; GFX11-NEXT:    s_mov_b32 s0, 0
14254; GFX11-NEXT:  .LBB56_1: ; %atomicrmw.start
14255; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
14256; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14257; GFX11-NEXT:    v_mov_b32_e32 v4, v3
14258; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
14259; GFX11-NEXT:    v_pk_add_f16 v3, v4, v2
14260; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
14261; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
14262; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14263; GFX11-NEXT:    buffer_gl1_inv
14264; GFX11-NEXT:    buffer_gl0_inv
14265; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
14266; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
14267; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
14268; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
14269; GFX11-NEXT:    s_cbranch_execnz .LBB56_1
14270; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
14271; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
14272; GFX11-NEXT:    v_mov_b32_e32 v0, v3
14273; GFX11-NEXT:    s_setpc_b64 s[30:31]
14274;
14275; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory:
14276; GFX10:       ; %bb.0:
14277; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14278; GFX10-NEXT:    flat_load_dword v3, v[0:1]
14279; GFX10-NEXT:    s_mov_b32 s4, 0
14280; GFX10-NEXT:  .LBB56_1: ; %atomicrmw.start
14281; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
14282; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14283; GFX10-NEXT:    v_mov_b32_e32 v4, v3
14284; GFX10-NEXT:    v_pk_add_f16 v3, v4, v2
14285; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
14286; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
14287; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14288; GFX10-NEXT:    buffer_gl1_inv
14289; GFX10-NEXT:    buffer_gl0_inv
14290; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
14291; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
14292; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
14293; GFX10-NEXT:    s_cbranch_execnz .LBB56_1
14294; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
14295; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
14296; GFX10-NEXT:    v_mov_b32_e32 v0, v3
14297; GFX10-NEXT:    s_setpc_b64 s[30:31]
14298;
14299; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory:
14300; GFX90A:       ; %bb.0:
14301; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14302; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
14303; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
14304; GFX90A-NEXT:  .LBB56_1: ; %atomicrmw.start
14305; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
14306; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14307; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
14308; GFX90A-NEXT:    v_pk_add_f16 v4, v5, v2
14309; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
14310; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14311; GFX90A-NEXT:    buffer_wbinvl1
14312; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
14313; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
14314; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
14315; GFX90A-NEXT:    s_cbranch_execnz .LBB56_1
14316; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
14317; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
14318; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
14319; GFX90A-NEXT:    s_setpc_b64 s[30:31]
14320;
14321; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory:
14322; GFX908:       ; %bb.0:
14323; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14324; GFX908-NEXT:    flat_load_dword v3, v[0:1]
14325; GFX908-NEXT:    s_mov_b64 s[4:5], 0
14326; GFX908-NEXT:  .LBB56_1: ; %atomicrmw.start
14327; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
14328; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14329; GFX908-NEXT:    v_mov_b32_e32 v4, v3
14330; GFX908-NEXT:    v_pk_add_f16 v3, v4, v2
14331; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
14332; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14333; GFX908-NEXT:    buffer_wbinvl1
14334; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
14335; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
14336; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
14337; GFX908-NEXT:    s_cbranch_execnz .LBB56_1
14338; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
14339; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
14340; GFX908-NEXT:    v_mov_b32_e32 v0, v3
14341; GFX908-NEXT:    s_setpc_b64 s[30:31]
14342;
14343; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory:
14344; GFX8:       ; %bb.0:
14345; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14346; GFX8-NEXT:    flat_load_dword v3, v[0:1]
14347; GFX8-NEXT:    s_mov_b64 s[4:5], 0
14348; GFX8-NEXT:  .LBB56_1: ; %atomicrmw.start
14349; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
14350; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14351; GFX8-NEXT:    v_mov_b32_e32 v4, v3
14352; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
14353; GFX8-NEXT:    v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
14354; GFX8-NEXT:    v_add_f16_e32 v5, v4, v2
14355; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
14356; GFX8-NEXT:    v_or_b32_e32 v3, v5, v3
14357; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
14358; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14359; GFX8-NEXT:    buffer_wbinvl1
14360; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
14361; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
14362; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
14363; GFX8-NEXT:    s_cbranch_execnz .LBB56_1
14364; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
14365; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
14366; GFX8-NEXT:    v_mov_b32_e32 v0, v3
14367; GFX8-NEXT:    s_setpc_b64 s[30:31]
14368;
14369; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory:
14370; GFX7:       ; %bb.0:
14371; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14372; GFX7-NEXT:    flat_load_dword v5, v[0:1]
14373; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
14374; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v2
14375; GFX7-NEXT:    s_mov_b64 s[4:5], 0
14376; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v3
14377; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14378; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
14379; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v5
14380; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
14381; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v6
14382; GFX7-NEXT:  .LBB56_1: ; %atomicrmw.start
14383; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
14384; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
14385; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
14386; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v3
14387; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v2
14388; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
14389; GFX7-NEXT:    v_add_f32_e32 v6, v6, v4
14390; GFX7-NEXT:    v_add_f32_e32 v7, v7, v5
14391; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
14392; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v7
14393; GFX7-NEXT:    v_or_b32_e32 v7, v2, v3
14394; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
14395; GFX7-NEXT:    v_or_b32_e32 v6, v8, v2
14396; GFX7-NEXT:    flat_atomic_cmpswap v6, v[0:1], v[6:7] glc
14397; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14398; GFX7-NEXT:    buffer_wbinvl1
14399; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v6
14400; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v6
14401; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
14402; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v7
14403; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
14404; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
14405; GFX7-NEXT:    s_cbranch_execnz .LBB56_1
14406; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
14407; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
14408; GFX7-NEXT:    v_mov_b32_e32 v0, v2
14409; GFX7-NEXT:    v_mov_b32_e32 v1, v3
14410; GFX7-NEXT:    s_setpc_b64 s[30:31]
14411  %result = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
14412  ret <2 x half> %result
14413}
14414
; Returning agent-scope seq_cst atomicrmw fadd of a <2 x half> at a positive
; byte offset of 2044 (0x7fc) from %ptr, tagged !amdgpu.no.fine.grained.memory.
; Per the checks below, GFX12 and GFX940 fold offset:2044 into a single
; returning flat_atomic_pk_add_f16. GFX11, GFX90A and GFX908 fold it into the
; flat load and the flat_atomic_cmpswap of a compare-and-swap loop. GFX10, GFX8
; and GFX7 first add 0x7fc to the 64-bit address and then run the loop with no
; immediate offset.
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
14415define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x half> %val) #0 {
14416; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
14417; GFX12:       ; %bb.0:
14418; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
14419; GFX12-NEXT:    s_wait_expcnt 0x0
14420; GFX12-NEXT:    s_wait_samplecnt 0x0
14421; GFX12-NEXT:    s_wait_bvhcnt 0x0
14422; GFX12-NEXT:    s_wait_kmcnt 0x0
14423; GFX12-NEXT:    s_wait_storecnt 0x0
14424; GFX12-NEXT:    flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
14425; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
14426; GFX12-NEXT:    global_inv scope:SCOPE_DEV
14427; GFX12-NEXT:    s_setpc_b64 s[30:31]
14428;
14429; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
14430; GFX940:       ; %bb.0:
14431; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14432; GFX940-NEXT:    buffer_wbl2 sc1
14433; GFX940-NEXT:    flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 sc0
14434; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14435; GFX940-NEXT:    buffer_inv sc1
14436; GFX940-NEXT:    s_setpc_b64 s[30:31]
14437;
14438; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
14439; GFX11:       ; %bb.0:
14440; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14441; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
14442; GFX11-NEXT:    s_mov_b32 s0, 0
14443; GFX11-NEXT:  .LBB57_1: ; %atomicrmw.start
14444; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
14445; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14446; GFX11-NEXT:    v_mov_b32_e32 v4, v3
14447; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
14448; GFX11-NEXT:    v_pk_add_f16 v3, v4, v2
14449; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
14450; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
14451; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14452; GFX11-NEXT:    buffer_gl1_inv
14453; GFX11-NEXT:    buffer_gl0_inv
14454; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
14455; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
14456; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
14457; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
14458; GFX11-NEXT:    s_cbranch_execnz .LBB57_1
14459; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
14460; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
14461; GFX11-NEXT:    v_mov_b32_e32 v0, v3
14462; GFX11-NEXT:    s_setpc_b64 s[30:31]
14463;
14464; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
14465; GFX10:       ; %bb.0:
14466; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14467; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fc, v0
14468; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
14469; GFX10-NEXT:    s_mov_b32 s4, 0
14470; GFX10-NEXT:    flat_load_dword v0, v[3:4]
14471; GFX10-NEXT:  .LBB57_1: ; %atomicrmw.start
14472; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
14473; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14474; GFX10-NEXT:    v_mov_b32_e32 v1, v0
14475; GFX10-NEXT:    v_pk_add_f16 v0, v1, v2
14476; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
14477; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
14478; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14479; GFX10-NEXT:    buffer_gl1_inv
14480; GFX10-NEXT:    buffer_gl0_inv
14481; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
14482; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
14483; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
14484; GFX10-NEXT:    s_cbranch_execnz .LBB57_1
14485; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
14486; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
14487; GFX10-NEXT:    s_setpc_b64 s[30:31]
14488;
14489; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
14490; GFX90A:       ; %bb.0:
14491; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14492; GFX90A-NEXT:    flat_load_dword v3, v[0:1] offset:2044
14493; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
14494; GFX90A-NEXT:  .LBB57_1: ; %atomicrmw.start
14495; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
14496; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14497; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
14498; GFX90A-NEXT:    v_pk_add_f16 v4, v5, v2
14499; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
14500; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14501; GFX90A-NEXT:    buffer_wbinvl1
14502; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
14503; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
14504; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
14505; GFX90A-NEXT:    s_cbranch_execnz .LBB57_1
14506; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
14507; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
14508; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
14509; GFX90A-NEXT:    s_setpc_b64 s[30:31]
14510;
14511; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
14512; GFX908:       ; %bb.0:
14513; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14514; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2044
14515; GFX908-NEXT:    s_mov_b64 s[4:5], 0
14516; GFX908-NEXT:  .LBB57_1: ; %atomicrmw.start
14517; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
14518; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14519; GFX908-NEXT:    v_mov_b32_e32 v4, v3
14520; GFX908-NEXT:    v_pk_add_f16 v3, v4, v2
14521; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
14522; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14523; GFX908-NEXT:    buffer_wbinvl1
14524; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
14525; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
14526; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
14527; GFX908-NEXT:    s_cbranch_execnz .LBB57_1
14528; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
14529; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
14530; GFX908-NEXT:    v_mov_b32_e32 v0, v3
14531; GFX908-NEXT:    s_setpc_b64 s[30:31]
14532;
14533; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
14534; GFX8:       ; %bb.0:
14535; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14536; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
14537; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
14538; GFX8-NEXT:    flat_load_dword v0, v[3:4]
14539; GFX8-NEXT:    s_mov_b64 s[4:5], 0
14540; GFX8-NEXT:  .LBB57_1: ; %atomicrmw.start
14541; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
14542; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14543; GFX8-NEXT:    v_mov_b32_e32 v1, v0
14544; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
14545; GFX8-NEXT:    v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
14546; GFX8-NEXT:    v_add_f16_e32 v5, v1, v2
14547; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
14548; GFX8-NEXT:    v_or_b32_e32 v0, v5, v0
14549; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
14550; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14551; GFX8-NEXT:    buffer_wbinvl1
14552; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
14553; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
14554; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
14555; GFX8-NEXT:    s_cbranch_execnz .LBB57_1
14556; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
14557; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
14558; GFX8-NEXT:    s_setpc_b64 s[30:31]
14559;
14560; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
14561; GFX7:       ; %bb.0:
14562; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14563; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fc, v0
14564; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
14565; GFX7-NEXT:    flat_load_dword v1, v[4:5]
14566; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v3
14567; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v2
14568; GFX7-NEXT:    s_mov_b64 s[4:5], 0
14569; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v0
14570; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
14571; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14572; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v1
14573; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
14574; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
14575; GFX7-NEXT:  .LBB57_1: ; %atomicrmw.start
14576; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
14577; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
14578; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
14579; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v1
14580; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v0
14581; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
14582; GFX7-NEXT:    v_add_f32_e32 v6, v6, v2
14583; GFX7-NEXT:    v_add_f32_e32 v7, v7, v3
14584; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
14585; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v7
14586; GFX7-NEXT:    v_or_b32_e32 v7, v0, v1
14587; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
14588; GFX7-NEXT:    v_or_b32_e32 v6, v8, v0
14589; GFX7-NEXT:    flat_atomic_cmpswap v6, v[4:5], v[6:7] glc
14590; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14591; GFX7-NEXT:    buffer_wbinvl1
14592; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
14593; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v6
14594; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
14595; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v7
14596; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
14597; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
14598; GFX7-NEXT:    s_cbranch_execnz .LBB57_1
14599; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
14600; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
14601; GFX7-NEXT:    s_setpc_b64 s[30:31]
; Index 511 of <2 x half> lands 2044 bytes past %ptr, matching the offset:2044
; and 0x7fc immediates in the checks above.
14602  %gep = getelementptr <2 x half>, ptr %ptr, i64 511
14603  %result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
14604  ret <2 x half> %result
14605}
14606
14607define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x half> %val) #0 {
14608; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
14609; GFX12:       ; %bb.0:
14610; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
14611; GFX12-NEXT:    s_wait_expcnt 0x0
14612; GFX12-NEXT:    s_wait_samplecnt 0x0
14613; GFX12-NEXT:    s_wait_bvhcnt 0x0
14614; GFX12-NEXT:    s_wait_kmcnt 0x0
14615; GFX12-NEXT:    s_wait_storecnt 0x0
14616; GFX12-NEXT:    flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
14617; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
14618; GFX12-NEXT:    global_inv scope:SCOPE_DEV
14619; GFX12-NEXT:    s_setpc_b64 s[30:31]
14620;
14621; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
14622; GFX940:       ; %bb.0:
14623; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14624; GFX940-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
14625; GFX940-NEXT:    s_nop 1
14626; GFX940-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
14627; GFX940-NEXT:    buffer_wbl2 sc1
14628; GFX940-NEXT:    flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0
14629; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14630; GFX940-NEXT:    buffer_inv sc1
14631; GFX940-NEXT:    s_setpc_b64 s[30:31]
14632;
14633; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
14634; GFX11:       ; %bb.0:
14635; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14636; GFX11-NEXT:    v_mov_b32_e32 v3, v0
14637; GFX11-NEXT:    s_mov_b32 s0, 0
14638; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
14639; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
14640; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
14641; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
14642; GFX11-NEXT:    flat_load_b32 v0, v[4:5]
14643; GFX11-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
14644; GFX11-NEXT:  .LBB58_1: ; %atomicrmw.start
14645; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
14646; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14647; GFX11-NEXT:    v_mov_b32_e32 v1, v0
14648; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
14649; GFX11-NEXT:    v_pk_add_f16 v0, v1, v2
14650; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
14651; GFX11-NEXT:    flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc
14652; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14653; GFX11-NEXT:    buffer_gl1_inv
14654; GFX11-NEXT:    buffer_gl0_inv
14655; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
14656; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
14657; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
14658; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
14659; GFX11-NEXT:    s_cbranch_execnz .LBB58_1
14660; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
14661; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
14662; GFX11-NEXT:    s_setpc_b64 s[30:31]
14663;
14664; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
14665; GFX10:       ; %bb.0:
14666; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14667; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
14668; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
14669; GFX10-NEXT:    s_mov_b32 s4, 0
14670; GFX10-NEXT:    flat_load_dword v0, v[3:4]
14671; GFX10-NEXT:  .LBB58_1: ; %atomicrmw.start
14672; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
14673; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14674; GFX10-NEXT:    v_mov_b32_e32 v1, v0
14675; GFX10-NEXT:    v_pk_add_f16 v0, v1, v2
14676; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
14677; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
14678; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14679; GFX10-NEXT:    buffer_gl1_inv
14680; GFX10-NEXT:    buffer_gl0_inv
14681; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
14682; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
14683; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
14684; GFX10-NEXT:    s_cbranch_execnz .LBB58_1
14685; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
14686; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
14687; GFX10-NEXT:    s_setpc_b64 s[30:31]
14688;
14689; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
14690; GFX90A:       ; %bb.0:
14691; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14692; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
14693; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
14694; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
14695; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
14696; GFX90A-NEXT:    flat_load_dword v0, v[0:1]
14697; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
14698; GFX90A-NEXT:  .LBB58_1: ; %atomicrmw.start
14699; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
14700; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14701; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
14702; GFX90A-NEXT:    v_pk_add_f16 v0, v1, v2
14703; GFX90A-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
14704; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14705; GFX90A-NEXT:    buffer_wbinvl1
14706; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
14707; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
14708; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
14709; GFX90A-NEXT:    s_cbranch_execnz .LBB58_1
14710; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
14711; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
14712; GFX90A-NEXT:    s_setpc_b64 s[30:31]
14713;
14714; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
14715; GFX908:       ; %bb.0:
14716; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14717; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
14718; GFX908-NEXT:    v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
14719; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
14720; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
14721; GFX908-NEXT:    flat_load_dword v0, v[0:1]
14722; GFX908-NEXT:    s_mov_b64 s[4:5], 0
14723; GFX908-NEXT:  .LBB58_1: ; %atomicrmw.start
14724; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
14725; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14726; GFX908-NEXT:    v_mov_b32_e32 v1, v0
14727; GFX908-NEXT:    v_pk_add_f16 v0, v1, v2
14728; GFX908-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
14729; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14730; GFX908-NEXT:    buffer_wbinvl1
14731; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
14732; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
14733; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
14734; GFX908-NEXT:    s_cbranch_execnz .LBB58_1
14735; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
14736; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
14737; GFX908-NEXT:    s_setpc_b64 s[30:31]
14738;
14739; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
14740; GFX8:       ; %bb.0:
14741; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14742; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
14743; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, -1, v1, vcc
14744; GFX8-NEXT:    flat_load_dword v0, v[3:4]
14745; GFX8-NEXT:    s_mov_b64 s[4:5], 0
14746; GFX8-NEXT:  .LBB58_1: ; %atomicrmw.start
14747; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
14748; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14749; GFX8-NEXT:    v_mov_b32_e32 v1, v0
14750; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
14751; GFX8-NEXT:    v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
14752; GFX8-NEXT:    v_add_f16_e32 v5, v1, v2
14753; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
14754; GFX8-NEXT:    v_or_b32_e32 v0, v5, v0
14755; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
14756; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14757; GFX8-NEXT:    buffer_wbinvl1
14758; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
14759; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
14760; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
14761; GFX8-NEXT:    s_cbranch_execnz .LBB58_1
14762; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
14763; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
14764; GFX8-NEXT:    s_setpc_b64 s[30:31]
14765;
14766; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
14767; GFX7:       ; %bb.0:
14768; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14769; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
14770; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, -1, v1, vcc
14771; GFX7-NEXT:    flat_load_dword v1, v[4:5]
14772; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v3
14773; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v2
14774; GFX7-NEXT:    s_mov_b64 s[4:5], 0
14775; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v0
14776; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
14777; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14778; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v1
14779; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
14780; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
14781; GFX7-NEXT:  .LBB58_1: ; %atomicrmw.start
14782; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
14783; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
14784; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
14785; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v1
14786; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v0
14787; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
14788; GFX7-NEXT:    v_add_f32_e32 v6, v6, v2
14789; GFX7-NEXT:    v_add_f32_e32 v7, v7, v3
14790; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
14791; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v7
14792; GFX7-NEXT:    v_or_b32_e32 v7, v0, v1
14793; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
14794; GFX7-NEXT:    v_or_b32_e32 v6, v8, v0
14795; GFX7-NEXT:    flat_atomic_cmpswap v6, v[4:5], v[6:7] glc
14796; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14797; GFX7-NEXT:    buffer_wbinvl1
14798; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
14799; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v6
14800; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
14801; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v7
14802; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
14803; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
14804; GFX7-NEXT:    s_cbranch_execnz .LBB58_1
14805; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
14806; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
14807; GFX7-NEXT:    s_setpc_b64 s[30:31]
14808  %gep = getelementptr <2 x half>, ptr %ptr, i64 -512
14809  %result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
14810  ret <2 x half> %result
14811}
14812
; No-return <2 x half> atomicrmw fadd directly on the incoming flat pointer
; (no address offset), with agent syncscope, seq_cst ordering and
; !amdgpu.no.fine.grained.memory attached.
; In the expectations below, the gfx1200 and gfx940 runs match a single
; flat_atomic_pk_add_f16, while all other runs match a flat_atomic_cmpswap
; retry loop (%atomicrmw.start). The hawaii run performs the half adds through
; f32 conversions (v_cvt_* / v_add_f32).
; These assertions are autogenerated (see the note on the first line of the
; file); regenerate them with the script rather than editing by hand.
14813define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x half> %val) #0 {
14814; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory:
14815; GFX12:       ; %bb.0:
14816; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
14817; GFX12-NEXT:    s_wait_expcnt 0x0
14818; GFX12-NEXT:    s_wait_samplecnt 0x0
14819; GFX12-NEXT:    s_wait_bvhcnt 0x0
14820; GFX12-NEXT:    s_wait_kmcnt 0x0
14821; GFX12-NEXT:    s_wait_storecnt 0x0
14822; GFX12-NEXT:    flat_atomic_pk_add_f16 v[0:1], v2 scope:SCOPE_DEV
14823; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
14824; GFX12-NEXT:    global_inv scope:SCOPE_DEV
14825; GFX12-NEXT:    s_setpc_b64 s[30:31]
14826;
14827; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory:
14828; GFX940:       ; %bb.0:
14829; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14830; GFX940-NEXT:    buffer_wbl2 sc1
14831; GFX940-NEXT:    flat_atomic_pk_add_f16 v[0:1], v2
14832; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14833; GFX940-NEXT:    buffer_inv sc1
14834; GFX940-NEXT:    s_setpc_b64 s[30:31]
14835;
14836; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory:
14837; GFX11:       ; %bb.0:
14838; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14839; GFX11-NEXT:    flat_load_b32 v4, v[0:1]
14840; GFX11-NEXT:    s_mov_b32 s0, 0
14841; GFX11-NEXT:  .LBB59_1: ; %atomicrmw.start
14842; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
14843; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14844; GFX11-NEXT:    v_pk_add_f16 v3, v4, v2
14845; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
14846; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
14847; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14848; GFX11-NEXT:    buffer_gl1_inv
14849; GFX11-NEXT:    buffer_gl0_inv
14850; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
14851; GFX11-NEXT:    v_mov_b32_e32 v4, v3
14852; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
14853; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
14854; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
14855; GFX11-NEXT:    s_cbranch_execnz .LBB59_1
14856; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
14857; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
14858; GFX11-NEXT:    s_setpc_b64 s[30:31]
14859;
14860; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory:
14861; GFX10:       ; %bb.0:
14862; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14863; GFX10-NEXT:    flat_load_dword v4, v[0:1]
14864; GFX10-NEXT:    s_mov_b32 s4, 0
14865; GFX10-NEXT:  .LBB59_1: ; %atomicrmw.start
14866; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
14867; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14868; GFX10-NEXT:    v_pk_add_f16 v3, v4, v2
14869; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
14870; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
14871; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14872; GFX10-NEXT:    buffer_gl1_inv
14873; GFX10-NEXT:    buffer_gl0_inv
14874; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
14875; GFX10-NEXT:    v_mov_b32_e32 v4, v3
14876; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
14877; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
14878; GFX10-NEXT:    s_cbranch_execnz .LBB59_1
14879; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
14880; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
14881; GFX10-NEXT:    s_setpc_b64 s[30:31]
14882;
14883; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory:
14884; GFX90A:       ; %bb.0:
14885; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14886; GFX90A-NEXT:    flat_load_dword v5, v[0:1]
14887; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
14888; GFX90A-NEXT:  .LBB59_1: ; %atomicrmw.start
14889; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
14890; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14891; GFX90A-NEXT:    v_pk_add_f16 v4, v5, v2
14892; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
14893; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14894; GFX90A-NEXT:    buffer_wbinvl1
14895; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
14896; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
14897; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
14898; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
14899; GFX90A-NEXT:    s_cbranch_execnz .LBB59_1
14900; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
14901; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
14902; GFX90A-NEXT:    s_setpc_b64 s[30:31]
14903;
14904; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory:
14905; GFX908:       ; %bb.0:
14906; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14907; GFX908-NEXT:    flat_load_dword v4, v[0:1]
14908; GFX908-NEXT:    s_mov_b64 s[4:5], 0
14909; GFX908-NEXT:  .LBB59_1: ; %atomicrmw.start
14910; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
14911; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14912; GFX908-NEXT:    v_pk_add_f16 v3, v4, v2
14913; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
14914; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14915; GFX908-NEXT:    buffer_wbinvl1
14916; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
14917; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
14918; GFX908-NEXT:    v_mov_b32_e32 v4, v3
14919; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
14920; GFX908-NEXT:    s_cbranch_execnz .LBB59_1
14921; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
14922; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
14923; GFX908-NEXT:    s_setpc_b64 s[30:31]
14924;
14925; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory:
14926; GFX8:       ; %bb.0:
14927; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14928; GFX8-NEXT:    flat_load_dword v4, v[0:1]
14929; GFX8-NEXT:    s_mov_b64 s[4:5], 0
14930; GFX8-NEXT:  .LBB59_1: ; %atomicrmw.start
14931; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
14932; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14933; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
14934; GFX8-NEXT:    v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
14935; GFX8-NEXT:    v_add_f16_e32 v5, v4, v2
14936; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
14937; GFX8-NEXT:    v_or_b32_e32 v3, v5, v3
14938; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
14939; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14940; GFX8-NEXT:    buffer_wbinvl1
14941; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
14942; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
14943; GFX8-NEXT:    v_mov_b32_e32 v4, v3
14944; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
14945; GFX8-NEXT:    s_cbranch_execnz .LBB59_1
14946; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
14947; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
14948; GFX8-NEXT:    s_setpc_b64 s[30:31]
14949;
14950; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory:
14951; GFX7:       ; %bb.0:
14952; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14953; GFX7-NEXT:    flat_load_dword v5, v[0:1]
14954; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
14955; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v2
14956; GFX7-NEXT:    s_mov_b64 s[4:5], 0
14957; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v3
14958; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14959; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
14960; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v5
14961; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v3
14962; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v6
14963; GFX7-NEXT:  .LBB59_1: ; %atomicrmw.start
14964; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
14965; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
14966; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
14967; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v5
14968; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v4
14969; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
14970; GFX7-NEXT:    v_add_f32_e32 v6, v6, v2
14971; GFX7-NEXT:    v_add_f32_e32 v7, v7, v3
14972; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v6
14973; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
14974; GFX7-NEXT:    v_or_b32_e32 v6, v4, v5
14975; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
14976; GFX7-NEXT:    v_or_b32_e32 v5, v7, v4
14977; GFX7-NEXT:    flat_atomic_cmpswap v7, v[0:1], v[5:6] glc
14978; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14979; GFX7-NEXT:    buffer_wbinvl1
14980; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
14981; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v7
14982; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
14983; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
14984; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
14985; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
14986; GFX7-NEXT:    s_cbranch_execnz .LBB59_1
14987; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
14988; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
14989; GFX7-NEXT:    s_setpc_b64 s[30:31]
; The atomicrmw result is bound to %unused and never read before ret void.
14990  %unused = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
14991  ret void
14992}
14993
; No-return <2 x half> atomicrmw fadd (agent syncscope, seq_cst,
; !amdgpu.no.fine.grained.memory) through a GEP of +511 <2 x half> elements;
; the expectations below show that displacement as offset:2044 or 0x7fc.
; The gfx1200, gfx940, gfx1100, gfx90a and gfx908 runs carry offset:2044 on the
; flat memory instructions, whereas the gfx1010, tonga and hawaii runs add
; 0x7fc to the 64-bit pointer with an explicit add/add-with-carry pair first.
; The gfx1200 and gfx940 runs match a single flat_atomic_pk_add_f16; all other
; runs match a flat_atomic_cmpswap retry loop (%atomicrmw.start).
; These assertions are autogenerated (see the note on the first line of the
; file); regenerate them with the script rather than editing by hand.
14994define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x half> %val) #0 {
14995; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
14996; GFX12:       ; %bb.0:
14997; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
14998; GFX12-NEXT:    s_wait_expcnt 0x0
14999; GFX12-NEXT:    s_wait_samplecnt 0x0
15000; GFX12-NEXT:    s_wait_bvhcnt 0x0
15001; GFX12-NEXT:    s_wait_kmcnt 0x0
15002; GFX12-NEXT:    s_wait_storecnt 0x0
15003; GFX12-NEXT:    flat_atomic_pk_add_f16 v[0:1], v2 offset:2044 scope:SCOPE_DEV
15004; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
15005; GFX12-NEXT:    global_inv scope:SCOPE_DEV
15006; GFX12-NEXT:    s_setpc_b64 s[30:31]
15007;
15008; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
15009; GFX940:       ; %bb.0:
15010; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15011; GFX940-NEXT:    buffer_wbl2 sc1
15012; GFX940-NEXT:    flat_atomic_pk_add_f16 v[0:1], v2 offset:2044
15013; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15014; GFX940-NEXT:    buffer_inv sc1
15015; GFX940-NEXT:    s_setpc_b64 s[30:31]
15016;
15017; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
15018; GFX11:       ; %bb.0:
15019; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15020; GFX11-NEXT:    flat_load_b32 v4, v[0:1] offset:2044
15021; GFX11-NEXT:    s_mov_b32 s0, 0
15022; GFX11-NEXT:  .LBB60_1: ; %atomicrmw.start
15023; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
15024; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15025; GFX11-NEXT:    v_pk_add_f16 v3, v4, v2
15026; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
15027; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
15028; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15029; GFX11-NEXT:    buffer_gl1_inv
15030; GFX11-NEXT:    buffer_gl0_inv
15031; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
15032; GFX11-NEXT:    v_mov_b32_e32 v4, v3
15033; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
15034; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
15035; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
15036; GFX11-NEXT:    s_cbranch_execnz .LBB60_1
15037; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
15038; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
15039; GFX11-NEXT:    s_setpc_b64 s[30:31]
15040;
15041; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
15042; GFX10:       ; %bb.0:
15043; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15044; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
15045; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
15046; GFX10-NEXT:    s_mov_b32 s4, 0
15047; GFX10-NEXT:    flat_load_dword v4, v[0:1]
15048; GFX10-NEXT:  .LBB60_1: ; %atomicrmw.start
15049; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
15050; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15051; GFX10-NEXT:    v_pk_add_f16 v3, v4, v2
15052; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
15053; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
15054; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15055; GFX10-NEXT:    buffer_gl1_inv
15056; GFX10-NEXT:    buffer_gl0_inv
15057; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
15058; GFX10-NEXT:    v_mov_b32_e32 v4, v3
15059; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
15060; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
15061; GFX10-NEXT:    s_cbranch_execnz .LBB60_1
15062; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
15063; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
15064; GFX10-NEXT:    s_setpc_b64 s[30:31]
15065;
15066; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
15067; GFX90A:       ; %bb.0:
15068; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15069; GFX90A-NEXT:    flat_load_dword v5, v[0:1] offset:2044
15070; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
15071; GFX90A-NEXT:  .LBB60_1: ; %atomicrmw.start
15072; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
15073; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15074; GFX90A-NEXT:    v_pk_add_f16 v4, v5, v2
15075; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
15076; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15077; GFX90A-NEXT:    buffer_wbinvl1
15078; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
15079; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
15080; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
15081; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
15082; GFX90A-NEXT:    s_cbranch_execnz .LBB60_1
15083; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
15084; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
15085; GFX90A-NEXT:    s_setpc_b64 s[30:31]
15086;
15087; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
15088; GFX908:       ; %bb.0:
15089; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15090; GFX908-NEXT:    flat_load_dword v4, v[0:1] offset:2044
15091; GFX908-NEXT:    s_mov_b64 s[4:5], 0
15092; GFX908-NEXT:  .LBB60_1: ; %atomicrmw.start
15093; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
15094; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15095; GFX908-NEXT:    v_pk_add_f16 v3, v4, v2
15096; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
15097; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15098; GFX908-NEXT:    buffer_wbinvl1
15099; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
15100; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
15101; GFX908-NEXT:    v_mov_b32_e32 v4, v3
15102; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
15103; GFX908-NEXT:    s_cbranch_execnz .LBB60_1
15104; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
15105; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
15106; GFX908-NEXT:    s_setpc_b64 s[30:31]
15107;
15108; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
15109; GFX8:       ; %bb.0:
15110; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15111; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
15112; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
15113; GFX8-NEXT:    flat_load_dword v4, v[0:1]
15114; GFX8-NEXT:    s_mov_b64 s[4:5], 0
15115; GFX8-NEXT:  .LBB60_1: ; %atomicrmw.start
15116; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
15117; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15118; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
15119; GFX8-NEXT:    v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
15120; GFX8-NEXT:    v_add_f16_e32 v5, v4, v2
15121; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
15122; GFX8-NEXT:    v_or_b32_e32 v3, v5, v3
15123; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
15124; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15125; GFX8-NEXT:    buffer_wbinvl1
15126; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
15127; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
15128; GFX8-NEXT:    v_mov_b32_e32 v4, v3
15129; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
15130; GFX8-NEXT:    s_cbranch_execnz .LBB60_1
15131; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
15132; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
15133; GFX8-NEXT:    s_setpc_b64 s[30:31]
15134;
15135; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
15136; GFX7:       ; %bb.0:
15137; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15138; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
15139; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
15140; GFX7-NEXT:    flat_load_dword v5, v[0:1]
15141; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
15142; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v2
15143; GFX7-NEXT:    s_mov_b64 s[4:5], 0
15144; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v3
15145; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15146; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
15147; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v5
15148; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v3
15149; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v6
15150; GFX7-NEXT:  .LBB60_1: ; %atomicrmw.start
15151; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
15152; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
15153; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
15154; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v5
15155; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v4
15156; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
15157; GFX7-NEXT:    v_add_f32_e32 v6, v6, v2
15158; GFX7-NEXT:    v_add_f32_e32 v7, v7, v3
15159; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v6
15160; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
15161; GFX7-NEXT:    v_or_b32_e32 v6, v4, v5
15162; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
15163; GFX7-NEXT:    v_or_b32_e32 v5, v7, v4
15164; GFX7-NEXT:    flat_atomic_cmpswap v7, v[0:1], v[5:6] glc
15165; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15166; GFX7-NEXT:    buffer_wbinvl1
15167; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
15168; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v7
15169; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
15170; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
15171; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
15172; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
15173; GFX7-NEXT:    s_cbranch_execnz .LBB60_1
15174; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
15175; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
15176; GFX7-NEXT:    s_setpc_b64 s[30:31]
; Index 511 of <2 x half> elements; the atomicrmw result (%unused) is never read.
15177  %gep = getelementptr <2 x half>, ptr %ptr, i64 511
15178  %unused = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
15179  ret void
15180}
15181
15182define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x half> %val) #0 {
15183; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
15184; GFX12:       ; %bb.0:
15185; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
15186; GFX12-NEXT:    s_wait_expcnt 0x0
15187; GFX12-NEXT:    s_wait_samplecnt 0x0
15188; GFX12-NEXT:    s_wait_bvhcnt 0x0
15189; GFX12-NEXT:    s_wait_kmcnt 0x0
15190; GFX12-NEXT:    s_wait_storecnt 0x0
15191; GFX12-NEXT:    flat_atomic_pk_add_f16 v[0:1], v2 offset:-2048 scope:SCOPE_DEV
15192; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
15193; GFX12-NEXT:    global_inv scope:SCOPE_DEV
15194; GFX12-NEXT:    s_setpc_b64 s[30:31]
15195;
15196; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
15197; GFX940:       ; %bb.0:
15198; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15199; GFX940-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
15200; GFX940-NEXT:    s_nop 1
15201; GFX940-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
15202; GFX940-NEXT:    buffer_wbl2 sc1
15203; GFX940-NEXT:    flat_atomic_pk_add_f16 v[0:1], v2
15204; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15205; GFX940-NEXT:    buffer_inv sc1
15206; GFX940-NEXT:    s_setpc_b64 s[30:31]
15207;
15208; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
15209; GFX11:       ; %bb.0:
15210; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15211; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
15212; GFX11-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
15213; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
15214; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
15215; GFX11-NEXT:    flat_load_b32 v4, v[3:4]
15216; GFX11-NEXT:    s_mov_b32 s0, 0
15217; GFX11-NEXT:  .LBB61_1: ; %atomicrmw.start
15218; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
15219; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15220; GFX11-NEXT:    v_pk_add_f16 v3, v4, v2
15221; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
15222; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
15223; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15224; GFX11-NEXT:    buffer_gl1_inv
15225; GFX11-NEXT:    buffer_gl0_inv
15226; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
15227; GFX11-NEXT:    v_mov_b32_e32 v4, v3
15228; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
15229; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
15230; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
15231; GFX11-NEXT:    s_cbranch_execnz .LBB61_1
15232; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
15233; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
15234; GFX11-NEXT:    s_setpc_b64 s[30:31]
15235;
15236; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
15237; GFX10:       ; %bb.0:
15238; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15239; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
15240; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
15241; GFX10-NEXT:    s_mov_b32 s4, 0
15242; GFX10-NEXT:    flat_load_dword v4, v[0:1]
15243; GFX10-NEXT:  .LBB61_1: ; %atomicrmw.start
15244; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
15245; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15246; GFX10-NEXT:    v_pk_add_f16 v3, v4, v2
15247; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
15248; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
15249; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15250; GFX10-NEXT:    buffer_gl1_inv
15251; GFX10-NEXT:    buffer_gl0_inv
15252; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
15253; GFX10-NEXT:    v_mov_b32_e32 v4, v3
15254; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
15255; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
15256; GFX10-NEXT:    s_cbranch_execnz .LBB61_1
15257; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
15258; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
15259; GFX10-NEXT:    s_setpc_b64 s[30:31]
15260;
15261; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
15262; GFX90A:       ; %bb.0:
15263; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15264; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
15265; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
15266; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
15267; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
15268; GFX90A-NEXT:    flat_load_dword v1, v[0:1]
15269; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
15270; GFX90A-NEXT:  .LBB61_1: ; %atomicrmw.start
15271; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
15272; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15273; GFX90A-NEXT:    v_pk_add_f16 v0, v1, v2
15274; GFX90A-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
15275; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15276; GFX90A-NEXT:    buffer_wbinvl1
15277; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
15278; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
15279; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
15280; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
15281; GFX90A-NEXT:    s_cbranch_execnz .LBB61_1
15282; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
15283; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
15284; GFX90A-NEXT:    s_setpc_b64 s[30:31]
15285;
15286; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
15287; GFX908:       ; %bb.0:
15288; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15289; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
15290; GFX908-NEXT:    v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
15291; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
15292; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
15293; GFX908-NEXT:    flat_load_dword v1, v[0:1]
15294; GFX908-NEXT:    s_mov_b64 s[4:5], 0
15295; GFX908-NEXT:  .LBB61_1: ; %atomicrmw.start
15296; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
15297; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15298; GFX908-NEXT:    v_pk_add_f16 v0, v1, v2
15299; GFX908-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
15300; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15301; GFX908-NEXT:    buffer_wbinvl1
15302; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
15303; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
15304; GFX908-NEXT:    v_mov_b32_e32 v1, v0
15305; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
15306; GFX908-NEXT:    s_cbranch_execnz .LBB61_1
15307; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
15308; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
15309; GFX908-NEXT:    s_setpc_b64 s[30:31]
15310;
15311; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
15312; GFX8:       ; %bb.0:
15313; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15314; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
15315; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
15316; GFX8-NEXT:    flat_load_dword v4, v[0:1]
15317; GFX8-NEXT:    s_mov_b64 s[4:5], 0
15318; GFX8-NEXT:  .LBB61_1: ; %atomicrmw.start
15319; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
15320; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15321; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
15322; GFX8-NEXT:    v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
15323; GFX8-NEXT:    v_add_f16_e32 v5, v4, v2
15324; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
15325; GFX8-NEXT:    v_or_b32_e32 v3, v5, v3
15326; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
15327; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15328; GFX8-NEXT:    buffer_wbinvl1
15329; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
15330; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
15331; GFX8-NEXT:    v_mov_b32_e32 v4, v3
15332; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
15333; GFX8-NEXT:    s_cbranch_execnz .LBB61_1
15334; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
15335; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
15336; GFX8-NEXT:    s_setpc_b64 s[30:31]
15337;
15338; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
15339; GFX7:       ; %bb.0:
15340; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15341; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0xfffff800, v0
15342; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
15343; GFX7-NEXT:    flat_load_dword v5, v[0:1]
15344; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
15345; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v2
15346; GFX7-NEXT:    s_mov_b64 s[4:5], 0
15347; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v3
15348; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15349; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
15350; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v5
15351; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v3
15352; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v6
15353; GFX7-NEXT:  .LBB61_1: ; %atomicrmw.start
15354; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
15355; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
15356; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
15357; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v5
15358; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v4
15359; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
15360; GFX7-NEXT:    v_add_f32_e32 v6, v6, v2
15361; GFX7-NEXT:    v_add_f32_e32 v7, v7, v3
15362; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v6
15363; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
15364; GFX7-NEXT:    v_or_b32_e32 v6, v4, v5
15365; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
15366; GFX7-NEXT:    v_or_b32_e32 v5, v7, v4
15367; GFX7-NEXT:    flat_atomic_cmpswap v7, v[0:1], v[5:6] glc
15368; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15369; GFX7-NEXT:    buffer_wbinvl1
15370; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
15371; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v7
15372; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
15373; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
15374; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
15375; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
15376; GFX7-NEXT:    s_cbranch_execnz .LBB61_1
15377; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
15378; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
15379; GFX7-NEXT:    s_setpc_b64 s[30:31]
15380  %gep = getelementptr <2 x half>, ptr %ptr, i64 -512
15381  %unused = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
15382  ret void
15383}
15384
; Value-returning seq_cst fadd on <2 x half> at element index 511 of %ptr.
; The atomicrmw has no syncscope (the GFX12 checks show SCOPE_SYS) and is
; tagged !amdgpu.no.fine.grained.memory.
; Codegen expected by the assertions below:
;   - GFX12 and GFX940 select one returning flat_atomic_pk_add_f16 with the
;     byte offset folded in as offset:2044.
;   - GFX11, GFX10, GFX90A, GFX908, GFX8 and GFX7 expand to a
;     flat_atomic_cmpswap retry loop (%atomicrmw.start). GFX10, GFX8 and GFX7
;     add 0x7fc to the pointer explicitly instead of using an immediate offset.
;   - GFX7 does each half add in f32 (v_cvt_f32_f16, v_add_f32, v_cvt_f16_f32).
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them instead of editing by hand.
15385define <2 x half> @flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x half> %val) #0 {
15386; GFX12-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
15387; GFX12:       ; %bb.0:
15388; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
15389; GFX12-NEXT:    s_wait_expcnt 0x0
15390; GFX12-NEXT:    s_wait_samplecnt 0x0
15391; GFX12-NEXT:    s_wait_bvhcnt 0x0
15392; GFX12-NEXT:    s_wait_kmcnt 0x0
15393; GFX12-NEXT:    global_wb scope:SCOPE_SYS
15394; GFX12-NEXT:    s_wait_storecnt 0x0
15395; GFX12-NEXT:    flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
15396; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
15397; GFX12-NEXT:    global_inv scope:SCOPE_SYS
15398; GFX12-NEXT:    s_setpc_b64 s[30:31]
15399;
15400; GFX940-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
15401; GFX940:       ; %bb.0:
15402; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15403; GFX940-NEXT:    buffer_wbl2 sc0 sc1
15404; GFX940-NEXT:    flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 sc0 sc1
15405; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15406; GFX940-NEXT:    buffer_inv sc0 sc1
15407; GFX940-NEXT:    s_setpc_b64 s[30:31]
15408;
15409; GFX11-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
15410; GFX11:       ; %bb.0:
15411; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15412; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
15413; GFX11-NEXT:    s_mov_b32 s0, 0
15414; GFX11-NEXT:  .LBB62_1: ; %atomicrmw.start
15415; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
15416; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15417; GFX11-NEXT:    v_mov_b32_e32 v4, v3
15418; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
15419; GFX11-NEXT:    v_pk_add_f16 v3, v4, v2
15420; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
15421; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
15422; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15423; GFX11-NEXT:    buffer_gl1_inv
15424; GFX11-NEXT:    buffer_gl0_inv
15425; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
15426; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
15427; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
15428; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
15429; GFX11-NEXT:    s_cbranch_execnz .LBB62_1
15430; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
15431; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
15432; GFX11-NEXT:    v_mov_b32_e32 v0, v3
15433; GFX11-NEXT:    s_setpc_b64 s[30:31]
15434;
15435; GFX10-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
15436; GFX10:       ; %bb.0:
15437; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15438; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fc, v0
15439; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
15440; GFX10-NEXT:    s_mov_b32 s4, 0
15441; GFX10-NEXT:    flat_load_dword v0, v[3:4]
15442; GFX10-NEXT:  .LBB62_1: ; %atomicrmw.start
15443; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
15444; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15445; GFX10-NEXT:    v_mov_b32_e32 v1, v0
15446; GFX10-NEXT:    v_pk_add_f16 v0, v1, v2
15447; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
15448; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
15449; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15450; GFX10-NEXT:    buffer_gl1_inv
15451; GFX10-NEXT:    buffer_gl0_inv
15452; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
15453; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
15454; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
15455; GFX10-NEXT:    s_cbranch_execnz .LBB62_1
15456; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
15457; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
15458; GFX10-NEXT:    s_setpc_b64 s[30:31]
15459;
15460; GFX90A-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
15461; GFX90A:       ; %bb.0:
15462; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15463; GFX90A-NEXT:    flat_load_dword v3, v[0:1] offset:2044
15464; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
15465; GFX90A-NEXT:  .LBB62_1: ; %atomicrmw.start
15466; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
15467; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15468; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
15469; GFX90A-NEXT:    v_pk_add_f16 v4, v5, v2
15470; GFX90A-NEXT:    buffer_wbl2
15471; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
15472; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15473; GFX90A-NEXT:    buffer_invl2
15474; GFX90A-NEXT:    buffer_wbinvl1
15475; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
15476; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
15477; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
15478; GFX90A-NEXT:    s_cbranch_execnz .LBB62_1
15479; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
15480; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
15481; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
15482; GFX90A-NEXT:    s_setpc_b64 s[30:31]
15483;
15484; GFX908-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
15485; GFX908:       ; %bb.0:
15486; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15487; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2044
15488; GFX908-NEXT:    s_mov_b64 s[4:5], 0
15489; GFX908-NEXT:  .LBB62_1: ; %atomicrmw.start
15490; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
15491; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15492; GFX908-NEXT:    v_mov_b32_e32 v4, v3
15493; GFX908-NEXT:    v_pk_add_f16 v3, v4, v2
15494; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
15495; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15496; GFX908-NEXT:    buffer_wbinvl1
15497; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
15498; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
15499; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
15500; GFX908-NEXT:    s_cbranch_execnz .LBB62_1
15501; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
15502; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
15503; GFX908-NEXT:    v_mov_b32_e32 v0, v3
15504; GFX908-NEXT:    s_setpc_b64 s[30:31]
15505;
15506; GFX8-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
15507; GFX8:       ; %bb.0:
15508; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15509; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
15510; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
15511; GFX8-NEXT:    flat_load_dword v0, v[3:4]
15512; GFX8-NEXT:    s_mov_b64 s[4:5], 0
15513; GFX8-NEXT:  .LBB62_1: ; %atomicrmw.start
15514; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
15515; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15516; GFX8-NEXT:    v_mov_b32_e32 v1, v0
15517; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
15518; GFX8-NEXT:    v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
15519; GFX8-NEXT:    v_add_f16_e32 v5, v1, v2
15520; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
15521; GFX8-NEXT:    v_or_b32_e32 v0, v5, v0
15522; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
15523; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15524; GFX8-NEXT:    buffer_wbinvl1
15525; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
15526; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
15527; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
15528; GFX8-NEXT:    s_cbranch_execnz .LBB62_1
15529; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
15530; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
15531; GFX8-NEXT:    s_setpc_b64 s[30:31]
15532;
15533; GFX7-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
15534; GFX7:       ; %bb.0:
15535; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15536; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fc, v0
15537; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
15538; GFX7-NEXT:    flat_load_dword v1, v[4:5]
15539; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v3
15540; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v2
15541; GFX7-NEXT:    s_mov_b64 s[4:5], 0
15542; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v0
15543; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
15544; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15545; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v1
15546; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
15547; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
15548; GFX7-NEXT:  .LBB62_1: ; %atomicrmw.start
15549; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
15550; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
15551; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
15552; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v1
15553; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v0
15554; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
15555; GFX7-NEXT:    v_add_f32_e32 v6, v6, v2
15556; GFX7-NEXT:    v_add_f32_e32 v7, v7, v3
15557; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
15558; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v7
15559; GFX7-NEXT:    v_or_b32_e32 v7, v0, v1
15560; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
15561; GFX7-NEXT:    v_or_b32_e32 v6, v8, v0
15562; GFX7-NEXT:    flat_atomic_cmpswap v6, v[4:5], v[6:7] glc
15563; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15564; GFX7-NEXT:    buffer_wbinvl1
15565; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
15566; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v6
15567; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
15568; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v7
15569; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
15570; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
15571; GFX7-NEXT:    s_cbranch_execnz .LBB62_1
15572; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
15573; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
15574; GFX7-NEXT:    s_setpc_b64 s[30:31]
; <2 x half> is 4 bytes, so element index 511 is byte offset 2044 (0x7fc).
15575  %gep = getelementptr <2 x half>, ptr %ptr, i64 511
15576  %result = atomicrmw fadd ptr %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0
15577  ret <2 x half> %result
15578}
15579
; seq_cst fadd on <2 x half> at element index 511 of %ptr whose result
; (%unused) is discarded; the function returns void. The atomicrmw has no
; syncscope (the GFX12 checks show SCOPE_SYS) and is tagged
; !amdgpu.no.fine.grained.memory.
; Codegen expected by the assertions below:
;   - GFX12 and GFX940 select one flat_atomic_pk_add_f16 in its no-return
;     form (no destination VGPR), with the byte offset folded in as offset:2044.
;   - GFX11, GFX10, GFX90A, GFX908, GFX8 and GFX7 expand to a
;     flat_atomic_cmpswap retry loop (%atomicrmw.start). GFX10, GFX8 and GFX7
;     add 0x7fc to the pointer explicitly instead of using an immediate offset.
;   - GFX7 does each half add in f32 (v_cvt_f32_f16, v_add_f32, v_cvt_f16_f32).
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them instead of editing by hand.
15580define void @flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x half> %val) #0 {
15581; GFX12-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
15582; GFX12:       ; %bb.0:
15583; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
15584; GFX12-NEXT:    s_wait_expcnt 0x0
15585; GFX12-NEXT:    s_wait_samplecnt 0x0
15586; GFX12-NEXT:    s_wait_bvhcnt 0x0
15587; GFX12-NEXT:    s_wait_kmcnt 0x0
15588; GFX12-NEXT:    global_wb scope:SCOPE_SYS
15589; GFX12-NEXT:    s_wait_storecnt 0x0
15590; GFX12-NEXT:    flat_atomic_pk_add_f16 v[0:1], v2 offset:2044 scope:SCOPE_SYS
15591; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
15592; GFX12-NEXT:    global_inv scope:SCOPE_SYS
15593; GFX12-NEXT:    s_setpc_b64 s[30:31]
15594;
15595; GFX940-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
15596; GFX940:       ; %bb.0:
15597; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15598; GFX940-NEXT:    buffer_wbl2 sc0 sc1
15599; GFX940-NEXT:    flat_atomic_pk_add_f16 v[0:1], v2 offset:2044 sc1
15600; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15601; GFX940-NEXT:    buffer_inv sc0 sc1
15602; GFX940-NEXT:    s_setpc_b64 s[30:31]
15603;
15604; GFX11-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
15605; GFX11:       ; %bb.0:
15606; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15607; GFX11-NEXT:    flat_load_b32 v4, v[0:1] offset:2044
15608; GFX11-NEXT:    s_mov_b32 s0, 0
15609; GFX11-NEXT:  .LBB63_1: ; %atomicrmw.start
15610; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
15611; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15612; GFX11-NEXT:    v_pk_add_f16 v3, v4, v2
15613; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
15614; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
15615; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15616; GFX11-NEXT:    buffer_gl1_inv
15617; GFX11-NEXT:    buffer_gl0_inv
15618; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
15619; GFX11-NEXT:    v_mov_b32_e32 v4, v3
15620; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
15621; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
15622; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
15623; GFX11-NEXT:    s_cbranch_execnz .LBB63_1
15624; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
15625; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
15626; GFX11-NEXT:    s_setpc_b64 s[30:31]
15627;
15628; GFX10-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
15629; GFX10:       ; %bb.0:
15630; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15631; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
15632; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
15633; GFX10-NEXT:    s_mov_b32 s4, 0
15634; GFX10-NEXT:    flat_load_dword v4, v[0:1]
15635; GFX10-NEXT:  .LBB63_1: ; %atomicrmw.start
15636; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
15637; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15638; GFX10-NEXT:    v_pk_add_f16 v3, v4, v2
15639; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
15640; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
15641; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15642; GFX10-NEXT:    buffer_gl1_inv
15643; GFX10-NEXT:    buffer_gl0_inv
15644; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
15645; GFX10-NEXT:    v_mov_b32_e32 v4, v3
15646; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
15647; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
15648; GFX10-NEXT:    s_cbranch_execnz .LBB63_1
15649; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
15650; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
15651; GFX10-NEXT:    s_setpc_b64 s[30:31]
15652;
15653; GFX90A-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
15654; GFX90A:       ; %bb.0:
15655; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15656; GFX90A-NEXT:    flat_load_dword v5, v[0:1] offset:2044
15657; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
15658; GFX90A-NEXT:  .LBB63_1: ; %atomicrmw.start
15659; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
15660; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15661; GFX90A-NEXT:    v_pk_add_f16 v4, v5, v2
15662; GFX90A-NEXT:    buffer_wbl2
15663; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
15664; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15665; GFX90A-NEXT:    buffer_invl2
15666; GFX90A-NEXT:    buffer_wbinvl1
15667; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
15668; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
15669; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
15670; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
15671; GFX90A-NEXT:    s_cbranch_execnz .LBB63_1
15672; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
15673; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
15674; GFX90A-NEXT:    s_setpc_b64 s[30:31]
15675;
15676; GFX908-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
15677; GFX908:       ; %bb.0:
15678; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15679; GFX908-NEXT:    flat_load_dword v4, v[0:1] offset:2044
15680; GFX908-NEXT:    s_mov_b64 s[4:5], 0
15681; GFX908-NEXT:  .LBB63_1: ; %atomicrmw.start
15682; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
15683; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15684; GFX908-NEXT:    v_pk_add_f16 v3, v4, v2
15685; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
15686; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15687; GFX908-NEXT:    buffer_wbinvl1
15688; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
15689; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
15690; GFX908-NEXT:    v_mov_b32_e32 v4, v3
15691; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
15692; GFX908-NEXT:    s_cbranch_execnz .LBB63_1
15693; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
15694; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
15695; GFX908-NEXT:    s_setpc_b64 s[30:31]
15696;
15697; GFX8-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
15698; GFX8:       ; %bb.0:
15699; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15700; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
15701; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
15702; GFX8-NEXT:    flat_load_dword v4, v[0:1]
15703; GFX8-NEXT:    s_mov_b64 s[4:5], 0
15704; GFX8-NEXT:  .LBB63_1: ; %atomicrmw.start
15705; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
15706; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15707; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
15708; GFX8-NEXT:    v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
15709; GFX8-NEXT:    v_add_f16_e32 v5, v4, v2
15710; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
15711; GFX8-NEXT:    v_or_b32_e32 v3, v5, v3
15712; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
15713; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15714; GFX8-NEXT:    buffer_wbinvl1
15715; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
15716; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
15717; GFX8-NEXT:    v_mov_b32_e32 v4, v3
15718; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
15719; GFX8-NEXT:    s_cbranch_execnz .LBB63_1
15720; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
15721; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
15722; GFX8-NEXT:    s_setpc_b64 s[30:31]
15723;
15724; GFX7-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
15725; GFX7:       ; %bb.0:
15726; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15727; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
15728; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
15729; GFX7-NEXT:    flat_load_dword v5, v[0:1]
15730; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
15731; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v2
15732; GFX7-NEXT:    s_mov_b64 s[4:5], 0
15733; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v3
15734; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15735; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
15736; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v5
15737; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v3
15738; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v6
15739; GFX7-NEXT:  .LBB63_1: ; %atomicrmw.start
15740; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
15741; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
15742; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
15743; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v5
15744; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v4
15745; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
15746; GFX7-NEXT:    v_add_f32_e32 v6, v6, v2
15747; GFX7-NEXT:    v_add_f32_e32 v7, v7, v3
15748; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v6
15749; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
15750; GFX7-NEXT:    v_or_b32_e32 v6, v4, v5
15751; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
15752; GFX7-NEXT:    v_or_b32_e32 v5, v7, v4
15753; GFX7-NEXT:    flat_atomic_cmpswap v7, v[0:1], v[5:6] glc
15754; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15755; GFX7-NEXT:    buffer_wbinvl1
15756; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
15757; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v7
15758; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
15759; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
15760; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
15761; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
15762; GFX7-NEXT:    s_cbranch_execnz .LBB63_1
15763; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
15764; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
15765; GFX7-NEXT:    s_setpc_b64 s[30:31]
; <2 x half> is 4 bytes, so element index 511 is byte offset 2044 (0x7fc).
15766  %gep = getelementptr <2 x half>, ptr %ptr, i64 511
15767  %unused = atomicrmw fadd ptr %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0
15768  ret void
15769}
15770
; Value-returning seq_cst fadd on <2 x half> directly at %ptr (no offset),
; with syncscope("agent") and tagged !amdgpu.no.remote.memory.
; Codegen expected by the assertions below:
;   - GFX12 selects one returning flat_atomic_pk_add_f16 with scope:SCOPE_DEV;
;     GFX940 selects the same instruction with sc0.
;   - GFX11, GFX10, GFX90A, GFX908, GFX8 and GFX7 expand to a
;     flat_atomic_cmpswap retry loop (%atomicrmw.start).
;   - GFX7 does each half add in f32 (v_cvt_f32_f16, v_add_f32, v_cvt_f16_f32)
;     and copies the two f32-converted elements of the loaded value to v0 and v1
;     before returning.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them instead of editing by hand.
15771define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(ptr %ptr, <2 x half> %val) #0 {
15772; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory:
15773; GFX12:       ; %bb.0:
15774; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
15775; GFX12-NEXT:    s_wait_expcnt 0x0
15776; GFX12-NEXT:    s_wait_samplecnt 0x0
15777; GFX12-NEXT:    s_wait_bvhcnt 0x0
15778; GFX12-NEXT:    s_wait_kmcnt 0x0
15779; GFX12-NEXT:    s_wait_storecnt 0x0
15780; GFX12-NEXT:    flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
15781; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
15782; GFX12-NEXT:    global_inv scope:SCOPE_DEV
15783; GFX12-NEXT:    s_setpc_b64 s[30:31]
15784;
15785; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory:
15786; GFX940:       ; %bb.0:
15787; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15788; GFX940-NEXT:    buffer_wbl2 sc1
15789; GFX940-NEXT:    flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0
15790; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15791; GFX940-NEXT:    buffer_inv sc1
15792; GFX940-NEXT:    s_setpc_b64 s[30:31]
15793;
15794; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory:
15795; GFX11:       ; %bb.0:
15796; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15797; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
15798; GFX11-NEXT:    s_mov_b32 s0, 0
15799; GFX11-NEXT:  .LBB64_1: ; %atomicrmw.start
15800; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
15801; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15802; GFX11-NEXT:    v_mov_b32_e32 v4, v3
15803; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
15804; GFX11-NEXT:    v_pk_add_f16 v3, v4, v2
15805; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
15806; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
15807; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15808; GFX11-NEXT:    buffer_gl1_inv
15809; GFX11-NEXT:    buffer_gl0_inv
15810; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
15811; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
15812; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
15813; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
15814; GFX11-NEXT:    s_cbranch_execnz .LBB64_1
15815; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
15816; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
15817; GFX11-NEXT:    v_mov_b32_e32 v0, v3
15818; GFX11-NEXT:    s_setpc_b64 s[30:31]
15819;
15820; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory:
15821; GFX10:       ; %bb.0:
15822; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15823; GFX10-NEXT:    flat_load_dword v3, v[0:1]
15824; GFX10-NEXT:    s_mov_b32 s4, 0
15825; GFX10-NEXT:  .LBB64_1: ; %atomicrmw.start
15826; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
15827; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15828; GFX10-NEXT:    v_mov_b32_e32 v4, v3
15829; GFX10-NEXT:    v_pk_add_f16 v3, v4, v2
15830; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
15831; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
15832; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15833; GFX10-NEXT:    buffer_gl1_inv
15834; GFX10-NEXT:    buffer_gl0_inv
15835; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
15836; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
15837; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
15838; GFX10-NEXT:    s_cbranch_execnz .LBB64_1
15839; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
15840; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
15841; GFX10-NEXT:    v_mov_b32_e32 v0, v3
15842; GFX10-NEXT:    s_setpc_b64 s[30:31]
15843;
15844; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory:
15845; GFX90A:       ; %bb.0:
15846; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15847; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
15848; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
15849; GFX90A-NEXT:  .LBB64_1: ; %atomicrmw.start
15850; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
15851; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15852; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
15853; GFX90A-NEXT:    v_pk_add_f16 v4, v5, v2
15854; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
15855; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15856; GFX90A-NEXT:    buffer_wbinvl1
15857; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
15858; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
15859; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
15860; GFX90A-NEXT:    s_cbranch_execnz .LBB64_1
15861; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
15862; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
15863; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
15864; GFX90A-NEXT:    s_setpc_b64 s[30:31]
15865;
15866; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory:
15867; GFX908:       ; %bb.0:
15868; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15869; GFX908-NEXT:    flat_load_dword v3, v[0:1]
15870; GFX908-NEXT:    s_mov_b64 s[4:5], 0
15871; GFX908-NEXT:  .LBB64_1: ; %atomicrmw.start
15872; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
15873; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15874; GFX908-NEXT:    v_mov_b32_e32 v4, v3
15875; GFX908-NEXT:    v_pk_add_f16 v3, v4, v2
15876; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
15877; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15878; GFX908-NEXT:    buffer_wbinvl1
15879; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
15880; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
15881; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
15882; GFX908-NEXT:    s_cbranch_execnz .LBB64_1
15883; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
15884; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
15885; GFX908-NEXT:    v_mov_b32_e32 v0, v3
15886; GFX908-NEXT:    s_setpc_b64 s[30:31]
15887;
15888; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory:
15889; GFX8:       ; %bb.0:
15890; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15891; GFX8-NEXT:    flat_load_dword v3, v[0:1]
15892; GFX8-NEXT:    s_mov_b64 s[4:5], 0
15893; GFX8-NEXT:  .LBB64_1: ; %atomicrmw.start
15894; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
15895; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15896; GFX8-NEXT:    v_mov_b32_e32 v4, v3
15897; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
15898; GFX8-NEXT:    v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
15899; GFX8-NEXT:    v_add_f16_e32 v5, v4, v2
15900; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
15901; GFX8-NEXT:    v_or_b32_e32 v3, v5, v3
15902; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
15903; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15904; GFX8-NEXT:    buffer_wbinvl1
15905; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
15906; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
15907; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
15908; GFX8-NEXT:    s_cbranch_execnz .LBB64_1
15909; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
15910; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
15911; GFX8-NEXT:    v_mov_b32_e32 v0, v3
15912; GFX8-NEXT:    s_setpc_b64 s[30:31]
15913;
15914; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory:
15915; GFX7:       ; %bb.0:
15916; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15917; GFX7-NEXT:    flat_load_dword v5, v[0:1]
15918; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
15919; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v2
15920; GFX7-NEXT:    s_mov_b64 s[4:5], 0
15921; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v3
15922; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15923; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
15924; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v5
15925; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
15926; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v6
15927; GFX7-NEXT:  .LBB64_1: ; %atomicrmw.start
15928; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
15929; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
15930; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
15931; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v3
15932; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v2
15933; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
15934; GFX7-NEXT:    v_add_f32_e32 v6, v6, v4
15935; GFX7-NEXT:    v_add_f32_e32 v7, v7, v5
15936; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
15937; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v7
15938; GFX7-NEXT:    v_or_b32_e32 v7, v2, v3
15939; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
15940; GFX7-NEXT:    v_or_b32_e32 v6, v8, v2
15941; GFX7-NEXT:    flat_atomic_cmpswap v6, v[0:1], v[6:7] glc
15942; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15943; GFX7-NEXT:    buffer_wbinvl1
15944; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v6
15945; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v6
15946; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
15947; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v7
15948; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
15949; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
15950; GFX7-NEXT:    s_cbranch_execnz .LBB64_1
15951; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
15952; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
15953; GFX7-NEXT:    v_mov_b32_e32 v0, v2
15954; GFX7-NEXT:    v_mov_b32_e32 v1, v3
15955; GFX7-NEXT:    s_setpc_b64 s[30:31]
15956  %result = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
15957  ret <2 x half> %result
15958}
15959
; Flat-pointer, agent-scope, seq_cst `atomicrmw fadd` of <2 x half> whose
; result is discarded, tagged only with !amdgpu.no.remote.memory.
; Per the autogenerated checks below, gfx1200 and gfx940 select a native
; flat_atomic_pk_add_f16 with no destination register, while gfx1100, gfx1010,
; gfx90a, gfx908, tonga and hawaii expand to a flat load followed by a
; flat_atomic_cmpswap retry loop. In the hawaii loop each half is widened to
; f32 for the add and narrowed back before the compare-and-swap.
; The check lines come from utils/update_llc_test_checks.py; regenerate them
; rather than editing them by hand.
15960define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr %ptr, <2 x half> %val) #0 {
15961; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory:
15962; GFX12:       ; %bb.0:
15963; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
15964; GFX12-NEXT:    s_wait_expcnt 0x0
15965; GFX12-NEXT:    s_wait_samplecnt 0x0
15966; GFX12-NEXT:    s_wait_bvhcnt 0x0
15967; GFX12-NEXT:    s_wait_kmcnt 0x0
15968; GFX12-NEXT:    s_wait_storecnt 0x0
15969; GFX12-NEXT:    flat_atomic_pk_add_f16 v[0:1], v2 scope:SCOPE_DEV
15970; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
15971; GFX12-NEXT:    global_inv scope:SCOPE_DEV
15972; GFX12-NEXT:    s_setpc_b64 s[30:31]
15973;
15974; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory:
15975; GFX940:       ; %bb.0:
15976; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15977; GFX940-NEXT:    buffer_wbl2 sc1
15978; GFX940-NEXT:    flat_atomic_pk_add_f16 v[0:1], v2
15979; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15980; GFX940-NEXT:    buffer_inv sc1
15981; GFX940-NEXT:    s_setpc_b64 s[30:31]
15982;
15983; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory:
15984; GFX11:       ; %bb.0:
15985; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15986; GFX11-NEXT:    flat_load_b32 v4, v[0:1]
15987; GFX11-NEXT:    s_mov_b32 s0, 0
15988; GFX11-NEXT:  .LBB65_1: ; %atomicrmw.start
15989; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
15990; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15991; GFX11-NEXT:    v_pk_add_f16 v3, v4, v2
15992; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
15993; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
15994; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15995; GFX11-NEXT:    buffer_gl1_inv
15996; GFX11-NEXT:    buffer_gl0_inv
15997; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
15998; GFX11-NEXT:    v_mov_b32_e32 v4, v3
15999; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
16000; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
16001; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
16002; GFX11-NEXT:    s_cbranch_execnz .LBB65_1
16003; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
16004; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
16005; GFX11-NEXT:    s_setpc_b64 s[30:31]
16006;
16007; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory:
16008; GFX10:       ; %bb.0:
16009; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16010; GFX10-NEXT:    flat_load_dword v4, v[0:1]
16011; GFX10-NEXT:    s_mov_b32 s4, 0
16012; GFX10-NEXT:  .LBB65_1: ; %atomicrmw.start
16013; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
16014; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16015; GFX10-NEXT:    v_pk_add_f16 v3, v4, v2
16016; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
16017; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
16018; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16019; GFX10-NEXT:    buffer_gl1_inv
16020; GFX10-NEXT:    buffer_gl0_inv
16021; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
16022; GFX10-NEXT:    v_mov_b32_e32 v4, v3
16023; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
16024; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
16025; GFX10-NEXT:    s_cbranch_execnz .LBB65_1
16026; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
16027; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
16028; GFX10-NEXT:    s_setpc_b64 s[30:31]
16029;
16030; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory:
16031; GFX90A:       ; %bb.0:
16032; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16033; GFX90A-NEXT:    flat_load_dword v5, v[0:1]
16034; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
16035; GFX90A-NEXT:  .LBB65_1: ; %atomicrmw.start
16036; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
16037; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16038; GFX90A-NEXT:    v_pk_add_f16 v4, v5, v2
16039; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
16040; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16041; GFX90A-NEXT:    buffer_wbinvl1
16042; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
16043; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
16044; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
16045; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
16046; GFX90A-NEXT:    s_cbranch_execnz .LBB65_1
16047; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
16048; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
16049; GFX90A-NEXT:    s_setpc_b64 s[30:31]
16050;
16051; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory:
16052; GFX908:       ; %bb.0:
16053; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16054; GFX908-NEXT:    flat_load_dword v4, v[0:1]
16055; GFX908-NEXT:    s_mov_b64 s[4:5], 0
16056; GFX908-NEXT:  .LBB65_1: ; %atomicrmw.start
16057; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
16058; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16059; GFX908-NEXT:    v_pk_add_f16 v3, v4, v2
16060; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
16061; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16062; GFX908-NEXT:    buffer_wbinvl1
16063; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
16064; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
16065; GFX908-NEXT:    v_mov_b32_e32 v4, v3
16066; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
16067; GFX908-NEXT:    s_cbranch_execnz .LBB65_1
16068; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
16069; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
16070; GFX908-NEXT:    s_setpc_b64 s[30:31]
16071;
16072; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory:
16073; GFX8:       ; %bb.0:
16074; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16075; GFX8-NEXT:    flat_load_dword v4, v[0:1]
16076; GFX8-NEXT:    s_mov_b64 s[4:5], 0
16077; GFX8-NEXT:  .LBB65_1: ; %atomicrmw.start
16078; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
16079; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16080; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
16081; GFX8-NEXT:    v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
16082; GFX8-NEXT:    v_add_f16_e32 v5, v4, v2
16083; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
16084; GFX8-NEXT:    v_or_b32_e32 v3, v5, v3
16085; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
16086; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16087; GFX8-NEXT:    buffer_wbinvl1
16088; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
16089; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
16090; GFX8-NEXT:    v_mov_b32_e32 v4, v3
16091; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
16092; GFX8-NEXT:    s_cbranch_execnz .LBB65_1
16093; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
16094; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
16095; GFX8-NEXT:    s_setpc_b64 s[30:31]
16096;
16097; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory:
16098; GFX7:       ; %bb.0:
16099; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16100; GFX7-NEXT:    flat_load_dword v5, v[0:1]
16101; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
16102; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v2
16103; GFX7-NEXT:    s_mov_b64 s[4:5], 0
16104; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v3
16105; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16106; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
16107; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v5
16108; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v3
16109; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v6
16110; GFX7-NEXT:  .LBB65_1: ; %atomicrmw.start
16111; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
16112; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
16113; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
16114; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v5
16115; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v4
16116; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
16117; GFX7-NEXT:    v_add_f32_e32 v6, v6, v2
16118; GFX7-NEXT:    v_add_f32_e32 v7, v7, v3
16119; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v6
16120; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
16121; GFX7-NEXT:    v_or_b32_e32 v6, v4, v5
16122; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
16123; GFX7-NEXT:    v_or_b32_e32 v5, v7, v4
16124; GFX7-NEXT:    flat_atomic_cmpswap v7, v[0:1], v[5:6] glc
16125; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16126; GFX7-NEXT:    buffer_wbinvl1
16127; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
16128; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v7
16129; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
16130; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
16131; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
16132; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
16133; GFX7-NEXT:    s_cbranch_execnz .LBB65_1
16134; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
16135; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
16136; GFX7-NEXT:    s_setpc_b64 s[30:31]
; The result is bound to %unused and never read, so this function covers the
; no-return form of the atomic.
16137  %unused = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
16138  ret void
16139}
16140
; Returning variant: flat-pointer, agent-scope, seq_cst `atomicrmw fadd` of
; <2 x half> tagged with both !amdgpu.no.fine.grained.memory and
; !amdgpu.no.remote.memory; the value produced by the atomicrmw is returned.
; Per the autogenerated checks below, gfx1200 selects flat_atomic_pk_add_f16
; with th:TH_ATOMIC_RETURN and gfx940 selects it with sc0, both writing v0.
; gfx1100, gfx1010, gfx90a, gfx908 and tonga expand to a flat load plus a
; flat_atomic_cmpswap retry loop and copy the last value read into v0.
; hawaii uses the same loop shape but widens each half to f32 for the add and
; hands the result back as two registers, v0 and v1.
; The check lines come from utils/update_llc_test_checks.py; regenerate them
; rather than editing them by hand.
16141define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr %ptr, <2 x half> %val) #0 {
16142; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
16143; GFX12:       ; %bb.0:
16144; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
16145; GFX12-NEXT:    s_wait_expcnt 0x0
16146; GFX12-NEXT:    s_wait_samplecnt 0x0
16147; GFX12-NEXT:    s_wait_bvhcnt 0x0
16148; GFX12-NEXT:    s_wait_kmcnt 0x0
16149; GFX12-NEXT:    s_wait_storecnt 0x0
16150; GFX12-NEXT:    flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
16151; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
16152; GFX12-NEXT:    global_inv scope:SCOPE_DEV
16153; GFX12-NEXT:    s_setpc_b64 s[30:31]
16154;
16155; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
16156; GFX940:       ; %bb.0:
16157; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16158; GFX940-NEXT:    buffer_wbl2 sc1
16159; GFX940-NEXT:    flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0
16160; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16161; GFX940-NEXT:    buffer_inv sc1
16162; GFX940-NEXT:    s_setpc_b64 s[30:31]
16163;
16164; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
16165; GFX11:       ; %bb.0:
16166; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16167; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
16168; GFX11-NEXT:    s_mov_b32 s0, 0
16169; GFX11-NEXT:  .LBB66_1: ; %atomicrmw.start
16170; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
16171; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16172; GFX11-NEXT:    v_mov_b32_e32 v4, v3
16173; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
16174; GFX11-NEXT:    v_pk_add_f16 v3, v4, v2
16175; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
16176; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
16177; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16178; GFX11-NEXT:    buffer_gl1_inv
16179; GFX11-NEXT:    buffer_gl0_inv
16180; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
16181; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
16182; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
16183; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
16184; GFX11-NEXT:    s_cbranch_execnz .LBB66_1
16185; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
16186; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
16187; GFX11-NEXT:    v_mov_b32_e32 v0, v3
16188; GFX11-NEXT:    s_setpc_b64 s[30:31]
16189;
16190; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
16191; GFX10:       ; %bb.0:
16192; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16193; GFX10-NEXT:    flat_load_dword v3, v[0:1]
16194; GFX10-NEXT:    s_mov_b32 s4, 0
16195; GFX10-NEXT:  .LBB66_1: ; %atomicrmw.start
16196; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
16197; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16198; GFX10-NEXT:    v_mov_b32_e32 v4, v3
16199; GFX10-NEXT:    v_pk_add_f16 v3, v4, v2
16200; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
16201; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
16202; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16203; GFX10-NEXT:    buffer_gl1_inv
16204; GFX10-NEXT:    buffer_gl0_inv
16205; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
16206; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
16207; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
16208; GFX10-NEXT:    s_cbranch_execnz .LBB66_1
16209; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
16210; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
16211; GFX10-NEXT:    v_mov_b32_e32 v0, v3
16212; GFX10-NEXT:    s_setpc_b64 s[30:31]
16213;
16214; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
16215; GFX90A:       ; %bb.0:
16216; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16217; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
16218; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
16219; GFX90A-NEXT:  .LBB66_1: ; %atomicrmw.start
16220; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
16221; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16222; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
16223; GFX90A-NEXT:    v_pk_add_f16 v4, v5, v2
16224; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
16225; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16226; GFX90A-NEXT:    buffer_wbinvl1
16227; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
16228; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
16229; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
16230; GFX90A-NEXT:    s_cbranch_execnz .LBB66_1
16231; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
16232; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
16233; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
16234; GFX90A-NEXT:    s_setpc_b64 s[30:31]
16235;
16236; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
16237; GFX908:       ; %bb.0:
16238; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16239; GFX908-NEXT:    flat_load_dword v3, v[0:1]
16240; GFX908-NEXT:    s_mov_b64 s[4:5], 0
16241; GFX908-NEXT:  .LBB66_1: ; %atomicrmw.start
16242; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
16243; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16244; GFX908-NEXT:    v_mov_b32_e32 v4, v3
16245; GFX908-NEXT:    v_pk_add_f16 v3, v4, v2
16246; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
16247; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16248; GFX908-NEXT:    buffer_wbinvl1
16249; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
16250; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
16251; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
16252; GFX908-NEXT:    s_cbranch_execnz .LBB66_1
16253; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
16254; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
16255; GFX908-NEXT:    v_mov_b32_e32 v0, v3
16256; GFX908-NEXT:    s_setpc_b64 s[30:31]
16257;
16258; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
16259; GFX8:       ; %bb.0:
16260; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16261; GFX8-NEXT:    flat_load_dword v3, v[0:1]
16262; GFX8-NEXT:    s_mov_b64 s[4:5], 0
16263; GFX8-NEXT:  .LBB66_1: ; %atomicrmw.start
16264; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
16265; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16266; GFX8-NEXT:    v_mov_b32_e32 v4, v3
16267; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
16268; GFX8-NEXT:    v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
16269; GFX8-NEXT:    v_add_f16_e32 v5, v4, v2
16270; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
16271; GFX8-NEXT:    v_or_b32_e32 v3, v5, v3
16272; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
16273; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16274; GFX8-NEXT:    buffer_wbinvl1
16275; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
16276; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
16277; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
16278; GFX8-NEXT:    s_cbranch_execnz .LBB66_1
16279; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
16280; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
16281; GFX8-NEXT:    v_mov_b32_e32 v0, v3
16282; GFX8-NEXT:    s_setpc_b64 s[30:31]
16283;
16284; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
16285; GFX7:       ; %bb.0:
16286; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16287; GFX7-NEXT:    flat_load_dword v5, v[0:1]
16288; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
16289; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v2
16290; GFX7-NEXT:    s_mov_b64 s[4:5], 0
16291; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v3
16292; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16293; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
16294; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v5
16295; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
16296; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v6
16297; GFX7-NEXT:  .LBB66_1: ; %atomicrmw.start
16298; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
16299; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
16300; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
16301; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v3
16302; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v2
16303; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
16304; GFX7-NEXT:    v_add_f32_e32 v6, v6, v4
16305; GFX7-NEXT:    v_add_f32_e32 v7, v7, v5
16306; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
16307; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v7
16308; GFX7-NEXT:    v_or_b32_e32 v7, v2, v3
16309; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
16310; GFX7-NEXT:    v_or_b32_e32 v6, v8, v2
16311; GFX7-NEXT:    flat_atomic_cmpswap v6, v[0:1], v[6:7] glc
16312; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16313; GFX7-NEXT:    buffer_wbinvl1
16314; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v6
16315; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v6
16316; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
16317; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v7
16318; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
16319; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
16320; GFX7-NEXT:    s_cbranch_execnz .LBB66_1
16321; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
16322; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
16323; GFX7-NEXT:    v_mov_b32_e32 v0, v2
16324; GFX7-NEXT:    v_mov_b32_e32 v1, v3
16325; GFX7-NEXT:    s_setpc_b64 s[30:31]
; %result is returned to the caller, so this function covers the returning
; form of the atomic.
16326  %result = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
16327  ret <2 x half> %result
16328}
16329
; No-return variant: flat-pointer, agent-scope, seq_cst `atomicrmw fadd` of
; <2 x half> tagged with both !amdgpu.no.fine.grained.memory and
; !amdgpu.no.remote.memory; the result is discarded.
; Per the autogenerated checks below, gfx1200 and gfx940 select a native
; flat_atomic_pk_add_f16 with no destination register, while gfx1100, gfx1010,
; gfx90a, gfx908, tonga and hawaii expand to a flat load followed by a
; flat_atomic_cmpswap retry loop. In the hawaii loop each half is widened to
; f32 for the add and narrowed back before the compare-and-swap.
; The check lines come from utils/update_llc_test_checks.py; regenerate them
; rather than editing them by hand.
16330define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr %ptr, <2 x half> %val) #0 {
16331; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
16332; GFX12:       ; %bb.0:
16333; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
16334; GFX12-NEXT:    s_wait_expcnt 0x0
16335; GFX12-NEXT:    s_wait_samplecnt 0x0
16336; GFX12-NEXT:    s_wait_bvhcnt 0x0
16337; GFX12-NEXT:    s_wait_kmcnt 0x0
16338; GFX12-NEXT:    s_wait_storecnt 0x0
16339; GFX12-NEXT:    flat_atomic_pk_add_f16 v[0:1], v2 scope:SCOPE_DEV
16340; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
16341; GFX12-NEXT:    global_inv scope:SCOPE_DEV
16342; GFX12-NEXT:    s_setpc_b64 s[30:31]
16343;
16344; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
16345; GFX940:       ; %bb.0:
16346; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16347; GFX940-NEXT:    buffer_wbl2 sc1
16348; GFX940-NEXT:    flat_atomic_pk_add_f16 v[0:1], v2
16349; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16350; GFX940-NEXT:    buffer_inv sc1
16351; GFX940-NEXT:    s_setpc_b64 s[30:31]
16352;
16353; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
16354; GFX11:       ; %bb.0:
16355; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16356; GFX11-NEXT:    flat_load_b32 v4, v[0:1]
16357; GFX11-NEXT:    s_mov_b32 s0, 0
16358; GFX11-NEXT:  .LBB67_1: ; %atomicrmw.start
16359; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
16360; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16361; GFX11-NEXT:    v_pk_add_f16 v3, v4, v2
16362; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
16363; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
16364; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16365; GFX11-NEXT:    buffer_gl1_inv
16366; GFX11-NEXT:    buffer_gl0_inv
16367; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
16368; GFX11-NEXT:    v_mov_b32_e32 v4, v3
16369; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
16370; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
16371; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
16372; GFX11-NEXT:    s_cbranch_execnz .LBB67_1
16373; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
16374; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
16375; GFX11-NEXT:    s_setpc_b64 s[30:31]
16376;
16377; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
16378; GFX10:       ; %bb.0:
16379; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16380; GFX10-NEXT:    flat_load_dword v4, v[0:1]
16381; GFX10-NEXT:    s_mov_b32 s4, 0
16382; GFX10-NEXT:  .LBB67_1: ; %atomicrmw.start
16383; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
16384; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16385; GFX10-NEXT:    v_pk_add_f16 v3, v4, v2
16386; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
16387; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
16388; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16389; GFX10-NEXT:    buffer_gl1_inv
16390; GFX10-NEXT:    buffer_gl0_inv
16391; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
16392; GFX10-NEXT:    v_mov_b32_e32 v4, v3
16393; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
16394; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
16395; GFX10-NEXT:    s_cbranch_execnz .LBB67_1
16396; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
16397; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
16398; GFX10-NEXT:    s_setpc_b64 s[30:31]
16399;
16400; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
16401; GFX90A:       ; %bb.0:
16402; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16403; GFX90A-NEXT:    flat_load_dword v5, v[0:1]
16404; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
16405; GFX90A-NEXT:  .LBB67_1: ; %atomicrmw.start
16406; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
16407; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16408; GFX90A-NEXT:    v_pk_add_f16 v4, v5, v2
16409; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
16410; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16411; GFX90A-NEXT:    buffer_wbinvl1
16412; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
16413; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
16414; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
16415; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
16416; GFX90A-NEXT:    s_cbranch_execnz .LBB67_1
16417; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
16418; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
16419; GFX90A-NEXT:    s_setpc_b64 s[30:31]
16420;
16421; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
16422; GFX908:       ; %bb.0:
16423; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16424; GFX908-NEXT:    flat_load_dword v4, v[0:1]
16425; GFX908-NEXT:    s_mov_b64 s[4:5], 0
16426; GFX908-NEXT:  .LBB67_1: ; %atomicrmw.start
16427; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
16428; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16429; GFX908-NEXT:    v_pk_add_f16 v3, v4, v2
16430; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
16431; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16432; GFX908-NEXT:    buffer_wbinvl1
16433; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
16434; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
16435; GFX908-NEXT:    v_mov_b32_e32 v4, v3
16436; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
16437; GFX908-NEXT:    s_cbranch_execnz .LBB67_1
16438; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
16439; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
16440; GFX908-NEXT:    s_setpc_b64 s[30:31]
16441;
16442; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
16443; GFX8:       ; %bb.0:
16444; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16445; GFX8-NEXT:    flat_load_dword v4, v[0:1]
16446; GFX8-NEXT:    s_mov_b64 s[4:5], 0
16447; GFX8-NEXT:  .LBB67_1: ; %atomicrmw.start
16448; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
16449; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16450; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
16451; GFX8-NEXT:    v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
16452; GFX8-NEXT:    v_add_f16_e32 v5, v4, v2
16453; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
16454; GFX8-NEXT:    v_or_b32_e32 v3, v5, v3
16455; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
16456; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16457; GFX8-NEXT:    buffer_wbinvl1
16458; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
16459; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
16460; GFX8-NEXT:    v_mov_b32_e32 v4, v3
16461; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
16462; GFX8-NEXT:    s_cbranch_execnz .LBB67_1
16463; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
16464; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
16465; GFX8-NEXT:    s_setpc_b64 s[30:31]
16466;
16467; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
16468; GFX7:       ; %bb.0:
16469; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16470; GFX7-NEXT:    flat_load_dword v5, v[0:1]
16471; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
16472; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v2
16473; GFX7-NEXT:    s_mov_b64 s[4:5], 0
16474; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v3
16475; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16476; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
16477; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v5
16478; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v3
16479; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v6
16480; GFX7-NEXT:  .LBB67_1: ; %atomicrmw.start
16481; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
16482; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
16483; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
16484; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v5
16485; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v4
16486; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
16487; GFX7-NEXT:    v_add_f32_e32 v6, v6, v2
16488; GFX7-NEXT:    v_add_f32_e32 v7, v7, v3
16489; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v6
16490; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
16491; GFX7-NEXT:    v_or_b32_e32 v6, v4, v5
16492; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
16493; GFX7-NEXT:    v_or_b32_e32 v5, v7, v4
16494; GFX7-NEXT:    flat_atomic_cmpswap v7, v[0:1], v[5:6] glc
16495; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16496; GFX7-NEXT:    buffer_wbinvl1
16497; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
16498; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v7
16499; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
16500; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
16501; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
16502; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
16503; GFX7-NEXT:    s_cbranch_execnz .LBB67_1
16504; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
16505; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
16506; GFX7-NEXT:    s_setpc_b64 s[30:31]
; The result is bound to %unused and never read, so this function covers the
; no-return form of the atomic.
16507  %unused = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
16508  ret void
16509}
16510
16511; --------------------------------------------------------------------
16512; <2 x bfloat>
16513; --------------------------------------------------------------------
16514
16515define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 {
16516; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory:
16517; GFX12:       ; %bb.0:
16518; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
16519; GFX12-NEXT:    s_wait_expcnt 0x0
16520; GFX12-NEXT:    s_wait_samplecnt 0x0
16521; GFX12-NEXT:    s_wait_bvhcnt 0x0
16522; GFX12-NEXT:    s_wait_kmcnt 0x0
16523; GFX12-NEXT:    s_wait_storecnt 0x0
16524; GFX12-NEXT:    flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
16525; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
16526; GFX12-NEXT:    global_inv scope:SCOPE_DEV
16527; GFX12-NEXT:    s_setpc_b64 s[30:31]
16528;
16529; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory:
16530; GFX940:       ; %bb.0:
16531; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16532; GFX940-NEXT:    buffer_wbl2 sc1
16533; GFX940-NEXT:    flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0
16534; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16535; GFX940-NEXT:    buffer_inv sc1
16536; GFX940-NEXT:    s_setpc_b64 s[30:31]
16537;
16538; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory:
16539; GFX11:       ; %bb.0:
16540; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16541; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
16542; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
16543; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
16544; GFX11-NEXT:    s_mov_b32 s1, 0
16545; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
16546; GFX11-NEXT:    .p2align 6
16547; GFX11-NEXT:  .LBB68_1: ; %atomicrmw.start
16548; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
16549; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16550; GFX11-NEXT:    v_mov_b32_e32 v6, v3
16551; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
16552; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
16553; GFX11-NEXT:    v_add_f32_e32 v5, v5, v2
16554; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
16555; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
16556; GFX11-NEXT:    v_bfe_u32 v8, v5, 16, 1
16557; GFX11-NEXT:    v_add_f32_e32 v3, v3, v4
16558; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v5
16559; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
16560; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
16561; GFX11-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
16562; GFX11-NEXT:    v_bfe_u32 v7, v3, 16, 1
16563; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v3
16564; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
16565; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
16566; GFX11-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
16567; GFX11-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
16568; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
16569; GFX11-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
16570; GFX11-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
16571; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
16572; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc
16573; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16574; GFX11-NEXT:    buffer_gl1_inv
16575; GFX11-NEXT:    buffer_gl0_inv
16576; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
16577; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
16578; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
16579; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
16580; GFX11-NEXT:    s_cbranch_execnz .LBB68_1
16581; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
16582; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
16583; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
16584; GFX11-NEXT:    v_mov_b32_e32 v0, v3
16585; GFX11-NEXT:    s_setpc_b64 s[30:31]
16586;
16587; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory:
16588; GFX10:       ; %bb.0:
16589; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16590; GFX10-NEXT:    flat_load_dword v3, v[0:1]
16591; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
16592; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
16593; GFX10-NEXT:    s_mov_b32 s5, 0
16594; GFX10-NEXT:  .LBB68_1: ; %atomicrmw.start
16595; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
16596; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16597; GFX10-NEXT:    v_mov_b32_e32 v6, v3
16598; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
16599; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
16600; GFX10-NEXT:    v_add_f32_e32 v3, v3, v4
16601; GFX10-NEXT:    v_add_f32_e32 v5, v5, v2
16602; GFX10-NEXT:    v_bfe_u32 v7, v3, 16, 1
16603; GFX10-NEXT:    v_bfe_u32 v8, v5, 16, 1
16604; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v3
16605; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v5
16606; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
16607; GFX10-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
16608; GFX10-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
16609; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v3, v3
16610; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
16611; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s4
16612; GFX10-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
16613; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
16614; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
16615; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16616; GFX10-NEXT:    buffer_gl1_inv
16617; GFX10-NEXT:    buffer_gl0_inv
16618; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
16619; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
16620; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
16621; GFX10-NEXT:    s_cbranch_execnz .LBB68_1
16622; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
16623; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
16624; GFX10-NEXT:    v_mov_b32_e32 v0, v3
16625; GFX10-NEXT:    s_setpc_b64 s[30:31]
16626;
16627; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory:
16628; GFX90A:       ; %bb.0:
16629; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16630; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
16631; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
16632; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
16633; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
16634; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
16635; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
16636; GFX90A-NEXT:  .LBB68_1: ; %atomicrmw.start
16637; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
16638; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16639; GFX90A-NEXT:    v_mov_b32_e32 v7, v3
16640; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
16641; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
16642; GFX90A-NEXT:    v_add_f32_e32 v3, v3, v4
16643; GFX90A-NEXT:    v_add_f32_e32 v5, v5, v2
16644; GFX90A-NEXT:    v_bfe_u32 v6, v3, 16, 1
16645; GFX90A-NEXT:    v_bfe_u32 v9, v5, 16, 1
16646; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v3
16647; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v5
16648; GFX90A-NEXT:    v_add3_u32 v6, v6, v3, s8
16649; GFX90A-NEXT:    v_add3_u32 v9, v9, v5, s8
16650; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
16651; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
16652; GFX90A-NEXT:    v_cndmask_b32_e64 v3, v6, v8, s[4:5]
16653; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
16654; GFX90A-NEXT:    v_perm_b32 v6, v5, v3, s9
16655; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[6:7] glc
16656; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16657; GFX90A-NEXT:    buffer_wbinvl1
16658; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
16659; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
16660; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
16661; GFX90A-NEXT:    s_cbranch_execnz .LBB68_1
16662; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
16663; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
16664; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
16665; GFX90A-NEXT:    s_setpc_b64 s[30:31]
16666;
16667; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory:
16668; GFX908:       ; %bb.0:
16669; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16670; GFX908-NEXT:    flat_load_dword v3, v[0:1]
16671; GFX908-NEXT:    s_mov_b64 s[6:7], 0
16672; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
16673; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
16674; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
16675; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
16676; GFX908-NEXT:  .LBB68_1: ; %atomicrmw.start
16677; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
16678; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16679; GFX908-NEXT:    v_mov_b32_e32 v6, v3
16680; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
16681; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
16682; GFX908-NEXT:    v_add_f32_e32 v3, v3, v4
16683; GFX908-NEXT:    v_add_f32_e32 v5, v5, v2
16684; GFX908-NEXT:    v_bfe_u32 v7, v3, 16, 1
16685; GFX908-NEXT:    v_bfe_u32 v9, v5, 16, 1
16686; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v3
16687; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v5
16688; GFX908-NEXT:    v_add3_u32 v7, v7, v3, s8
16689; GFX908-NEXT:    v_add3_u32 v9, v9, v5, s8
16690; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
16691; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
16692; GFX908-NEXT:    v_cndmask_b32_e64 v3, v7, v8, s[4:5]
16693; GFX908-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
16694; GFX908-NEXT:    v_perm_b32 v5, v5, v3, s9
16695; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
16696; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16697; GFX908-NEXT:    buffer_wbinvl1
16698; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
16699; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
16700; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
16701; GFX908-NEXT:    s_cbranch_execnz .LBB68_1
16702; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
16703; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
16704; GFX908-NEXT:    v_mov_b32_e32 v0, v3
16705; GFX908-NEXT:    s_setpc_b64 s[30:31]
16706;
16707; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory:
16708; GFX8:       ; %bb.0:
16709; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16710; GFX8-NEXT:    flat_load_dword v3, v[0:1]
16711; GFX8-NEXT:    s_mov_b64 s[6:7], 0
16712; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
16713; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
16714; GFX8-NEXT:  .LBB68_1: ; %atomicrmw.start
16715; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
16716; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16717; GFX8-NEXT:    v_mov_b32_e32 v6, v3
16718; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
16719; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
16720; GFX8-NEXT:    v_add_f32_e32 v3, v3, v4
16721; GFX8-NEXT:    v_add_f32_e32 v5, v5, v2
16722; GFX8-NEXT:    v_bfe_u32 v7, v3, 16, 1
16723; GFX8-NEXT:    v_bfe_u32 v9, v5, 16, 1
16724; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v3
16725; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v5
16726; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
16727; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
16728; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v5
16729; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
16730; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v3
16731; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
16732; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
16733; GFX8-NEXT:    v_cndmask_b32_e64 v3, v7, v8, s[4:5]
16734; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
16735; GFX8-NEXT:    v_alignbit_b32 v5, v5, v3, 16
16736; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
16737; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16738; GFX8-NEXT:    buffer_wbinvl1
16739; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
16740; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
16741; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
16742; GFX8-NEXT:    s_cbranch_execnz .LBB68_1
16743; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
16744; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
16745; GFX8-NEXT:    v_mov_b32_e32 v0, v3
16746; GFX8-NEXT:    s_setpc_b64 s[30:31]
16747;
16748; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory:
16749; GFX7:       ; %bb.0:
16750; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16751; GFX7-NEXT:    flat_load_dword v5, v[0:1]
16752; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
16753; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v3
16754; GFX7-NEXT:    s_mov_b64 s[4:5], 0
16755; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
16756; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16757; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
16758; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
16759; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
16760; GFX7-NEXT:  .LBB68_1: ; %atomicrmw.start
16761; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
16762; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
16763; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
16764; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
16765; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
16766; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
16767; GFX7-NEXT:    v_add_f32_e32 v7, v7, v5
16768; GFX7-NEXT:    v_add_f32_e32 v6, v6, v4
16769; GFX7-NEXT:    v_alignbit_b32 v3, v2, v3, 16
16770; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v7
16771; GFX7-NEXT:    v_alignbit_b32 v2, v2, v6, 16
16772; GFX7-NEXT:    flat_atomic_cmpswap v6, v[0:1], v[2:3] glc
16773; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16774; GFX7-NEXT:    buffer_wbinvl1
16775; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v3
16776; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
16777; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
16778; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
16779; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
16780; GFX7-NEXT:    s_cbranch_execnz .LBB68_1
16781; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
16782; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
16783; GFX7-NEXT:    v_mov_b32_e32 v0, v3
16784; GFX7-NEXT:    v_mov_b32_e32 v1, v2
16785; GFX7-NEXT:    s_setpc_b64 s[30:31]
16786  %result = atomicrmw fadd ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
16787  ret <2 x bfloat> %result
16788}
16789
; Returning `atomicrmw fadd` of a <2 x bfloat> through a flat pointer at a
; positive constant offset of 511 elements, which the expected output
; addresses as byte offset 2044 (0x7fc). The operation is seq_cst at
; syncscope("agent") and carries !amdgpu.no.fine.grained.memory.
;
; What the expectations below pin down, grouped by check prefix
; - GFX12 and GFX940 select a single flat_atomic_pk_add_bf16 with offset:2044
;   (GFX12 follows it with global_inv, GFX940 brackets it with buffer_wbl2 and
;   buffer_inv).
; - GFX11, GFX90A and GFX908 expand to a flat_atomic_cmpswap retry loop
;   (.LBB69_1) with offset:2044 folded into both the initial load and the
;   cmpswap.
; - GFX10, GFX8 and GFX7 use the same kind of retry loop, but add 0x7fc to the
;   64-bit pointer with explicit VALU adds, so their memory instructions carry
;   no immediate offset.
; Inside the loops each bf16 half is widened to f32 (shift left by 16, or mask
; with 0xffff0000), added with v_add_f32, then narrowed and repacked into one
; dword for the compare-and-swap.
;
; The check lines are produced by utils/update_llc_test_checks.py (see the NOTE
; on the first line of the file); regenerate them rather than editing by hand.
16790define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 {
16791; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
16792; GFX12:       ; %bb.0:
16793; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
16794; GFX12-NEXT:    s_wait_expcnt 0x0
16795; GFX12-NEXT:    s_wait_samplecnt 0x0
16796; GFX12-NEXT:    s_wait_bvhcnt 0x0
16797; GFX12-NEXT:    s_wait_kmcnt 0x0
16798; GFX12-NEXT:    s_wait_storecnt 0x0
16799; GFX12-NEXT:    flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
16800; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
16801; GFX12-NEXT:    global_inv scope:SCOPE_DEV
16802; GFX12-NEXT:    s_setpc_b64 s[30:31]
16803;
16804; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
16805; GFX940:       ; %bb.0:
16806; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16807; GFX940-NEXT:    buffer_wbl2 sc1
16808; GFX940-NEXT:    flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 sc0
16809; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16810; GFX940-NEXT:    buffer_inv sc1
16811; GFX940-NEXT:    s_setpc_b64 s[30:31]
16812;
16813; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
16814; GFX11:       ; %bb.0:
16815; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16816; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
16817; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
16818; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
16819; GFX11-NEXT:    s_mov_b32 s1, 0
16820; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
16821; GFX11-NEXT:    .p2align 6
16822; GFX11-NEXT:  .LBB69_1: ; %atomicrmw.start
16823; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
16824; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16825; GFX11-NEXT:    v_mov_b32_e32 v6, v3
16826; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
16827; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
16828; GFX11-NEXT:    v_add_f32_e32 v5, v5, v2
16829; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
16830; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
16831; GFX11-NEXT:    v_bfe_u32 v8, v5, 16, 1
16832; GFX11-NEXT:    v_add_f32_e32 v3, v3, v4
16833; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v5
16834; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
16835; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
16836; GFX11-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
16837; GFX11-NEXT:    v_bfe_u32 v7, v3, 16, 1
16838; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v3
16839; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
16840; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
16841; GFX11-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
16842; GFX11-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
16843; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
16844; GFX11-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
16845; GFX11-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
16846; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
16847; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
16848; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16849; GFX11-NEXT:    buffer_gl1_inv
16850; GFX11-NEXT:    buffer_gl0_inv
16851; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
16852; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
16853; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
16854; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
16855; GFX11-NEXT:    s_cbranch_execnz .LBB69_1
16856; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
16857; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
16858; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
16859; GFX11-NEXT:    v_mov_b32_e32 v0, v3
16860; GFX11-NEXT:    s_setpc_b64 s[30:31]
16861;
16862; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
16863; GFX10:       ; %bb.0:
16864; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16865; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fc, v0
16866; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
16867; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
16868; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
16869; GFX10-NEXT:    s_mov_b32 s5, 0
16870; GFX10-NEXT:    flat_load_dword v0, v[3:4]
16871; GFX10-NEXT:  .LBB69_1: ; %atomicrmw.start
16872; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
16873; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16874; GFX10-NEXT:    v_mov_b32_e32 v6, v0
16875; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
16876; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
16877; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
16878; GFX10-NEXT:    v_add_f32_e32 v5, v5, v2
16879; GFX10-NEXT:    v_bfe_u32 v7, v0, 16, 1
16880; GFX10-NEXT:    v_bfe_u32 v8, v5, 16, 1
16881; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v0
16882; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v5
16883; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
16884; GFX10-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
16885; GFX10-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
16886; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
16887; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
16888; GFX10-NEXT:    v_cndmask_b32_e64 v0, v7, v9, s4
16889; GFX10-NEXT:    v_perm_b32 v5, v5, v0, 0x7060302
16890; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
16891; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
16892; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16893; GFX10-NEXT:    buffer_gl1_inv
16894; GFX10-NEXT:    buffer_gl0_inv
16895; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
16896; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
16897; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
16898; GFX10-NEXT:    s_cbranch_execnz .LBB69_1
16899; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
16900; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
16901; GFX10-NEXT:    s_setpc_b64 s[30:31]
16902;
16903; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
16904; GFX90A:       ; %bb.0:
16905; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16906; GFX90A-NEXT:    flat_load_dword v3, v[0:1] offset:2044
16907; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
16908; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
16909; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
16910; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
16911; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
16912; GFX90A-NEXT:  .LBB69_1: ; %atomicrmw.start
16913; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
16914; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16915; GFX90A-NEXT:    v_mov_b32_e32 v7, v3
16916; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
16917; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
16918; GFX90A-NEXT:    v_add_f32_e32 v3, v3, v4
16919; GFX90A-NEXT:    v_add_f32_e32 v5, v5, v2
16920; GFX90A-NEXT:    v_bfe_u32 v6, v3, 16, 1
16921; GFX90A-NEXT:    v_bfe_u32 v9, v5, 16, 1
16922; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v3
16923; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v5
16924; GFX90A-NEXT:    v_add3_u32 v6, v6, v3, s8
16925; GFX90A-NEXT:    v_add3_u32 v9, v9, v5, s8
16926; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
16927; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
16928; GFX90A-NEXT:    v_cndmask_b32_e64 v3, v6, v8, s[4:5]
16929; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
16930; GFX90A-NEXT:    v_perm_b32 v6, v5, v3, s9
16931; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc
16932; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16933; GFX90A-NEXT:    buffer_wbinvl1
16934; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
16935; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
16936; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
16937; GFX90A-NEXT:    s_cbranch_execnz .LBB69_1
16938; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
16939; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
16940; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
16941; GFX90A-NEXT:    s_setpc_b64 s[30:31]
16942;
16943; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
16944; GFX908:       ; %bb.0:
16945; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16946; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2044
16947; GFX908-NEXT:    s_mov_b64 s[6:7], 0
16948; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
16949; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
16950; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
16951; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
16952; GFX908-NEXT:  .LBB69_1: ; %atomicrmw.start
16953; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
16954; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16955; GFX908-NEXT:    v_mov_b32_e32 v6, v3
16956; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
16957; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
16958; GFX908-NEXT:    v_add_f32_e32 v3, v3, v4
16959; GFX908-NEXT:    v_add_f32_e32 v5, v5, v2
16960; GFX908-NEXT:    v_bfe_u32 v7, v3, 16, 1
16961; GFX908-NEXT:    v_bfe_u32 v9, v5, 16, 1
16962; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v3
16963; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v5
16964; GFX908-NEXT:    v_add3_u32 v7, v7, v3, s8
16965; GFX908-NEXT:    v_add3_u32 v9, v9, v5, s8
16966; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
16967; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
16968; GFX908-NEXT:    v_cndmask_b32_e64 v3, v7, v8, s[4:5]
16969; GFX908-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
16970; GFX908-NEXT:    v_perm_b32 v5, v5, v3, s9
16971; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc
16972; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16973; GFX908-NEXT:    buffer_wbinvl1
16974; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
16975; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
16976; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
16977; GFX908-NEXT:    s_cbranch_execnz .LBB69_1
16978; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
16979; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
16980; GFX908-NEXT:    v_mov_b32_e32 v0, v3
16981; GFX908-NEXT:    s_setpc_b64 s[30:31]
16982;
16983; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
16984; GFX8:       ; %bb.0:
16985; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16986; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
16987; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
16988; GFX8-NEXT:    flat_load_dword v0, v[3:4]
16989; GFX8-NEXT:    s_mov_b64 s[6:7], 0
16990; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
16991; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
16992; GFX8-NEXT:  .LBB69_1: ; %atomicrmw.start
16993; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
16994; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16995; GFX8-NEXT:    v_mov_b32_e32 v6, v0
16996; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
16997; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
16998; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
16999; GFX8-NEXT:    v_add_f32_e32 v5, v5, v2
17000; GFX8-NEXT:    v_bfe_u32 v7, v0, 16, 1
17001; GFX8-NEXT:    v_bfe_u32 v9, v5, 16, 1
17002; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v0
17003; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v5
17004; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
17005; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
17006; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v5
17007; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
17008; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v0
17009; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
17010; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
17011; GFX8-NEXT:    v_cndmask_b32_e64 v0, v7, v8, s[4:5]
17012; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
17013; GFX8-NEXT:    v_alignbit_b32 v5, v5, v0, 16
17014; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
17015; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17016; GFX8-NEXT:    buffer_wbinvl1
17017; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
17018; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
17019; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
17020; GFX8-NEXT:    s_cbranch_execnz .LBB69_1
17021; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
17022; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
17023; GFX8-NEXT:    s_setpc_b64 s[30:31]
17024;
17025; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
17026; GFX7:       ; %bb.0:
17027; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17028; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fc, v0
17029; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
17030; GFX7-NEXT:    flat_load_dword v0, v[4:5]
17031; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v2
17032; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
17033; GFX7-NEXT:    s_mov_b64 s[4:5], 0
17034; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
17035; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
17036; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17037; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
17038; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
17039; GFX7-NEXT:  .LBB69_1: ; %atomicrmw.start
17040; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
17041; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
17042; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
17043; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
17044; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
17045; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
17046; GFX7-NEXT:    v_add_f32_e32 v7, v7, v3
17047; GFX7-NEXT:    v_add_f32_e32 v6, v6, v2
17048; GFX7-NEXT:    v_alignbit_b32 v1, v1, v0, 16
17049; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v7
17050; GFX7-NEXT:    v_alignbit_b32 v0, v0, v6, 16
17051; GFX7-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
17052; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17053; GFX7-NEXT:    buffer_wbinvl1
17054; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
17055; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
17056; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
17057; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
17058; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
17059; GFX7-NEXT:    s_cbranch_execnz .LBB69_1
17060; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
17061; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
17062; GFX7-NEXT:    s_setpc_b64 s[30:31]
; Index 511 in units of <2 x bfloat>; the expectations above address this as
; 2044 bytes (0x7fc), i.e. 4 bytes per element.
17063  %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511
17064  %result = atomicrmw fadd ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
17065  ret <2 x bfloat> %result
17066}
17067
17068define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 {
17069; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
17070; GFX12:       ; %bb.0:
17071; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
17072; GFX12-NEXT:    s_wait_expcnt 0x0
17073; GFX12-NEXT:    s_wait_samplecnt 0x0
17074; GFX12-NEXT:    s_wait_bvhcnt 0x0
17075; GFX12-NEXT:    s_wait_kmcnt 0x0
17076; GFX12-NEXT:    s_wait_storecnt 0x0
17077; GFX12-NEXT:    flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
17078; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
17079; GFX12-NEXT:    global_inv scope:SCOPE_DEV
17080; GFX12-NEXT:    s_setpc_b64 s[30:31]
17081;
17082; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
17083; GFX940:       ; %bb.0:
17084; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17085; GFX940-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
17086; GFX940-NEXT:    s_nop 1
17087; GFX940-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
17088; GFX940-NEXT:    buffer_wbl2 sc1
17089; GFX940-NEXT:    flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0
17090; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17091; GFX940-NEXT:    buffer_inv sc1
17092; GFX940-NEXT:    s_setpc_b64 s[30:31]
17093;
17094; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
17095; GFX11:       ; %bb.0:
17096; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17097; GFX11-NEXT:    v_mov_b32_e32 v3, v0
17098; GFX11-NEXT:    s_mov_b32 s1, 0
17099; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
17100; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
17101; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
17102; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
17103; GFX11-NEXT:    flat_load_b32 v0, v[4:5]
17104; GFX11-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
17105; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
17106; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
17107; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
17108; GFX11-NEXT:    .p2align 6
17109; GFX11-NEXT:  .LBB70_1: ; %atomicrmw.start
17110; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
17111; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17112; GFX11-NEXT:    v_mov_b32_e32 v6, v0
17113; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
17114; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
17115; GFX11-NEXT:    v_add_f32_e32 v5, v5, v2
17116; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
17117; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
17118; GFX11-NEXT:    v_bfe_u32 v8, v5, 16, 1
17119; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
17120; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v5
17121; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
17122; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
17123; GFX11-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
17124; GFX11-NEXT:    v_bfe_u32 v7, v0, 16, 1
17125; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v0
17126; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v0, v0
17127; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
17128; GFX11-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
17129; GFX11-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
17130; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
17131; GFX11-NEXT:    v_cndmask_b32_e64 v0, v7, v9, s0
17132; GFX11-NEXT:    v_perm_b32 v5, v5, v0, 0x7060302
17133; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
17134; GFX11-NEXT:    flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc
17135; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17136; GFX11-NEXT:    buffer_gl1_inv
17137; GFX11-NEXT:    buffer_gl0_inv
17138; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
17139; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
17140; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
17141; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
17142; GFX11-NEXT:    s_cbranch_execnz .LBB70_1
17143; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
17144; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
17145; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
17146; GFX11-NEXT:    s_setpc_b64 s[30:31]
17147;
17148; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
17149; GFX10:       ; %bb.0:
17150; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17151; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
17152; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
17153; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
17154; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
17155; GFX10-NEXT:    s_mov_b32 s5, 0
17156; GFX10-NEXT:    flat_load_dword v0, v[3:4]
17157; GFX10-NEXT:  .LBB70_1: ; %atomicrmw.start
17158; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
17159; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17160; GFX10-NEXT:    v_mov_b32_e32 v6, v0
17161; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
17162; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
17163; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
17164; GFX10-NEXT:    v_add_f32_e32 v5, v5, v2
17165; GFX10-NEXT:    v_bfe_u32 v7, v0, 16, 1
17166; GFX10-NEXT:    v_bfe_u32 v8, v5, 16, 1
17167; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v0
17168; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v5
17169; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
17170; GFX10-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
17171; GFX10-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
17172; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
17173; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
17174; GFX10-NEXT:    v_cndmask_b32_e64 v0, v7, v9, s4
17175; GFX10-NEXT:    v_perm_b32 v5, v5, v0, 0x7060302
17176; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
17177; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
17178; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17179; GFX10-NEXT:    buffer_gl1_inv
17180; GFX10-NEXT:    buffer_gl0_inv
17181; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
17182; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
17183; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
17184; GFX10-NEXT:    s_cbranch_execnz .LBB70_1
17185; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
17186; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
17187; GFX10-NEXT:    s_setpc_b64 s[30:31]
17188;
17189; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
17190; GFX90A:       ; %bb.0:
17191; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17192; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
17193; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
17194; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
17195; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
17196; GFX90A-NEXT:    flat_load_dword v0, v[0:1]
17197; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
17198; GFX90A-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
17199; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
17200; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
17201; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
17202; GFX90A-NEXT:  .LBB70_1: ; %atomicrmw.start
17203; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
17204; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17205; GFX90A-NEXT:    v_mov_b32_e32 v7, v0
17206; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v7
17207; GFX90A-NEXT:    v_and_b32_e32 v3, 0xffff0000, v7
17208; GFX90A-NEXT:    v_add_f32_e32 v0, v0, v1
17209; GFX90A-NEXT:    v_add_f32_e32 v3, v3, v2
17210; GFX90A-NEXT:    v_bfe_u32 v6, v0, 16, 1
17211; GFX90A-NEXT:    v_bfe_u32 v9, v3, 16, 1
17212; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v0
17213; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v3
17214; GFX90A-NEXT:    v_add3_u32 v6, v6, v0, s8
17215; GFX90A-NEXT:    v_add3_u32 v9, v9, v3, s8
17216; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
17217; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
17218; GFX90A-NEXT:    v_cndmask_b32_e64 v0, v6, v8, s[4:5]
17219; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v9, v10, vcc
17220; GFX90A-NEXT:    v_perm_b32 v6, v3, v0, s9
17221; GFX90A-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[6:7] glc
17222; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17223; GFX90A-NEXT:    buffer_wbinvl1
17224; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v7
17225; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
17226; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
17227; GFX90A-NEXT:    s_cbranch_execnz .LBB70_1
17228; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
17229; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
17230; GFX90A-NEXT:    s_setpc_b64 s[30:31]
17231;
17232; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
17233; GFX908:       ; %bb.0:
17234; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17235; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
17236; GFX908-NEXT:    v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
17237; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
17238; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
17239; GFX908-NEXT:    flat_load_dword v0, v[0:1]
17240; GFX908-NEXT:    s_mov_b64 s[6:7], 0
17241; GFX908-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
17242; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
17243; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
17244; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
17245; GFX908-NEXT:  .LBB70_1: ; %atomicrmw.start
17246; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
17247; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17248; GFX908-NEXT:    v_mov_b32_e32 v6, v0
17249; GFX908-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
17250; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
17251; GFX908-NEXT:    v_add_f32_e32 v0, v0, v1
17252; GFX908-NEXT:    v_add_f32_e32 v5, v5, v2
17253; GFX908-NEXT:    v_bfe_u32 v7, v0, 16, 1
17254; GFX908-NEXT:    v_bfe_u32 v9, v5, 16, 1
17255; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v0
17256; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v5
17257; GFX908-NEXT:    v_add3_u32 v7, v7, v0, s8
17258; GFX908-NEXT:    v_add3_u32 v9, v9, v5, s8
17259; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
17260; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
17261; GFX908-NEXT:    v_cndmask_b32_e64 v0, v7, v8, s[4:5]
17262; GFX908-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
17263; GFX908-NEXT:    v_perm_b32 v5, v5, v0, s9
17264; GFX908-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
17265; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17266; GFX908-NEXT:    buffer_wbinvl1
17267; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
17268; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
17269; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
17270; GFX908-NEXT:    s_cbranch_execnz .LBB70_1
17271; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
17272; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
17273; GFX908-NEXT:    s_setpc_b64 s[30:31]
17274;
17275; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
17276; GFX8:       ; %bb.0:
17277; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17278; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
17279; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, -1, v1, vcc
17280; GFX8-NEXT:    flat_load_dword v0, v[3:4]
17281; GFX8-NEXT:    s_mov_b64 s[6:7], 0
17282; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
17283; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
17284; GFX8-NEXT:  .LBB70_1: ; %atomicrmw.start
17285; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
17286; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17287; GFX8-NEXT:    v_mov_b32_e32 v6, v0
17288; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
17289; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
17290; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
17291; GFX8-NEXT:    v_add_f32_e32 v5, v5, v2
17292; GFX8-NEXT:    v_bfe_u32 v7, v0, 16, 1
17293; GFX8-NEXT:    v_bfe_u32 v9, v5, 16, 1
17294; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v0
17295; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v5
17296; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
17297; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
17298; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v5
17299; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
17300; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v0
17301; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
17302; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
17303; GFX8-NEXT:    v_cndmask_b32_e64 v0, v7, v8, s[4:5]
17304; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
17305; GFX8-NEXT:    v_alignbit_b32 v5, v5, v0, 16
17306; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
17307; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17308; GFX8-NEXT:    buffer_wbinvl1
17309; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
17310; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
17311; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
17312; GFX8-NEXT:    s_cbranch_execnz .LBB70_1
17313; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
17314; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
17315; GFX8-NEXT:    s_setpc_b64 s[30:31]
17316;
17317; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
17318; GFX7:       ; %bb.0:
17319; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17320; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
17321; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, -1, v1, vcc
17322; GFX7-NEXT:    flat_load_dword v0, v[4:5]
17323; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v2
17324; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
17325; GFX7-NEXT:    s_mov_b64 s[4:5], 0
17326; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
17327; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
17328; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17329; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
17330; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
17331; GFX7-NEXT:  .LBB70_1: ; %atomicrmw.start
17332; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
17333; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
17334; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
17335; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
17336; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
17337; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
17338; GFX7-NEXT:    v_add_f32_e32 v7, v7, v3
17339; GFX7-NEXT:    v_add_f32_e32 v6, v6, v2
17340; GFX7-NEXT:    v_alignbit_b32 v1, v1, v0, 16
17341; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v7
17342; GFX7-NEXT:    v_alignbit_b32 v0, v0, v6, 16
17343; GFX7-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
17344; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17345; GFX7-NEXT:    buffer_wbinvl1
17346; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
17347; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
17348; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
17349; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
17350; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
17351; GFX7-NEXT:    s_cbranch_execnz .LBB70_1
17352; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
17353; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
17354; GFX7-NEXT:    s_setpc_b64 s[30:31]
17355  %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512
17356  %result = atomicrmw fadd ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
17357  ret <2 x bfloat> %result
17358}
17359
; Agent-scope, seq_cst `atomicrmw fadd` of a <2 x bfloat> value through %ptr
; with no address offset. The result is unused (no-return form) and the
; atomic carries !amdgpu.no.fine.grained.memory metadata.
;
; Per the check lines below, the gfx1200 and gfx940 runs expect a single
; flat_atomic_pk_add_bf16 instruction. Every other -mcpu in the RUN lines
; expects a compare-and-swap loop (.LBB71_1): load the current dword, widen
; each 16-bit half to 32 bits, v_add_f32 per half, repack, and retry
; flat_atomic_cmpswap until the value it returns equals the expected one.
;
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
17360define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 {
17361; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
17362; GFX12:       ; %bb.0:
17363; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
17364; GFX12-NEXT:    s_wait_expcnt 0x0
17365; GFX12-NEXT:    s_wait_samplecnt 0x0
17366; GFX12-NEXT:    s_wait_bvhcnt 0x0
17367; GFX12-NEXT:    s_wait_kmcnt 0x0
17368; GFX12-NEXT:    s_wait_storecnt 0x0
17369; GFX12-NEXT:    flat_atomic_pk_add_bf16 v[0:1], v2 scope:SCOPE_DEV
17370; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
17371; GFX12-NEXT:    global_inv scope:SCOPE_DEV
17372; GFX12-NEXT:    s_setpc_b64 s[30:31]
17373;
17374; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
17375; GFX940:       ; %bb.0:
17376; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17377; GFX940-NEXT:    buffer_wbl2 sc1
17378; GFX940-NEXT:    flat_atomic_pk_add_bf16 v[0:1], v2
17379; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17380; GFX940-NEXT:    buffer_inv sc1
17381; GFX940-NEXT:    s_setpc_b64 s[30:31]
17382;
17383; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
17384; GFX11:       ; %bb.0:
17385; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17386; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
17387; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
17388; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
17389; GFX11-NEXT:    s_mov_b32 s1, 0
17390; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
17391; GFX11-NEXT:    .p2align 6
17392; GFX11-NEXT:  .LBB71_1: ; %atomicrmw.start
17393; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
17394; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17395; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
17396; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
17397; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
17398; GFX11-NEXT:    v_add_f32_e32 v2, v2, v4
17399; GFX11-NEXT:    v_add_f32_e32 v6, v6, v5
17400; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
17401; GFX11-NEXT:    v_bfe_u32 v7, v2, 16, 1
17402; GFX11-NEXT:    v_bfe_u32 v8, v6, 16, 1
17403; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v2
17404; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v6
17405; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
17406; GFX11-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
17407; GFX11-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
17408; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
17409; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
17410; GFX11-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
17411; GFX11-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s0
17412; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
17413; GFX11-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
17414; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
17415; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
17416; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17417; GFX11-NEXT:    buffer_gl1_inv
17418; GFX11-NEXT:    buffer_gl0_inv
17419; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
17420; GFX11-NEXT:    v_mov_b32_e32 v3, v2
17421; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
17422; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
17423; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
17424; GFX11-NEXT:    s_cbranch_execnz .LBB71_1
17425; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
17426; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
17427; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
17428; GFX11-NEXT:    s_setpc_b64 s[30:31]
17429;
17430; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
17431; GFX10:       ; %bb.0:
17432; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17433; GFX10-NEXT:    flat_load_dword v3, v[0:1]
17434; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
17435; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
17436; GFX10-NEXT:    s_mov_b32 s5, 0
17437; GFX10-NEXT:  .LBB71_1: ; %atomicrmw.start
17438; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
17439; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17440; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
17441; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
17442; GFX10-NEXT:    v_add_f32_e32 v2, v2, v4
17443; GFX10-NEXT:    v_add_f32_e32 v6, v6, v5
17444; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
17445; GFX10-NEXT:    v_bfe_u32 v8, v6, 16, 1
17446; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v2
17447; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v6
17448; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
17449; GFX10-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
17450; GFX10-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
17451; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v2, v2
17452; GFX10-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
17453; GFX10-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s4
17454; GFX10-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
17455; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
17456; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17457; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17458; GFX10-NEXT:    buffer_gl1_inv
17459; GFX10-NEXT:    buffer_gl0_inv
17460; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
17461; GFX10-NEXT:    v_mov_b32_e32 v3, v2
17462; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
17463; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
17464; GFX10-NEXT:    s_cbranch_execnz .LBB71_1
17465; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
17466; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
17467; GFX10-NEXT:    s_setpc_b64 s[30:31]
17468;
17469; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
17470; GFX90A:       ; %bb.0:
17471; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17472; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
17473; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
17474; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
17475; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
17476; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
17477; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
17478; GFX90A-NEXT:  .LBB71_1: ; %atomicrmw.start
17479; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
17480; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17481; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
17482; GFX90A-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
17483; GFX90A-NEXT:    v_add_f32_e32 v2, v2, v4
17484; GFX90A-NEXT:    v_add_f32_e32 v6, v6, v5
17485; GFX90A-NEXT:    v_bfe_u32 v7, v2, 16, 1
17486; GFX90A-NEXT:    v_bfe_u32 v9, v6, 16, 1
17487; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v2
17488; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v6
17489; GFX90A-NEXT:    v_add3_u32 v7, v7, v2, s8
17490; GFX90A-NEXT:    v_add3_u32 v9, v9, v6, s8
17491; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
17492; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
17493; GFX90A-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
17494; GFX90A-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
17495; GFX90A-NEXT:    v_perm_b32 v2, v6, v2, s9
17496; GFX90A-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17497; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17498; GFX90A-NEXT:    buffer_wbinvl1
17499; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
17500; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
17501; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
17502; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
17503; GFX90A-NEXT:    s_cbranch_execnz .LBB71_1
17504; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
17505; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
17506; GFX90A-NEXT:    s_setpc_b64 s[30:31]
17507;
17508; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
17509; GFX908:       ; %bb.0:
17510; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17511; GFX908-NEXT:    flat_load_dword v3, v[0:1]
17512; GFX908-NEXT:    s_mov_b64 s[6:7], 0
17513; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
17514; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
17515; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
17516; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
17517; GFX908-NEXT:  .LBB71_1: ; %atomicrmw.start
17518; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
17519; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17520; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
17521; GFX908-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
17522; GFX908-NEXT:    v_add_f32_e32 v2, v2, v4
17523; GFX908-NEXT:    v_add_f32_e32 v6, v6, v5
17524; GFX908-NEXT:    v_bfe_u32 v7, v2, 16, 1
17525; GFX908-NEXT:    v_bfe_u32 v9, v6, 16, 1
17526; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v2
17527; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v6
17528; GFX908-NEXT:    v_add3_u32 v7, v7, v2, s8
17529; GFX908-NEXT:    v_add3_u32 v9, v9, v6, s8
17530; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
17531; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
17532; GFX908-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
17533; GFX908-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
17534; GFX908-NEXT:    v_perm_b32 v2, v6, v2, s9
17535; GFX908-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17536; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17537; GFX908-NEXT:    buffer_wbinvl1
17538; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
17539; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
17540; GFX908-NEXT:    v_mov_b32_e32 v3, v2
17541; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
17542; GFX908-NEXT:    s_cbranch_execnz .LBB71_1
17543; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
17544; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
17545; GFX908-NEXT:    s_setpc_b64 s[30:31]
17546;
17547; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
17548; GFX8:       ; %bb.0:
17549; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17550; GFX8-NEXT:    flat_load_dword v3, v[0:1]
17551; GFX8-NEXT:    s_mov_b64 s[6:7], 0
17552; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
17553; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
17554; GFX8-NEXT:  .LBB71_1: ; %atomicrmw.start
17555; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
17556; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17557; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
17558; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
17559; GFX8-NEXT:    v_add_f32_e32 v2, v2, v4
17560; GFX8-NEXT:    v_add_f32_e32 v6, v6, v5
17561; GFX8-NEXT:    v_bfe_u32 v7, v2, 16, 1
17562; GFX8-NEXT:    v_bfe_u32 v9, v6, 16, 1
17563; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v2
17564; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v6
17565; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
17566; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
17567; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v6
17568; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
17569; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v2
17570; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
17571; GFX8-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
17572; GFX8-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
17573; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
17574; GFX8-NEXT:    v_alignbit_b32 v2, v6, v2, 16
17575; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17576; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17577; GFX8-NEXT:    buffer_wbinvl1
17578; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
17579; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
17580; GFX8-NEXT:    v_mov_b32_e32 v3, v2
17581; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
17582; GFX8-NEXT:    s_cbranch_execnz .LBB71_1
17583; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
17584; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
17585; GFX8-NEXT:    s_setpc_b64 s[30:31]
17586;
17587; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
17588; GFX7:       ; %bb.0:
17589; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17590; GFX7-NEXT:    flat_load_dword v5, v[0:1]
17591; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
17592; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
17593; GFX7-NEXT:    s_mov_b64 s[4:5], 0
17594; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
17595; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
17596; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17597; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
17598; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
17599; GFX7-NEXT:  .LBB71_1: ; %atomicrmw.start
17600; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
17601; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
17602; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
17603; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v4
17604; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v5
17605; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
17606; GFX7-NEXT:    v_add_f32_e32 v7, v7, v3
17607; GFX7-NEXT:    v_add_f32_e32 v6, v6, v2
17608; GFX7-NEXT:    v_alignbit_b32 v5, v4, v5, 16
17609; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v7
17610; GFX7-NEXT:    v_alignbit_b32 v4, v4, v6, 16
17611; GFX7-NEXT:    flat_atomic_cmpswap v6, v[0:1], v[4:5] glc
17612; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17613; GFX7-NEXT:    buffer_wbinvl1
17614; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
17615; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v6
17616; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
17617; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
17618; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
17619; GFX7-NEXT:    s_cbranch_execnz .LBB71_1
17620; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
17621; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
17622; GFX7-NEXT:    s_setpc_b64 s[30:31]
; The atomicrmw result is bound to %unused and never read; the function
; returns void.
17623  %unused = atomicrmw fadd ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
17624  ret void
17625}
17626
17627define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 {
17628; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
17629; GFX12:       ; %bb.0:
17630; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
17631; GFX12-NEXT:    s_wait_expcnt 0x0
17632; GFX12-NEXT:    s_wait_samplecnt 0x0
17633; GFX12-NEXT:    s_wait_bvhcnt 0x0
17634; GFX12-NEXT:    s_wait_kmcnt 0x0
17635; GFX12-NEXT:    s_wait_storecnt 0x0
17636; GFX12-NEXT:    flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044 scope:SCOPE_DEV
17637; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
17638; GFX12-NEXT:    global_inv scope:SCOPE_DEV
17639; GFX12-NEXT:    s_setpc_b64 s[30:31]
17640;
17641; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
17642; GFX940:       ; %bb.0:
17643; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17644; GFX940-NEXT:    buffer_wbl2 sc1
17645; GFX940-NEXT:    flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044
17646; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17647; GFX940-NEXT:    buffer_inv sc1
17648; GFX940-NEXT:    s_setpc_b64 s[30:31]
17649;
17650; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
17651; GFX11:       ; %bb.0:
17652; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17653; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
17654; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
17655; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
17656; GFX11-NEXT:    s_mov_b32 s1, 0
17657; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
17658; GFX11-NEXT:    .p2align 6
17659; GFX11-NEXT:  .LBB72_1: ; %atomicrmw.start
17660; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
17661; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17662; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
17663; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
17664; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
17665; GFX11-NEXT:    v_add_f32_e32 v2, v2, v4
17666; GFX11-NEXT:    v_add_f32_e32 v6, v6, v5
17667; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
17668; GFX11-NEXT:    v_bfe_u32 v7, v2, 16, 1
17669; GFX11-NEXT:    v_bfe_u32 v8, v6, 16, 1
17670; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v2
17671; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v6
17672; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
17673; GFX11-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
17674; GFX11-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
17675; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
17676; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
17677; GFX11-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
17678; GFX11-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s0
17679; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
17680; GFX11-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
17681; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
17682; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
17683; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17684; GFX11-NEXT:    buffer_gl1_inv
17685; GFX11-NEXT:    buffer_gl0_inv
17686; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
17687; GFX11-NEXT:    v_mov_b32_e32 v3, v2
17688; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
17689; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
17690; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
17691; GFX11-NEXT:    s_cbranch_execnz .LBB72_1
17692; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
17693; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
17694; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
17695; GFX11-NEXT:    s_setpc_b64 s[30:31]
17696;
17697; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
17698; GFX10:       ; %bb.0:
17699; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17700; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
17701; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
17702; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
17703; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
17704; GFX10-NEXT:    s_mov_b32 s5, 0
17705; GFX10-NEXT:    flat_load_dword v3, v[0:1]
17706; GFX10-NEXT:  .LBB72_1: ; %atomicrmw.start
17707; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
17708; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17709; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
17710; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
17711; GFX10-NEXT:    v_add_f32_e32 v2, v2, v4
17712; GFX10-NEXT:    v_add_f32_e32 v6, v6, v5
17713; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
17714; GFX10-NEXT:    v_bfe_u32 v8, v6, 16, 1
17715; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v2
17716; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v6
17717; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
17718; GFX10-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
17719; GFX10-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
17720; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v2, v2
17721; GFX10-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
17722; GFX10-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s4
17723; GFX10-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
17724; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
17725; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17726; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17727; GFX10-NEXT:    buffer_gl1_inv
17728; GFX10-NEXT:    buffer_gl0_inv
17729; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
17730; GFX10-NEXT:    v_mov_b32_e32 v3, v2
17731; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
17732; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
17733; GFX10-NEXT:    s_cbranch_execnz .LBB72_1
17734; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
17735; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
17736; GFX10-NEXT:    s_setpc_b64 s[30:31]
17737;
17738; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
17739; GFX90A:       ; %bb.0:
17740; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17741; GFX90A-NEXT:    flat_load_dword v3, v[0:1] offset:2044
17742; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
17743; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
17744; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
17745; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
17746; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
17747; GFX90A-NEXT:  .LBB72_1: ; %atomicrmw.start
17748; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
17749; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17750; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
17751; GFX90A-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
17752; GFX90A-NEXT:    v_add_f32_e32 v2, v2, v4
17753; GFX90A-NEXT:    v_add_f32_e32 v6, v6, v5
17754; GFX90A-NEXT:    v_bfe_u32 v7, v2, 16, 1
17755; GFX90A-NEXT:    v_bfe_u32 v9, v6, 16, 1
17756; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v2
17757; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v6
17758; GFX90A-NEXT:    v_add3_u32 v7, v7, v2, s8
17759; GFX90A-NEXT:    v_add3_u32 v9, v9, v6, s8
17760; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
17761; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
17762; GFX90A-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
17763; GFX90A-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
17764; GFX90A-NEXT:    v_perm_b32 v2, v6, v2, s9
17765; GFX90A-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
17766; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17767; GFX90A-NEXT:    buffer_wbinvl1
17768; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
17769; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
17770; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
17771; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
17772; GFX90A-NEXT:    s_cbranch_execnz .LBB72_1
17773; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
17774; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
17775; GFX90A-NEXT:    s_setpc_b64 s[30:31]
17776;
17777; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
17778; GFX908:       ; %bb.0:
17779; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17780; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2044
17781; GFX908-NEXT:    s_mov_b64 s[6:7], 0
17782; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
17783; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
17784; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
17785; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
17786; GFX908-NEXT:  .LBB72_1: ; %atomicrmw.start
17787; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
17788; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17789; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
17790; GFX908-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
17791; GFX908-NEXT:    v_add_f32_e32 v2, v2, v4
17792; GFX908-NEXT:    v_add_f32_e32 v6, v6, v5
17793; GFX908-NEXT:    v_bfe_u32 v7, v2, 16, 1
17794; GFX908-NEXT:    v_bfe_u32 v9, v6, 16, 1
17795; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v2
17796; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v6
17797; GFX908-NEXT:    v_add3_u32 v7, v7, v2, s8
17798; GFX908-NEXT:    v_add3_u32 v9, v9, v6, s8
17799; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
17800; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
17801; GFX908-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
17802; GFX908-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
17803; GFX908-NEXT:    v_perm_b32 v2, v6, v2, s9
17804; GFX908-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
17805; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17806; GFX908-NEXT:    buffer_wbinvl1
17807; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
17808; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
17809; GFX908-NEXT:    v_mov_b32_e32 v3, v2
17810; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
17811; GFX908-NEXT:    s_cbranch_execnz .LBB72_1
17812; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
17813; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
17814; GFX908-NEXT:    s_setpc_b64 s[30:31]
17815;
17816; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
17817; GFX8:       ; %bb.0:
17818; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17819; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
17820; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
17821; GFX8-NEXT:    flat_load_dword v3, v[0:1]
17822; GFX8-NEXT:    s_mov_b64 s[6:7], 0
17823; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
17824; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
17825; GFX8-NEXT:  .LBB72_1: ; %atomicrmw.start
17826; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
17827; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17828; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
17829; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
17830; GFX8-NEXT:    v_add_f32_e32 v2, v2, v4
17831; GFX8-NEXT:    v_add_f32_e32 v6, v6, v5
17832; GFX8-NEXT:    v_bfe_u32 v7, v2, 16, 1
17833; GFX8-NEXT:    v_bfe_u32 v9, v6, 16, 1
17834; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v2
17835; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v6
17836; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
17837; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
17838; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v6
17839; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
17840; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v2
17841; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
17842; GFX8-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
17843; GFX8-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
17844; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
17845; GFX8-NEXT:    v_alignbit_b32 v2, v6, v2, 16
17846; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17847; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17848; GFX8-NEXT:    buffer_wbinvl1
17849; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
17850; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
17851; GFX8-NEXT:    v_mov_b32_e32 v3, v2
17852; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
17853; GFX8-NEXT:    s_cbranch_execnz .LBB72_1
17854; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
17855; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
17856; GFX8-NEXT:    s_setpc_b64 s[30:31]
17857;
17858; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
17859; GFX7:       ; %bb.0:
17860; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17861; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
17862; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
17863; GFX7-NEXT:    flat_load_dword v5, v[0:1]
17864; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
17865; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
17866; GFX7-NEXT:    s_mov_b64 s[4:5], 0
17867; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
17868; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
17869; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17870; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
17871; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
17872; GFX7-NEXT:  .LBB72_1: ; %atomicrmw.start
17873; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
17874; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
17875; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
17876; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v4
17877; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v5
17878; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
17879; GFX7-NEXT:    v_add_f32_e32 v7, v7, v3
17880; GFX7-NEXT:    v_add_f32_e32 v6, v6, v2
17881; GFX7-NEXT:    v_alignbit_b32 v5, v4, v5, 16
17882; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v7
17883; GFX7-NEXT:    v_alignbit_b32 v4, v4, v6, 16
17884; GFX7-NEXT:    flat_atomic_cmpswap v6, v[0:1], v[4:5] glc
17885; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17886; GFX7-NEXT:    buffer_wbinvl1
17887; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
17888; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v6
17889; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
17890; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
17891; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
17892; GFX7-NEXT:    s_cbranch_execnz .LBB72_1
17893; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
17894; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
17895; GFX7-NEXT:    s_setpc_b64 s[30:31]
17896  %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511
17897  %unused = atomicrmw fadd ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
17898  ret void
17899}
17900
; Agent-scope, seq_cst atomicrmw fadd of a <2 x bfloat> through a flat pointer
; whose result is unused. The address is %ptr - 512 elements; the check lines
; expect that displacement as -2048 bytes (offset:-2048, or the constant
; 0xfffff800 added to the low half with -1 added to the high half).
;
; Expected lowering, per the check lines below:
;  * gfx1200 emits one flat_atomic_pk_add_bf16 with the offset folded in.
;  * gfx940 emits flat_atomic_pk_add_bf16 after computing the address with a
;    64-bit add; the atomic itself carries no offset.
;  * gfx1100, gfx1010, gfx90a, gfx908 and tonga emit a flat_atomic_cmpswap
;    retry loop that widens each half to f32, adds, and narrows back to bf16
;    (bit 16 plus a 0x7fff bias, with 0x400000 OR-ed in when v_cmp_u_f32
;    reports an unordered value).
;  * hawaii also emits a cmpswap retry loop, but keeps the two halves in
;    separate registers and repacks them with v_lshrrev_b32 / v_alignbit_b32.
;
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
17901define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 {
17902; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
17903; GFX12:       ; %bb.0:
17904; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
17905; GFX12-NEXT:    s_wait_expcnt 0x0
17906; GFX12-NEXT:    s_wait_samplecnt 0x0
17907; GFX12-NEXT:    s_wait_bvhcnt 0x0
17908; GFX12-NEXT:    s_wait_kmcnt 0x0
17909; GFX12-NEXT:    s_wait_storecnt 0x0
17910; GFX12-NEXT:    flat_atomic_pk_add_bf16 v[0:1], v2 offset:-2048 scope:SCOPE_DEV
17911; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
17912; GFX12-NEXT:    global_inv scope:SCOPE_DEV
17913; GFX12-NEXT:    s_setpc_b64 s[30:31]
17914;
17915; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
17916; GFX940:       ; %bb.0:
17917; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17918; GFX940-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
17919; GFX940-NEXT:    s_nop 1
17920; GFX940-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
17921; GFX940-NEXT:    buffer_wbl2 sc1
17922; GFX940-NEXT:    flat_atomic_pk_add_bf16 v[0:1], v2
17923; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17924; GFX940-NEXT:    buffer_inv sc1
17925; GFX940-NEXT:    s_setpc_b64 s[30:31]
17926;
17927; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
17928; GFX11:       ; %bb.0:
17929; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17930; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
17931; GFX11-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
17932; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
17933; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
17934; GFX11-NEXT:    flat_load_b32 v3, v[3:4]
17935; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
17936; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
17937; GFX11-NEXT:    s_mov_b32 s1, 0
17938; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
17939; GFX11-NEXT:    .p2align 6
17940; GFX11-NEXT:  .LBB73_1: ; %atomicrmw.start
17941; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
17942; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17943; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
17944; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
17945; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
17946; GFX11-NEXT:    v_add_f32_e32 v2, v2, v4
17947; GFX11-NEXT:    v_add_f32_e32 v6, v6, v5
17948; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
17949; GFX11-NEXT:    v_bfe_u32 v7, v2, 16, 1
17950; GFX11-NEXT:    v_bfe_u32 v8, v6, 16, 1
17951; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v2
17952; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v6
17953; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
17954; GFX11-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
17955; GFX11-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
17956; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
17957; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
17958; GFX11-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
17959; GFX11-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s0
17960; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
17961; GFX11-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
17962; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
17963; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
17964; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17965; GFX11-NEXT:    buffer_gl1_inv
17966; GFX11-NEXT:    buffer_gl0_inv
17967; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
17968; GFX11-NEXT:    v_mov_b32_e32 v3, v2
17969; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
17970; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
17971; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
17972; GFX11-NEXT:    s_cbranch_execnz .LBB73_1
17973; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
17974; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
17975; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
17976; GFX11-NEXT:    s_setpc_b64 s[30:31]
17977;
17978; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
17979; GFX10:       ; %bb.0:
17980; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17981; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
17982; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
17983; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
17984; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
17985; GFX10-NEXT:    s_mov_b32 s5, 0
17986; GFX10-NEXT:    flat_load_dword v3, v[0:1]
17987; GFX10-NEXT:  .LBB73_1: ; %atomicrmw.start
17988; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
17989; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
17990; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
17991; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
17992; GFX10-NEXT:    v_add_f32_e32 v2, v2, v4
17993; GFX10-NEXT:    v_add_f32_e32 v6, v6, v5
17994; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
17995; GFX10-NEXT:    v_bfe_u32 v8, v6, 16, 1
17996; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v2
17997; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v6
17998; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
17999; GFX10-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
18000; GFX10-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
18001; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v2, v2
18002; GFX10-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
18003; GFX10-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s4
18004; GFX10-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
18005; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
18006; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18007; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18008; GFX10-NEXT:    buffer_gl1_inv
18009; GFX10-NEXT:    buffer_gl0_inv
18010; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
18011; GFX10-NEXT:    v_mov_b32_e32 v3, v2
18012; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
18013; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
18014; GFX10-NEXT:    s_cbranch_execnz .LBB73_1
18015; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
18016; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
18017; GFX10-NEXT:    s_setpc_b64 s[30:31]
18018;
18019; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
18020; GFX90A:       ; %bb.0:
18021; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18022; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
18023; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
18024; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
18025; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
18026; GFX90A-NEXT:    flat_load_dword v1, v[0:1]
18027; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
18028; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
18029; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
18030; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
18031; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
18032; GFX90A-NEXT:  .LBB73_1: ; %atomicrmw.start
18033; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
18034; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18035; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
18036; GFX90A-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
18037; GFX90A-NEXT:    v_add_f32_e32 v0, v0, v3
18038; GFX90A-NEXT:    v_add_f32_e32 v6, v6, v2
18039; GFX90A-NEXT:    v_bfe_u32 v7, v0, 16, 1
18040; GFX90A-NEXT:    v_bfe_u32 v9, v6, 16, 1
18041; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v0
18042; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v6
18043; GFX90A-NEXT:    v_add3_u32 v7, v7, v0, s8
18044; GFX90A-NEXT:    v_add3_u32 v9, v9, v6, s8
18045; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
18046; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
18047; GFX90A-NEXT:    v_cndmask_b32_e64 v0, v7, v8, s[4:5]
18048; GFX90A-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
18049; GFX90A-NEXT:    v_perm_b32 v0, v6, v0, s9
18050; GFX90A-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
18051; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18052; GFX90A-NEXT:    buffer_wbinvl1
18053; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
18054; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
18055; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
18056; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
18057; GFX90A-NEXT:    s_cbranch_execnz .LBB73_1
18058; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
18059; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
18060; GFX90A-NEXT:    s_setpc_b64 s[30:31]
18061;
18062; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
18063; GFX908:       ; %bb.0:
18064; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18065; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
18066; GFX908-NEXT:    v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
18067; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
18068; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
18069; GFX908-NEXT:    flat_load_dword v1, v[0:1]
18070; GFX908-NEXT:    s_mov_b64 s[6:7], 0
18071; GFX908-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
18072; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
18073; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
18074; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
18075; GFX908-NEXT:  .LBB73_1: ; %atomicrmw.start
18076; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
18077; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18078; GFX908-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
18079; GFX908-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
18080; GFX908-NEXT:    v_add_f32_e32 v0, v0, v5
18081; GFX908-NEXT:    v_add_f32_e32 v6, v6, v2
18082; GFX908-NEXT:    v_bfe_u32 v7, v0, 16, 1
18083; GFX908-NEXT:    v_bfe_u32 v9, v6, 16, 1
18084; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v0
18085; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v6
18086; GFX908-NEXT:    v_add3_u32 v7, v7, v0, s8
18087; GFX908-NEXT:    v_add3_u32 v9, v9, v6, s8
18088; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
18089; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
18090; GFX908-NEXT:    v_cndmask_b32_e64 v0, v7, v8, s[4:5]
18091; GFX908-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
18092; GFX908-NEXT:    v_perm_b32 v0, v6, v0, s9
18093; GFX908-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
18094; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18095; GFX908-NEXT:    buffer_wbinvl1
18096; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
18097; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
18098; GFX908-NEXT:    v_mov_b32_e32 v1, v0
18099; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
18100; GFX908-NEXT:    s_cbranch_execnz .LBB73_1
18101; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
18102; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
18103; GFX908-NEXT:    s_setpc_b64 s[30:31]
18104;
18105; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
18106; GFX8:       ; %bb.0:
18107; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18108; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
18109; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
18110; GFX8-NEXT:    flat_load_dword v3, v[0:1]
18111; GFX8-NEXT:    s_mov_b64 s[6:7], 0
18112; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
18113; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
18114; GFX8-NEXT:  .LBB73_1: ; %atomicrmw.start
18115; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
18116; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18117; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
18118; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
18119; GFX8-NEXT:    v_add_f32_e32 v2, v2, v4
18120; GFX8-NEXT:    v_add_f32_e32 v6, v6, v5
18121; GFX8-NEXT:    v_bfe_u32 v7, v2, 16, 1
18122; GFX8-NEXT:    v_bfe_u32 v9, v6, 16, 1
18123; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v2
18124; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v6
18125; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
18126; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
18127; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v6
18128; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
18129; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v2
18130; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
18131; GFX8-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
18132; GFX8-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
18133; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
18134; GFX8-NEXT:    v_alignbit_b32 v2, v6, v2, 16
18135; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18136; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18137; GFX8-NEXT:    buffer_wbinvl1
18138; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
18139; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
18140; GFX8-NEXT:    v_mov_b32_e32 v3, v2
18141; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
18142; GFX8-NEXT:    s_cbranch_execnz .LBB73_1
18143; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
18144; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
18145; GFX8-NEXT:    s_setpc_b64 s[30:31]
18146;
18147; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
18148; GFX7:       ; %bb.0:
18149; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18150; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0xfffff800, v0
18151; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
18152; GFX7-NEXT:    flat_load_dword v5, v[0:1]
18153; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
18154; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
18155; GFX7-NEXT:    s_mov_b64 s[4:5], 0
18156; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
18157; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
18158; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18159; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
18160; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
18161; GFX7-NEXT:  .LBB73_1: ; %atomicrmw.start
18162; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
18163; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
18164; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
18165; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v4
18166; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v5
18167; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
18168; GFX7-NEXT:    v_add_f32_e32 v7, v7, v3
18169; GFX7-NEXT:    v_add_f32_e32 v6, v6, v2
18170; GFX7-NEXT:    v_alignbit_b32 v5, v4, v5, 16
18171; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v7
18172; GFX7-NEXT:    v_alignbit_b32 v4, v4, v6, 16
18173; GFX7-NEXT:    flat_atomic_cmpswap v6, v[0:1], v[4:5] glc
18174; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18175; GFX7-NEXT:    buffer_wbinvl1
18176; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
18177; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v6
18178; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
18179; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
18180; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
18181; GFX7-NEXT:    s_cbranch_execnz .LBB73_1
18182; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
18183; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
18184; GFX7-NEXT:    s_setpc_b64 s[30:31]
18185  %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512
18186  %unused = atomicrmw fadd ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
18187  ret void
18188}
18189
; seq_cst atomicrmw fadd of a <2 x bfloat> through a flat pointer, returning
; the atomicrmw result (%result). No syncscope is given on the instruction,
; and the gfx1200 checks use scope:SCOPE_SYS. The address is %ptr + 511
; elements; the check lines expect that displacement as 2044 bytes
; (offset:2044, or an explicit add of 0x7fc to the pointer).
;
; Expected lowering, per the check lines below:
;  * gfx1200 and gfx940 emit a returning flat_atomic_pk_add_bf16 with
;    offset:2044, preceded by global_wb / buffer_wbl2 and followed by
;    global_inv / buffer_inv.
;  * gfx1100, gfx90a and gfx908 emit a flat_atomic_cmpswap retry loop with
;    offset:2044 folded into both the initial load and the cmpswap; gfx90a
;    additionally emits buffer_wbl2 before and buffer_invl2 after the cmpswap.
;  * gfx1010, tonga and hawaii first add 0x7fc to the pointer, then run the
;    cmpswap retry loop on the resulting address.
;
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
18190define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 {
18191; GFX12-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
18192; GFX12:       ; %bb.0:
18193; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
18194; GFX12-NEXT:    s_wait_expcnt 0x0
18195; GFX12-NEXT:    s_wait_samplecnt 0x0
18196; GFX12-NEXT:    s_wait_bvhcnt 0x0
18197; GFX12-NEXT:    s_wait_kmcnt 0x0
18198; GFX12-NEXT:    global_wb scope:SCOPE_SYS
18199; GFX12-NEXT:    s_wait_storecnt 0x0
18200; GFX12-NEXT:    flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
18201; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
18202; GFX12-NEXT:    global_inv scope:SCOPE_SYS
18203; GFX12-NEXT:    s_setpc_b64 s[30:31]
18204;
18205; GFX940-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
18206; GFX940:       ; %bb.0:
18207; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18208; GFX940-NEXT:    buffer_wbl2 sc0 sc1
18209; GFX940-NEXT:    flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 sc0 sc1
18210; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18211; GFX940-NEXT:    buffer_inv sc0 sc1
18212; GFX940-NEXT:    s_setpc_b64 s[30:31]
18213;
18214; GFX11-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
18215; GFX11:       ; %bb.0:
18216; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18217; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
18218; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
18219; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
18220; GFX11-NEXT:    s_mov_b32 s1, 0
18221; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
18222; GFX11-NEXT:    .p2align 6
18223; GFX11-NEXT:  .LBB74_1: ; %atomicrmw.start
18224; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
18225; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18226; GFX11-NEXT:    v_mov_b32_e32 v6, v3
18227; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
18228; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
18229; GFX11-NEXT:    v_add_f32_e32 v5, v5, v2
18230; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
18231; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
18232; GFX11-NEXT:    v_bfe_u32 v8, v5, 16, 1
18233; GFX11-NEXT:    v_add_f32_e32 v3, v3, v4
18234; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v5
18235; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
18236; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
18237; GFX11-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
18238; GFX11-NEXT:    v_bfe_u32 v7, v3, 16, 1
18239; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v3
18240; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
18241; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
18242; GFX11-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
18243; GFX11-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
18244; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
18245; GFX11-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
18246; GFX11-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
18247; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
18248; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
18249; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18250; GFX11-NEXT:    buffer_gl1_inv
18251; GFX11-NEXT:    buffer_gl0_inv
18252; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
18253; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
18254; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
18255; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
18256; GFX11-NEXT:    s_cbranch_execnz .LBB74_1
18257; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
18258; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
18259; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
18260; GFX11-NEXT:    v_mov_b32_e32 v0, v3
18261; GFX11-NEXT:    s_setpc_b64 s[30:31]
18262;
18263; GFX10-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
18264; GFX10:       ; %bb.0:
18265; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18266; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fc, v0
18267; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
18268; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
18269; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
18270; GFX10-NEXT:    s_mov_b32 s5, 0
18271; GFX10-NEXT:    flat_load_dword v0, v[3:4]
18272; GFX10-NEXT:  .LBB74_1: ; %atomicrmw.start
18273; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
18274; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18275; GFX10-NEXT:    v_mov_b32_e32 v6, v0
18276; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
18277; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
18278; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
18279; GFX10-NEXT:    v_add_f32_e32 v5, v5, v2
18280; GFX10-NEXT:    v_bfe_u32 v7, v0, 16, 1
18281; GFX10-NEXT:    v_bfe_u32 v8, v5, 16, 1
18282; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v0
18283; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v5
18284; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
18285; GFX10-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
18286; GFX10-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
18287; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
18288; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
18289; GFX10-NEXT:    v_cndmask_b32_e64 v0, v7, v9, s4
18290; GFX10-NEXT:    v_perm_b32 v5, v5, v0, 0x7060302
18291; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
18292; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
18293; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18294; GFX10-NEXT:    buffer_gl1_inv
18295; GFX10-NEXT:    buffer_gl0_inv
18296; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
18297; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
18298; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
18299; GFX10-NEXT:    s_cbranch_execnz .LBB74_1
18300; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
18301; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
18302; GFX10-NEXT:    s_setpc_b64 s[30:31]
18303;
18304; GFX90A-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
18305; GFX90A:       ; %bb.0:
18306; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18307; GFX90A-NEXT:    flat_load_dword v3, v[0:1] offset:2044
18308; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
18309; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
18310; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
18311; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
18312; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
18313; GFX90A-NEXT:  .LBB74_1: ; %atomicrmw.start
18314; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
18315; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18316; GFX90A-NEXT:    v_mov_b32_e32 v7, v3
18317; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
18318; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
18319; GFX90A-NEXT:    v_add_f32_e32 v3, v3, v4
18320; GFX90A-NEXT:    v_add_f32_e32 v5, v5, v2
18321; GFX90A-NEXT:    v_bfe_u32 v6, v3, 16, 1
18322; GFX90A-NEXT:    v_bfe_u32 v9, v5, 16, 1
18323; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v3
18324; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v5
18325; GFX90A-NEXT:    v_add3_u32 v6, v6, v3, s8
18326; GFX90A-NEXT:    v_add3_u32 v9, v9, v5, s8
18327; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
18328; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
18329; GFX90A-NEXT:    v_cndmask_b32_e64 v3, v6, v8, s[4:5]
18330; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
18331; GFX90A-NEXT:    v_perm_b32 v6, v5, v3, s9
18332; GFX90A-NEXT:    buffer_wbl2
18333; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc
18334; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18335; GFX90A-NEXT:    buffer_invl2
18336; GFX90A-NEXT:    buffer_wbinvl1
18337; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
18338; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
18339; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
18340; GFX90A-NEXT:    s_cbranch_execnz .LBB74_1
18341; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
18342; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
18343; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
18344; GFX90A-NEXT:    s_setpc_b64 s[30:31]
18345;
18346; GFX908-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
18347; GFX908:       ; %bb.0:
18348; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18349; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2044
18350; GFX908-NEXT:    s_mov_b64 s[6:7], 0
18351; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
18352; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
18353; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
18354; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
18355; GFX908-NEXT:  .LBB74_1: ; %atomicrmw.start
18356; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
18357; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18358; GFX908-NEXT:    v_mov_b32_e32 v6, v3
18359; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
18360; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
18361; GFX908-NEXT:    v_add_f32_e32 v3, v3, v4
18362; GFX908-NEXT:    v_add_f32_e32 v5, v5, v2
18363; GFX908-NEXT:    v_bfe_u32 v7, v3, 16, 1
18364; GFX908-NEXT:    v_bfe_u32 v9, v5, 16, 1
18365; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v3
18366; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v5
18367; GFX908-NEXT:    v_add3_u32 v7, v7, v3, s8
18368; GFX908-NEXT:    v_add3_u32 v9, v9, v5, s8
18369; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
18370; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
18371; GFX908-NEXT:    v_cndmask_b32_e64 v3, v7, v8, s[4:5]
18372; GFX908-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
18373; GFX908-NEXT:    v_perm_b32 v5, v5, v3, s9
18374; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc
18375; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18376; GFX908-NEXT:    buffer_wbinvl1
18377; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
18378; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
18379; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
18380; GFX908-NEXT:    s_cbranch_execnz .LBB74_1
18381; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
18382; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
18383; GFX908-NEXT:    v_mov_b32_e32 v0, v3
18384; GFX908-NEXT:    s_setpc_b64 s[30:31]
18385;
18386; GFX8-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
18387; GFX8:       ; %bb.0:
18388; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18389; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
18390; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
18391; GFX8-NEXT:    flat_load_dword v0, v[3:4]
18392; GFX8-NEXT:    s_mov_b64 s[6:7], 0
18393; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
18394; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
18395; GFX8-NEXT:  .LBB74_1: ; %atomicrmw.start
18396; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
18397; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18398; GFX8-NEXT:    v_mov_b32_e32 v6, v0
18399; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
18400; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
18401; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
18402; GFX8-NEXT:    v_add_f32_e32 v5, v5, v2
18403; GFX8-NEXT:    v_bfe_u32 v7, v0, 16, 1
18404; GFX8-NEXT:    v_bfe_u32 v9, v5, 16, 1
18405; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v0
18406; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v5
18407; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
18408; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
18409; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v5
18410; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
18411; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v0
18412; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
18413; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
18414; GFX8-NEXT:    v_cndmask_b32_e64 v0, v7, v8, s[4:5]
18415; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
18416; GFX8-NEXT:    v_alignbit_b32 v5, v5, v0, 16
18417; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
18418; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18419; GFX8-NEXT:    buffer_wbinvl1
18420; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
18421; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
18422; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
18423; GFX8-NEXT:    s_cbranch_execnz .LBB74_1
18424; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
18425; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
18426; GFX8-NEXT:    s_setpc_b64 s[30:31]
18427;
18428; GFX7-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
18429; GFX7:       ; %bb.0:
18430; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18431; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fc, v0
18432; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
18433; GFX7-NEXT:    flat_load_dword v0, v[4:5]
18434; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v2
18435; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
18436; GFX7-NEXT:    s_mov_b64 s[4:5], 0
18437; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
18438; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
18439; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18440; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
18441; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
18442; GFX7-NEXT:  .LBB74_1: ; %atomicrmw.start
18443; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
18444; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
18445; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
18446; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
18447; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
18448; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
18449; GFX7-NEXT:    v_add_f32_e32 v7, v7, v3
18450; GFX7-NEXT:    v_add_f32_e32 v6, v6, v2
18451; GFX7-NEXT:    v_alignbit_b32 v1, v1, v0, 16
18452; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v7
18453; GFX7-NEXT:    v_alignbit_b32 v0, v0, v6, 16
18454; GFX7-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
18455; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18456; GFX7-NEXT:    buffer_wbinvl1
18457; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
18458; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
18459; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
18460; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
18461; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
18462; GFX7-NEXT:    s_cbranch_execnz .LBB74_1
18463; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
18464; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
18465; GFX7-NEXT:    s_setpc_b64 s[30:31]
18466  %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511
18467  %result = atomicrmw fadd ptr %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0
18468  ret <2 x bfloat> %result
18469}
18470
; Test case: seq_cst `atomicrmw fadd` of <2 x bfloat> through a flat pointer,
; with no syncscope given, the result left unused ("noret"), and the
; !amdgpu.no.fine.grained.memory annotation attached.
; The address is %ptr advanced by 511 elements of <2 x bfloat>; the checks
; below show this as a byte offset of 2044 (0x7fc).
;
; What the recorded checks show per target:
;   - GFX12 and GFX940 emit a single flat_atomic_pk_add_bf16 with the offset
;     carried on the instruction.
;   - GFX11, GFX90A and GFX908 emit a flat_atomic_cmpswap loop with the offset
;     carried on the load and on the cmpswap.
;   - GFX10, GFX8 and GFX7 first add 0x7fc to the pointer with an explicit
;     add / add-with-carry pair, then run a flat_atomic_cmpswap loop with no
;     instruction offset.
;
; Reviewer note: per the header of this file the check lines are generated by
; utils/update_llc_test_checks.py, so regenerate them instead of hand-editing.
18471define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 {
18472; GFX12-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
18473; GFX12:       ; %bb.0:
18474; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
18475; GFX12-NEXT:    s_wait_expcnt 0x0
18476; GFX12-NEXT:    s_wait_samplecnt 0x0
18477; GFX12-NEXT:    s_wait_bvhcnt 0x0
18478; GFX12-NEXT:    s_wait_kmcnt 0x0
18479; GFX12-NEXT:    global_wb scope:SCOPE_SYS
18480; GFX12-NEXT:    s_wait_storecnt 0x0
18481; GFX12-NEXT:    flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044 scope:SCOPE_SYS
18482; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
18483; GFX12-NEXT:    global_inv scope:SCOPE_SYS
18484; GFX12-NEXT:    s_setpc_b64 s[30:31]
18485;
18486; GFX940-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
18487; GFX940:       ; %bb.0:
18488; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18489; GFX940-NEXT:    buffer_wbl2 sc0 sc1
18490; GFX940-NEXT:    flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044 sc1
18491; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18492; GFX940-NEXT:    buffer_inv sc0 sc1
18493; GFX940-NEXT:    s_setpc_b64 s[30:31]
18494;
18495; GFX11-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
18496; GFX11:       ; %bb.0:
18497; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18498; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
18499; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
18500; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
18501; GFX11-NEXT:    s_mov_b32 s1, 0
18502; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
18503; GFX11-NEXT:    .p2align 6
18504; GFX11-NEXT:  .LBB75_1: ; %atomicrmw.start
18505; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
18506; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18507; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
18508; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
18509; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
18510; GFX11-NEXT:    v_add_f32_e32 v2, v2, v4
18511; GFX11-NEXT:    v_add_f32_e32 v6, v6, v5
18512; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
18513; GFX11-NEXT:    v_bfe_u32 v7, v2, 16, 1
18514; GFX11-NEXT:    v_bfe_u32 v8, v6, 16, 1
18515; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v2
18516; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v6
18517; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
18518; GFX11-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
18519; GFX11-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
18520; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
18521; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
18522; GFX11-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
18523; GFX11-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s0
18524; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
18525; GFX11-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
18526; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
18527; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
18528; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18529; GFX11-NEXT:    buffer_gl1_inv
18530; GFX11-NEXT:    buffer_gl0_inv
18531; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
18532; GFX11-NEXT:    v_mov_b32_e32 v3, v2
18533; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
18534; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
18535; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
18536; GFX11-NEXT:    s_cbranch_execnz .LBB75_1
18537; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
18538; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
18539; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
18540; GFX11-NEXT:    s_setpc_b64 s[30:31]
18541;
18542; GFX10-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
18543; GFX10:       ; %bb.0:
18544; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18545; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
18546; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
18547; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
18548; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
18549; GFX10-NEXT:    s_mov_b32 s5, 0
18550; GFX10-NEXT:    flat_load_dword v3, v[0:1]
18551; GFX10-NEXT:  .LBB75_1: ; %atomicrmw.start
18552; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
18553; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18554; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
18555; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
18556; GFX10-NEXT:    v_add_f32_e32 v2, v2, v4
18557; GFX10-NEXT:    v_add_f32_e32 v6, v6, v5
18558; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
18559; GFX10-NEXT:    v_bfe_u32 v8, v6, 16, 1
18560; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v2
18561; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v6
18562; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
18563; GFX10-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
18564; GFX10-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
18565; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v2, v2
18566; GFX10-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
18567; GFX10-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s4
18568; GFX10-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
18569; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
18570; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18571; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18572; GFX10-NEXT:    buffer_gl1_inv
18573; GFX10-NEXT:    buffer_gl0_inv
18574; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
18575; GFX10-NEXT:    v_mov_b32_e32 v3, v2
18576; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
18577; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
18578; GFX10-NEXT:    s_cbranch_execnz .LBB75_1
18579; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
18580; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
18581; GFX10-NEXT:    s_setpc_b64 s[30:31]
18582;
18583; GFX90A-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
18584; GFX90A:       ; %bb.0:
18585; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18586; GFX90A-NEXT:    flat_load_dword v3, v[0:1] offset:2044
18587; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
18588; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
18589; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
18590; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
18591; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
18592; GFX90A-NEXT:  .LBB75_1: ; %atomicrmw.start
18593; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
18594; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18595; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
18596; GFX90A-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
18597; GFX90A-NEXT:    v_add_f32_e32 v2, v2, v4
18598; GFX90A-NEXT:    v_add_f32_e32 v6, v6, v5
18599; GFX90A-NEXT:    v_bfe_u32 v7, v2, 16, 1
18600; GFX90A-NEXT:    v_bfe_u32 v9, v6, 16, 1
18601; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v2
18602; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v6
18603; GFX90A-NEXT:    v_add3_u32 v7, v7, v2, s8
18604; GFX90A-NEXT:    v_add3_u32 v9, v9, v6, s8
18605; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
18606; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
18607; GFX90A-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
18608; GFX90A-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
18609; GFX90A-NEXT:    v_perm_b32 v2, v6, v2, s9
18610; GFX90A-NEXT:    buffer_wbl2
18611; GFX90A-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
18612; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18613; GFX90A-NEXT:    buffer_invl2
18614; GFX90A-NEXT:    buffer_wbinvl1
18615; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
18616; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
18617; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
18618; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
18619; GFX90A-NEXT:    s_cbranch_execnz .LBB75_1
18620; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
18621; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
18622; GFX90A-NEXT:    s_setpc_b64 s[30:31]
18623;
18624; GFX908-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
18625; GFX908:       ; %bb.0:
18626; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18627; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2044
18628; GFX908-NEXT:    s_mov_b64 s[6:7], 0
18629; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
18630; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
18631; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
18632; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
18633; GFX908-NEXT:  .LBB75_1: ; %atomicrmw.start
18634; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
18635; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18636; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
18637; GFX908-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
18638; GFX908-NEXT:    v_add_f32_e32 v2, v2, v4
18639; GFX908-NEXT:    v_add_f32_e32 v6, v6, v5
18640; GFX908-NEXT:    v_bfe_u32 v7, v2, 16, 1
18641; GFX908-NEXT:    v_bfe_u32 v9, v6, 16, 1
18642; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v2
18643; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v6
18644; GFX908-NEXT:    v_add3_u32 v7, v7, v2, s8
18645; GFX908-NEXT:    v_add3_u32 v9, v9, v6, s8
18646; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
18647; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
18648; GFX908-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
18649; GFX908-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
18650; GFX908-NEXT:    v_perm_b32 v2, v6, v2, s9
18651; GFX908-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
18652; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18653; GFX908-NEXT:    buffer_wbinvl1
18654; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
18655; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
18656; GFX908-NEXT:    v_mov_b32_e32 v3, v2
18657; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
18658; GFX908-NEXT:    s_cbranch_execnz .LBB75_1
18659; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
18660; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
18661; GFX908-NEXT:    s_setpc_b64 s[30:31]
18662;
18663; GFX8-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
18664; GFX8:       ; %bb.0:
18665; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18666; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
18667; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
18668; GFX8-NEXT:    flat_load_dword v3, v[0:1]
18669; GFX8-NEXT:    s_mov_b64 s[6:7], 0
18670; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
18671; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
18672; GFX8-NEXT:  .LBB75_1: ; %atomicrmw.start
18673; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
18674; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18675; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
18676; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
18677; GFX8-NEXT:    v_add_f32_e32 v2, v2, v4
18678; GFX8-NEXT:    v_add_f32_e32 v6, v6, v5
18679; GFX8-NEXT:    v_bfe_u32 v7, v2, 16, 1
18680; GFX8-NEXT:    v_bfe_u32 v9, v6, 16, 1
18681; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v2
18682; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v6
18683; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
18684; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
18685; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v6
18686; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
18687; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v2
18688; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
18689; GFX8-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
18690; GFX8-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
18691; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
18692; GFX8-NEXT:    v_alignbit_b32 v2, v6, v2, 16
18693; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18694; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18695; GFX8-NEXT:    buffer_wbinvl1
18696; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
18697; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
18698; GFX8-NEXT:    v_mov_b32_e32 v3, v2
18699; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
18700; GFX8-NEXT:    s_cbranch_execnz .LBB75_1
18701; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
18702; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
18703; GFX8-NEXT:    s_setpc_b64 s[30:31]
18704;
18705; GFX7-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
18706; GFX7:       ; %bb.0:
18707; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18708; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
18709; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
18710; GFX7-NEXT:    flat_load_dword v5, v[0:1]
18711; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
18712; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
18713; GFX7-NEXT:    s_mov_b64 s[4:5], 0
18714; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
18715; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
18716; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18717; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
18718; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
18719; GFX7-NEXT:  .LBB75_1: ; %atomicrmw.start
18720; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
18721; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
18722; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
18723; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v4
18724; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v5
18725; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
18726; GFX7-NEXT:    v_add_f32_e32 v7, v7, v3
18727; GFX7-NEXT:    v_add_f32_e32 v6, v6, v2
18728; GFX7-NEXT:    v_alignbit_b32 v5, v4, v5, 16
18729; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v7
18730; GFX7-NEXT:    v_alignbit_b32 v4, v4, v6, 16
18731; GFX7-NEXT:    flat_atomic_cmpswap v6, v[0:1], v[4:5] glc
18732; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18733; GFX7-NEXT:    buffer_wbinvl1
18734; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
18735; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v6
18736; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
18737; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
18738; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
18739; GFX7-NEXT:    s_cbranch_execnz .LBB75_1
18740; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
18741; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
18742; GFX7-NEXT:    s_setpc_b64 s[30:31]
; IR under test. The atomicrmw result is named %unused and never referenced,
; and the function returns void. The !0 metadata node and attribute group #0
; are defined outside this function (not visible here).
18743  %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511
18744  %unused = atomicrmw fadd ptr %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0
18745  ret void
18746}
18747
; Test case: seq_cst `atomicrmw fadd` of <2 x bfloat> at syncscope("agent"),
; applied directly to the flat pointer argument (no offset), with the
; !amdgpu.no.remote.memory annotation attached. The value produced by the
; atomicrmw is returned from the function.
;
; What the recorded checks show per target:
;   - GFX12 and GFX940 emit a single flat_atomic_pk_add_bf16 whose destination
;     operand is v0.
;   - GFX11, GFX10, GFX90A, GFX908 and GFX8 emit a flat_atomic_cmpswap loop and
;     then move the value produced by the last cmpswap into v0.
;   - GFX7 emits a flat_atomic_cmpswap loop and then moves the two halves it
;     unpacked from the last cmpswap result into v0 and v1.
;
; Reviewer note: per the header of this file the check lines are generated by
; utils/update_llc_test_checks.py, so regenerate them instead of hand-editing.
18748define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory(ptr %ptr, <2 x bfloat> %val) #0 {
18749; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory:
18750; GFX12:       ; %bb.0:
18751; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
18752; GFX12-NEXT:    s_wait_expcnt 0x0
18753; GFX12-NEXT:    s_wait_samplecnt 0x0
18754; GFX12-NEXT:    s_wait_bvhcnt 0x0
18755; GFX12-NEXT:    s_wait_kmcnt 0x0
18756; GFX12-NEXT:    s_wait_storecnt 0x0
18757; GFX12-NEXT:    flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
18758; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
18759; GFX12-NEXT:    global_inv scope:SCOPE_DEV
18760; GFX12-NEXT:    s_setpc_b64 s[30:31]
18761;
18762; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory:
18763; GFX940:       ; %bb.0:
18764; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18765; GFX940-NEXT:    buffer_wbl2 sc1
18766; GFX940-NEXT:    flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0
18767; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18768; GFX940-NEXT:    buffer_inv sc1
18769; GFX940-NEXT:    s_setpc_b64 s[30:31]
18770;
18771; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory:
18772; GFX11:       ; %bb.0:
18773; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18774; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
18775; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
18776; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
18777; GFX11-NEXT:    s_mov_b32 s1, 0
18778; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
18779; GFX11-NEXT:    .p2align 6
18780; GFX11-NEXT:  .LBB76_1: ; %atomicrmw.start
18781; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
18782; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18783; GFX11-NEXT:    v_mov_b32_e32 v6, v3
18784; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
18785; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
18786; GFX11-NEXT:    v_add_f32_e32 v5, v5, v2
18787; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
18788; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
18789; GFX11-NEXT:    v_bfe_u32 v8, v5, 16, 1
18790; GFX11-NEXT:    v_add_f32_e32 v3, v3, v4
18791; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v5
18792; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
18793; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
18794; GFX11-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
18795; GFX11-NEXT:    v_bfe_u32 v7, v3, 16, 1
18796; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v3
18797; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
18798; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
18799; GFX11-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
18800; GFX11-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
18801; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
18802; GFX11-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
18803; GFX11-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
18804; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
18805; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc
18806; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18807; GFX11-NEXT:    buffer_gl1_inv
18808; GFX11-NEXT:    buffer_gl0_inv
18809; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
18810; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
18811; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
18812; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
18813; GFX11-NEXT:    s_cbranch_execnz .LBB76_1
18814; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
18815; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
18816; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
18817; GFX11-NEXT:    v_mov_b32_e32 v0, v3
18818; GFX11-NEXT:    s_setpc_b64 s[30:31]
18819;
18820; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory:
18821; GFX10:       ; %bb.0:
18822; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18823; GFX10-NEXT:    flat_load_dword v3, v[0:1]
18824; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
18825; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
18826; GFX10-NEXT:    s_mov_b32 s5, 0
18827; GFX10-NEXT:  .LBB76_1: ; %atomicrmw.start
18828; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
18829; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18830; GFX10-NEXT:    v_mov_b32_e32 v6, v3
18831; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
18832; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
18833; GFX10-NEXT:    v_add_f32_e32 v3, v3, v4
18834; GFX10-NEXT:    v_add_f32_e32 v5, v5, v2
18835; GFX10-NEXT:    v_bfe_u32 v7, v3, 16, 1
18836; GFX10-NEXT:    v_bfe_u32 v8, v5, 16, 1
18837; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v3
18838; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v5
18839; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
18840; GFX10-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
18841; GFX10-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
18842; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v3, v3
18843; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
18844; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s4
18845; GFX10-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
18846; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
18847; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
18848; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18849; GFX10-NEXT:    buffer_gl1_inv
18850; GFX10-NEXT:    buffer_gl0_inv
18851; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
18852; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
18853; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
18854; GFX10-NEXT:    s_cbranch_execnz .LBB76_1
18855; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
18856; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
18857; GFX10-NEXT:    v_mov_b32_e32 v0, v3
18858; GFX10-NEXT:    s_setpc_b64 s[30:31]
18859;
18860; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory:
18861; GFX90A:       ; %bb.0:
18862; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18863; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
18864; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
18865; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
18866; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
18867; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
18868; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
18869; GFX90A-NEXT:  .LBB76_1: ; %atomicrmw.start
18870; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
18871; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18872; GFX90A-NEXT:    v_mov_b32_e32 v7, v3
18873; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
18874; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
18875; GFX90A-NEXT:    v_add_f32_e32 v3, v3, v4
18876; GFX90A-NEXT:    v_add_f32_e32 v5, v5, v2
18877; GFX90A-NEXT:    v_bfe_u32 v6, v3, 16, 1
18878; GFX90A-NEXT:    v_bfe_u32 v9, v5, 16, 1
18879; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v3
18880; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v5
18881; GFX90A-NEXT:    v_add3_u32 v6, v6, v3, s8
18882; GFX90A-NEXT:    v_add3_u32 v9, v9, v5, s8
18883; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
18884; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
18885; GFX90A-NEXT:    v_cndmask_b32_e64 v3, v6, v8, s[4:5]
18886; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
18887; GFX90A-NEXT:    v_perm_b32 v6, v5, v3, s9
18888; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[6:7] glc
18889; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18890; GFX90A-NEXT:    buffer_wbinvl1
18891; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
18892; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
18893; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
18894; GFX90A-NEXT:    s_cbranch_execnz .LBB76_1
18895; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
18896; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
18897; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
18898; GFX90A-NEXT:    s_setpc_b64 s[30:31]
18899;
18900; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory:
18901; GFX908:       ; %bb.0:
18902; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18903; GFX908-NEXT:    flat_load_dword v3, v[0:1]
18904; GFX908-NEXT:    s_mov_b64 s[6:7], 0
18905; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
18906; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
18907; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
18908; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
18909; GFX908-NEXT:  .LBB76_1: ; %atomicrmw.start
18910; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
18911; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18912; GFX908-NEXT:    v_mov_b32_e32 v6, v3
18913; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
18914; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
18915; GFX908-NEXT:    v_add_f32_e32 v3, v3, v4
18916; GFX908-NEXT:    v_add_f32_e32 v5, v5, v2
18917; GFX908-NEXT:    v_bfe_u32 v7, v3, 16, 1
18918; GFX908-NEXT:    v_bfe_u32 v9, v5, 16, 1
18919; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v3
18920; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v5
18921; GFX908-NEXT:    v_add3_u32 v7, v7, v3, s8
18922; GFX908-NEXT:    v_add3_u32 v9, v9, v5, s8
18923; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
18924; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
18925; GFX908-NEXT:    v_cndmask_b32_e64 v3, v7, v8, s[4:5]
18926; GFX908-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
18927; GFX908-NEXT:    v_perm_b32 v5, v5, v3, s9
18928; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
18929; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18930; GFX908-NEXT:    buffer_wbinvl1
18931; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
18932; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
18933; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
18934; GFX908-NEXT:    s_cbranch_execnz .LBB76_1
18935; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
18936; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
18937; GFX908-NEXT:    v_mov_b32_e32 v0, v3
18938; GFX908-NEXT:    s_setpc_b64 s[30:31]
18939;
18940; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory:
18941; GFX8:       ; %bb.0:
18942; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18943; GFX8-NEXT:    flat_load_dword v3, v[0:1]
18944; GFX8-NEXT:    s_mov_b64 s[6:7], 0
18945; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
18946; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
18947; GFX8-NEXT:  .LBB76_1: ; %atomicrmw.start
18948; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
18949; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18950; GFX8-NEXT:    v_mov_b32_e32 v6, v3
18951; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
18952; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
18953; GFX8-NEXT:    v_add_f32_e32 v3, v3, v4
18954; GFX8-NEXT:    v_add_f32_e32 v5, v5, v2
18955; GFX8-NEXT:    v_bfe_u32 v7, v3, 16, 1
18956; GFX8-NEXT:    v_bfe_u32 v9, v5, 16, 1
18957; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v3
18958; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v5
18959; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
18960; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
18961; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v5
18962; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
18963; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v3
18964; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
18965; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
18966; GFX8-NEXT:    v_cndmask_b32_e64 v3, v7, v8, s[4:5]
18967; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
18968; GFX8-NEXT:    v_alignbit_b32 v5, v5, v3, 16
18969; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
18970; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18971; GFX8-NEXT:    buffer_wbinvl1
18972; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
18973; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
18974; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
18975; GFX8-NEXT:    s_cbranch_execnz .LBB76_1
18976; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
18977; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
18978; GFX8-NEXT:    v_mov_b32_e32 v0, v3
18979; GFX8-NEXT:    s_setpc_b64 s[30:31]
18980;
18981; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory:
18982; GFX7:       ; %bb.0:
18983; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18984; GFX7-NEXT:    flat_load_dword v5, v[0:1]
18985; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
18986; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v3
18987; GFX7-NEXT:    s_mov_b64 s[4:5], 0
18988; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
18989; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18990; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
18991; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
18992; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
18993; GFX7-NEXT:  .LBB76_1: ; %atomicrmw.start
18994; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
18995; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
18996; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
18997; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
18998; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
18999; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
19000; GFX7-NEXT:    v_add_f32_e32 v7, v7, v5
19001; GFX7-NEXT:    v_add_f32_e32 v6, v6, v4
19002; GFX7-NEXT:    v_alignbit_b32 v3, v2, v3, 16
19003; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v7
19004; GFX7-NEXT:    v_alignbit_b32 v2, v2, v6, 16
19005; GFX7-NEXT:    flat_atomic_cmpswap v6, v[0:1], v[2:3] glc
19006; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19007; GFX7-NEXT:    buffer_wbinvl1
19008; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v3
19009; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
19010; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
19011; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
19012; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
19013; GFX7-NEXT:    s_cbranch_execnz .LBB76_1
19014; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
19015; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
19016; GFX7-NEXT:    v_mov_b32_e32 v0, v3
19017; GFX7-NEXT:    v_mov_b32_e32 v1, v2
19018; GFX7-NEXT:    s_setpc_b64 s[30:31]
; IR under test. The atomicrmw operates on %ptr itself and its result is the
; function's return value. The !0 metadata node and attribute group #0 are
; defined outside this function (not visible here).
19019  %result = atomicrmw fadd ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
19020  ret <2 x bfloat> %result
19021}
19022
19023define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %ptr, <2 x bfloat> %val) #0 {
19024; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
19025; GFX12:       ; %bb.0:
19026; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
19027; GFX12-NEXT:    s_wait_expcnt 0x0
19028; GFX12-NEXT:    s_wait_samplecnt 0x0
19029; GFX12-NEXT:    s_wait_bvhcnt 0x0
19030; GFX12-NEXT:    s_wait_kmcnt 0x0
19031; GFX12-NEXT:    s_wait_storecnt 0x0
19032; GFX12-NEXT:    flat_atomic_pk_add_bf16 v[0:1], v2 scope:SCOPE_DEV
19033; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
19034; GFX12-NEXT:    global_inv scope:SCOPE_DEV
19035; GFX12-NEXT:    s_setpc_b64 s[30:31]
19036;
19037; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
19038; GFX940:       ; %bb.0:
19039; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19040; GFX940-NEXT:    buffer_wbl2 sc1
19041; GFX940-NEXT:    flat_atomic_pk_add_bf16 v[0:1], v2
19042; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19043; GFX940-NEXT:    buffer_inv sc1
19044; GFX940-NEXT:    s_setpc_b64 s[30:31]
19045;
19046; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
19047; GFX11:       ; %bb.0:
19048; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19049; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
19050; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
19051; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
19052; GFX11-NEXT:    s_mov_b32 s1, 0
19053; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
19054; GFX11-NEXT:    .p2align 6
19055; GFX11-NEXT:  .LBB77_1: ; %atomicrmw.start
19056; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
19057; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19058; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
19059; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
19060; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
19061; GFX11-NEXT:    v_add_f32_e32 v2, v2, v4
19062; GFX11-NEXT:    v_add_f32_e32 v6, v6, v5
19063; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
19064; GFX11-NEXT:    v_bfe_u32 v7, v2, 16, 1
19065; GFX11-NEXT:    v_bfe_u32 v8, v6, 16, 1
19066; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v2
19067; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v6
19068; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
19069; GFX11-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
19070; GFX11-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
19071; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
19072; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
19073; GFX11-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
19074; GFX11-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s0
19075; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
19076; GFX11-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
19077; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
19078; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
19079; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19080; GFX11-NEXT:    buffer_gl1_inv
19081; GFX11-NEXT:    buffer_gl0_inv
19082; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
19083; GFX11-NEXT:    v_mov_b32_e32 v3, v2
19084; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
19085; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
19086; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
19087; GFX11-NEXT:    s_cbranch_execnz .LBB77_1
19088; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
19089; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
19090; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
19091; GFX11-NEXT:    s_setpc_b64 s[30:31]
19092;
19093; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
19094; GFX10:       ; %bb.0:
19095; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19096; GFX10-NEXT:    flat_load_dword v3, v[0:1]
19097; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
19098; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
19099; GFX10-NEXT:    s_mov_b32 s5, 0
19100; GFX10-NEXT:  .LBB77_1: ; %atomicrmw.start
19101; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
19102; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19103; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
19104; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
19105; GFX10-NEXT:    v_add_f32_e32 v2, v2, v4
19106; GFX10-NEXT:    v_add_f32_e32 v6, v6, v5
19107; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
19108; GFX10-NEXT:    v_bfe_u32 v8, v6, 16, 1
19109; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v2
19110; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v6
19111; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
19112; GFX10-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
19113; GFX10-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
19114; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v2, v2
19115; GFX10-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
19116; GFX10-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s4
19117; GFX10-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
19118; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
19119; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19120; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19121; GFX10-NEXT:    buffer_gl1_inv
19122; GFX10-NEXT:    buffer_gl0_inv
19123; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
19124; GFX10-NEXT:    v_mov_b32_e32 v3, v2
19125; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
19126; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
19127; GFX10-NEXT:    s_cbranch_execnz .LBB77_1
19128; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
19129; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
19130; GFX10-NEXT:    s_setpc_b64 s[30:31]
19131;
19132; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
19133; GFX90A:       ; %bb.0:
19134; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19135; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
19136; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
19137; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
19138; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
19139; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
19140; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
19141; GFX90A-NEXT:  .LBB77_1: ; %atomicrmw.start
19142; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
19143; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19144; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
19145; GFX90A-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
19146; GFX90A-NEXT:    v_add_f32_e32 v2, v2, v4
19147; GFX90A-NEXT:    v_add_f32_e32 v6, v6, v5
19148; GFX90A-NEXT:    v_bfe_u32 v7, v2, 16, 1
19149; GFX90A-NEXT:    v_bfe_u32 v9, v6, 16, 1
19150; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v2
19151; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v6
19152; GFX90A-NEXT:    v_add3_u32 v7, v7, v2, s8
19153; GFX90A-NEXT:    v_add3_u32 v9, v9, v6, s8
19154; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
19155; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
19156; GFX90A-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
19157; GFX90A-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
19158; GFX90A-NEXT:    v_perm_b32 v2, v6, v2, s9
19159; GFX90A-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19160; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19161; GFX90A-NEXT:    buffer_wbinvl1
19162; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
19163; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
19164; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
19165; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
19166; GFX90A-NEXT:    s_cbranch_execnz .LBB77_1
19167; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
19168; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
19169; GFX90A-NEXT:    s_setpc_b64 s[30:31]
19170;
19171; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
19172; GFX908:       ; %bb.0:
19173; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19174; GFX908-NEXT:    flat_load_dword v3, v[0:1]
19175; GFX908-NEXT:    s_mov_b64 s[6:7], 0
19176; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
19177; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
19178; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
19179; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
19180; GFX908-NEXT:  .LBB77_1: ; %atomicrmw.start
19181; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
19182; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19183; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
19184; GFX908-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
19185; GFX908-NEXT:    v_add_f32_e32 v2, v2, v4
19186; GFX908-NEXT:    v_add_f32_e32 v6, v6, v5
19187; GFX908-NEXT:    v_bfe_u32 v7, v2, 16, 1
19188; GFX908-NEXT:    v_bfe_u32 v9, v6, 16, 1
19189; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v2
19190; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v6
19191; GFX908-NEXT:    v_add3_u32 v7, v7, v2, s8
19192; GFX908-NEXT:    v_add3_u32 v9, v9, v6, s8
19193; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
19194; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
19195; GFX908-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
19196; GFX908-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
19197; GFX908-NEXT:    v_perm_b32 v2, v6, v2, s9
19198; GFX908-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19199; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19200; GFX908-NEXT:    buffer_wbinvl1
19201; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
19202; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
19203; GFX908-NEXT:    v_mov_b32_e32 v3, v2
19204; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
19205; GFX908-NEXT:    s_cbranch_execnz .LBB77_1
19206; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
19207; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
19208; GFX908-NEXT:    s_setpc_b64 s[30:31]
19209;
19210; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
19211; GFX8:       ; %bb.0:
19212; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19213; GFX8-NEXT:    flat_load_dword v3, v[0:1]
19214; GFX8-NEXT:    s_mov_b64 s[6:7], 0
19215; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
19216; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
19217; GFX8-NEXT:  .LBB77_1: ; %atomicrmw.start
19218; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
19219; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19220; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
19221; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
19222; GFX8-NEXT:    v_add_f32_e32 v2, v2, v4
19223; GFX8-NEXT:    v_add_f32_e32 v6, v6, v5
19224; GFX8-NEXT:    v_bfe_u32 v7, v2, 16, 1
19225; GFX8-NEXT:    v_bfe_u32 v9, v6, 16, 1
19226; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v2
19227; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v6
19228; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
19229; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
19230; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v6
19231; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
19232; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v2
19233; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
19234; GFX8-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
19235; GFX8-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
19236; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
19237; GFX8-NEXT:    v_alignbit_b32 v2, v6, v2, 16
19238; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19239; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19240; GFX8-NEXT:    buffer_wbinvl1
19241; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
19242; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
19243; GFX8-NEXT:    v_mov_b32_e32 v3, v2
19244; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
19245; GFX8-NEXT:    s_cbranch_execnz .LBB77_1
19246; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
19247; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
19248; GFX8-NEXT:    s_setpc_b64 s[30:31]
19249;
19250; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
19251; GFX7:       ; %bb.0:
19252; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19253; GFX7-NEXT:    flat_load_dword v5, v[0:1]
19254; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
19255; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
19256; GFX7-NEXT:    s_mov_b64 s[4:5], 0
19257; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
19258; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
19259; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19260; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
19261; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
19262; GFX7-NEXT:  .LBB77_1: ; %atomicrmw.start
19263; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
19264; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
19265; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
19266; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v4
19267; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v5
19268; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
19269; GFX7-NEXT:    v_add_f32_e32 v7, v7, v3
19270; GFX7-NEXT:    v_add_f32_e32 v6, v6, v2
19271; GFX7-NEXT:    v_alignbit_b32 v5, v4, v5, 16
19272; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v7
19273; GFX7-NEXT:    v_alignbit_b32 v4, v4, v6, 16
19274; GFX7-NEXT:    flat_atomic_cmpswap v6, v[0:1], v[4:5] glc
19275; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19276; GFX7-NEXT:    buffer_wbinvl1
19277; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
19278; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v6
19279; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
19280; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
19281; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
19282; GFX7-NEXT:    s_cbranch_execnz .LBB77_1
19283; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
19284; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
19285; GFX7-NEXT:    s_setpc_b64 s[30:31]
19286  %unused = atomicrmw fadd ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
19287  ret void
19288}
19289
; Returning <2 x bfloat> atomicrmw fadd through a flat pointer, using
; syncscope("agent") and seq_cst ordering, tagged with both
; !amdgpu.no.fine.grained.memory and !amdgpu.no.remote.memory.
; Per the autogenerated checks below, the gfx1200 and gfx940 runs expect a
; single flat_atomic_pk_add_bf16, while every other target expects a
; flat_atomic_cmpswap-based retry loop (block %atomicrmw.start).
; The check lines are tool output; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
19290define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr %ptr, <2 x bfloat> %val) #0 {
19291; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
19292; GFX12:       ; %bb.0:
19293; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
19294; GFX12-NEXT:    s_wait_expcnt 0x0
19295; GFX12-NEXT:    s_wait_samplecnt 0x0
19296; GFX12-NEXT:    s_wait_bvhcnt 0x0
19297; GFX12-NEXT:    s_wait_kmcnt 0x0
19298; GFX12-NEXT:    s_wait_storecnt 0x0
19299; GFX12-NEXT:    flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
19300; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
19301; GFX12-NEXT:    global_inv scope:SCOPE_DEV
19302; GFX12-NEXT:    s_setpc_b64 s[30:31]
19303;
19304; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
19305; GFX940:       ; %bb.0:
19306; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19307; GFX940-NEXT:    buffer_wbl2 sc1
19308; GFX940-NEXT:    flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0
19309; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19310; GFX940-NEXT:    buffer_inv sc1
19311; GFX940-NEXT:    s_setpc_b64 s[30:31]
19312;
19313; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
19314; GFX11:       ; %bb.0:
19315; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19316; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
19317; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
19318; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
19319; GFX11-NEXT:    s_mov_b32 s1, 0
19320; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
19321; GFX11-NEXT:    .p2align 6
19322; GFX11-NEXT:  .LBB78_1: ; %atomicrmw.start
19323; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
19324; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19325; GFX11-NEXT:    v_mov_b32_e32 v6, v3
19326; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
19327; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
19328; GFX11-NEXT:    v_add_f32_e32 v5, v5, v2
19329; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
19330; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
19331; GFX11-NEXT:    v_bfe_u32 v8, v5, 16, 1
19332; GFX11-NEXT:    v_add_f32_e32 v3, v3, v4
19333; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v5
19334; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
19335; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
19336; GFX11-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
19337; GFX11-NEXT:    v_bfe_u32 v7, v3, 16, 1
19338; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v3
19339; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
19340; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
19341; GFX11-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
19342; GFX11-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
19343; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
19344; GFX11-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
19345; GFX11-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
19346; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
19347; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc
19348; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19349; GFX11-NEXT:    buffer_gl1_inv
19350; GFX11-NEXT:    buffer_gl0_inv
19351; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
19352; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
19353; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
19354; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
19355; GFX11-NEXT:    s_cbranch_execnz .LBB78_1
19356; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
19357; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
19358; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
19359; GFX11-NEXT:    v_mov_b32_e32 v0, v3
19360; GFX11-NEXT:    s_setpc_b64 s[30:31]
19361;
19362; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
19363; GFX10:       ; %bb.0:
19364; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19365; GFX10-NEXT:    flat_load_dword v3, v[0:1]
19366; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
19367; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
19368; GFX10-NEXT:    s_mov_b32 s5, 0
19369; GFX10-NEXT:  .LBB78_1: ; %atomicrmw.start
19370; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
19371; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19372; GFX10-NEXT:    v_mov_b32_e32 v6, v3
19373; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
19374; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
19375; GFX10-NEXT:    v_add_f32_e32 v3, v3, v4
19376; GFX10-NEXT:    v_add_f32_e32 v5, v5, v2
19377; GFX10-NEXT:    v_bfe_u32 v7, v3, 16, 1
19378; GFX10-NEXT:    v_bfe_u32 v8, v5, 16, 1
19379; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v3
19380; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v5
19381; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
19382; GFX10-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
19383; GFX10-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
19384; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v3, v3
19385; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
19386; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s4
19387; GFX10-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
19388; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
19389; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
19390; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19391; GFX10-NEXT:    buffer_gl1_inv
19392; GFX10-NEXT:    buffer_gl0_inv
19393; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
19394; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
19395; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
19396; GFX10-NEXT:    s_cbranch_execnz .LBB78_1
19397; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
19398; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
19399; GFX10-NEXT:    v_mov_b32_e32 v0, v3
19400; GFX10-NEXT:    s_setpc_b64 s[30:31]
19401;
19402; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
19403; GFX90A:       ; %bb.0:
19404; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19405; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
19406; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
19407; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
19408; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
19409; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
19410; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
19411; GFX90A-NEXT:  .LBB78_1: ; %atomicrmw.start
19412; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
19413; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19414; GFX90A-NEXT:    v_mov_b32_e32 v7, v3
19415; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
19416; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
19417; GFX90A-NEXT:    v_add_f32_e32 v3, v3, v4
19418; GFX90A-NEXT:    v_add_f32_e32 v5, v5, v2
19419; GFX90A-NEXT:    v_bfe_u32 v6, v3, 16, 1
19420; GFX90A-NEXT:    v_bfe_u32 v9, v5, 16, 1
19421; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v3
19422; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v5
19423; GFX90A-NEXT:    v_add3_u32 v6, v6, v3, s8
19424; GFX90A-NEXT:    v_add3_u32 v9, v9, v5, s8
19425; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
19426; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
19427; GFX90A-NEXT:    v_cndmask_b32_e64 v3, v6, v8, s[4:5]
19428; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
19429; GFX90A-NEXT:    v_perm_b32 v6, v5, v3, s9
19430; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[6:7] glc
19431; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19432; GFX90A-NEXT:    buffer_wbinvl1
19433; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
19434; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
19435; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
19436; GFX90A-NEXT:    s_cbranch_execnz .LBB78_1
19437; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
19438; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
19439; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
19440; GFX90A-NEXT:    s_setpc_b64 s[30:31]
19441;
19442; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
19443; GFX908:       ; %bb.0:
19444; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19445; GFX908-NEXT:    flat_load_dword v3, v[0:1]
19446; GFX908-NEXT:    s_mov_b64 s[6:7], 0
19447; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
19448; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
19449; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
19450; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
19451; GFX908-NEXT:  .LBB78_1: ; %atomicrmw.start
19452; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
19453; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19454; GFX908-NEXT:    v_mov_b32_e32 v6, v3
19455; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
19456; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
19457; GFX908-NEXT:    v_add_f32_e32 v3, v3, v4
19458; GFX908-NEXT:    v_add_f32_e32 v5, v5, v2
19459; GFX908-NEXT:    v_bfe_u32 v7, v3, 16, 1
19460; GFX908-NEXT:    v_bfe_u32 v9, v5, 16, 1
19461; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v3
19462; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v5
19463; GFX908-NEXT:    v_add3_u32 v7, v7, v3, s8
19464; GFX908-NEXT:    v_add3_u32 v9, v9, v5, s8
19465; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
19466; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
19467; GFX908-NEXT:    v_cndmask_b32_e64 v3, v7, v8, s[4:5]
19468; GFX908-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
19469; GFX908-NEXT:    v_perm_b32 v5, v5, v3, s9
19470; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
19471; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19472; GFX908-NEXT:    buffer_wbinvl1
19473; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
19474; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
19475; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
19476; GFX908-NEXT:    s_cbranch_execnz .LBB78_1
19477; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
19478; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
19479; GFX908-NEXT:    v_mov_b32_e32 v0, v3
19480; GFX908-NEXT:    s_setpc_b64 s[30:31]
19481;
19482; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
19483; GFX8:       ; %bb.0:
19484; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19485; GFX8-NEXT:    flat_load_dword v3, v[0:1]
19486; GFX8-NEXT:    s_mov_b64 s[6:7], 0
19487; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
19488; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
19489; GFX8-NEXT:  .LBB78_1: ; %atomicrmw.start
19490; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
19491; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19492; GFX8-NEXT:    v_mov_b32_e32 v6, v3
19493; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
19494; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
19495; GFX8-NEXT:    v_add_f32_e32 v3, v3, v4
19496; GFX8-NEXT:    v_add_f32_e32 v5, v5, v2
19497; GFX8-NEXT:    v_bfe_u32 v7, v3, 16, 1
19498; GFX8-NEXT:    v_bfe_u32 v9, v5, 16, 1
19499; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v3
19500; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v5
19501; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
19502; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
19503; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v5
19504; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
19505; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v3
19506; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
19507; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
19508; GFX8-NEXT:    v_cndmask_b32_e64 v3, v7, v8, s[4:5]
19509; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
19510; GFX8-NEXT:    v_alignbit_b32 v5, v5, v3, 16
19511; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
19512; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19513; GFX8-NEXT:    buffer_wbinvl1
19514; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
19515; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
19516; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
19517; GFX8-NEXT:    s_cbranch_execnz .LBB78_1
19518; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
19519; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
19520; GFX8-NEXT:    v_mov_b32_e32 v0, v3
19521; GFX8-NEXT:    s_setpc_b64 s[30:31]
19522;
19523; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
19524; GFX7:       ; %bb.0:
19525; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19526; GFX7-NEXT:    flat_load_dword v5, v[0:1]
19527; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
19528; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v3
19529; GFX7-NEXT:    s_mov_b64 s[4:5], 0
19530; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
19531; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19532; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
19533; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
19534; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
19535; GFX7-NEXT:  .LBB78_1: ; %atomicrmw.start
19536; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
19537; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
19538; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
19539; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
19540; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
19541; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
19542; GFX7-NEXT:    v_add_f32_e32 v7, v7, v5
19543; GFX7-NEXT:    v_add_f32_e32 v6, v6, v4
19544; GFX7-NEXT:    v_alignbit_b32 v3, v2, v3, 16
19545; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v7
19546; GFX7-NEXT:    v_alignbit_b32 v2, v2, v6, 16
19547; GFX7-NEXT:    flat_atomic_cmpswap v6, v[0:1], v[2:3] glc
19548; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19549; GFX7-NEXT:    buffer_wbinvl1
19550; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v3
19551; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
19552; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
19553; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
19554; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
19555; GFX7-NEXT:    s_cbranch_execnz .LBB78_1
19556; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
19557; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
19558; GFX7-NEXT:    v_mov_b32_e32 v0, v3
19559; GFX7-NEXT:    v_mov_b32_e32 v1, v2
19560; GFX7-NEXT:    s_setpc_b64 s[30:31]
19561  %result = atomicrmw fadd ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
19562  ret <2 x bfloat> %result
19563}
19564
19565define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr %ptr, <2 x bfloat> %val) #0 {
19566; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
19567; GFX12:       ; %bb.0:
19568; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
19569; GFX12-NEXT:    s_wait_expcnt 0x0
19570; GFX12-NEXT:    s_wait_samplecnt 0x0
19571; GFX12-NEXT:    s_wait_bvhcnt 0x0
19572; GFX12-NEXT:    s_wait_kmcnt 0x0
19573; GFX12-NEXT:    s_wait_storecnt 0x0
19574; GFX12-NEXT:    flat_atomic_pk_add_bf16 v[0:1], v2 scope:SCOPE_DEV
19575; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
19576; GFX12-NEXT:    global_inv scope:SCOPE_DEV
19577; GFX12-NEXT:    s_setpc_b64 s[30:31]
19578;
19579; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
19580; GFX940:       ; %bb.0:
19581; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19582; GFX940-NEXT:    buffer_wbl2 sc1
19583; GFX940-NEXT:    flat_atomic_pk_add_bf16 v[0:1], v2
19584; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19585; GFX940-NEXT:    buffer_inv sc1
19586; GFX940-NEXT:    s_setpc_b64 s[30:31]
19587;
19588; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
19589; GFX11:       ; %bb.0:
19590; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19591; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
19592; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
19593; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
19594; GFX11-NEXT:    s_mov_b32 s1, 0
19595; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
19596; GFX11-NEXT:    .p2align 6
19597; GFX11-NEXT:  .LBB79_1: ; %atomicrmw.start
19598; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
19599; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19600; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
19601; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
19602; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
19603; GFX11-NEXT:    v_add_f32_e32 v2, v2, v4
19604; GFX11-NEXT:    v_add_f32_e32 v6, v6, v5
19605; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
19606; GFX11-NEXT:    v_bfe_u32 v7, v2, 16, 1
19607; GFX11-NEXT:    v_bfe_u32 v8, v6, 16, 1
19608; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v2
19609; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v6
19610; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
19611; GFX11-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
19612; GFX11-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
19613; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
19614; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
19615; GFX11-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
19616; GFX11-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s0
19617; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
19618; GFX11-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
19619; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
19620; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
19621; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19622; GFX11-NEXT:    buffer_gl1_inv
19623; GFX11-NEXT:    buffer_gl0_inv
19624; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
19625; GFX11-NEXT:    v_mov_b32_e32 v3, v2
19626; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
19627; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
19628; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
19629; GFX11-NEXT:    s_cbranch_execnz .LBB79_1
19630; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
19631; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
19632; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
19633; GFX11-NEXT:    s_setpc_b64 s[30:31]
19634;
19635; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
19636; GFX10:       ; %bb.0:
19637; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19638; GFX10-NEXT:    flat_load_dword v3, v[0:1]
19639; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
19640; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
19641; GFX10-NEXT:    s_mov_b32 s5, 0
19642; GFX10-NEXT:  .LBB79_1: ; %atomicrmw.start
19643; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
19644; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19645; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
19646; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
19647; GFX10-NEXT:    v_add_f32_e32 v2, v2, v4
19648; GFX10-NEXT:    v_add_f32_e32 v6, v6, v5
19649; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
19650; GFX10-NEXT:    v_bfe_u32 v8, v6, 16, 1
19651; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v2
19652; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v6
19653; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
19654; GFX10-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
19655; GFX10-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
19656; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v2, v2
19657; GFX10-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
19658; GFX10-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s4
19659; GFX10-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
19660; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
19661; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19662; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19663; GFX10-NEXT:    buffer_gl1_inv
19664; GFX10-NEXT:    buffer_gl0_inv
19665; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
19666; GFX10-NEXT:    v_mov_b32_e32 v3, v2
19667; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
19668; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
19669; GFX10-NEXT:    s_cbranch_execnz .LBB79_1
19670; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
19671; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
19672; GFX10-NEXT:    s_setpc_b64 s[30:31]
19673;
19674; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
19675; GFX90A:       ; %bb.0:
19676; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19677; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
19678; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
19679; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
19680; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
19681; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
19682; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
19683; GFX90A-NEXT:  .LBB79_1: ; %atomicrmw.start
19684; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
19685; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19686; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
19687; GFX90A-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
19688; GFX90A-NEXT:    v_add_f32_e32 v2, v2, v4
19689; GFX90A-NEXT:    v_add_f32_e32 v6, v6, v5
19690; GFX90A-NEXT:    v_bfe_u32 v7, v2, 16, 1
19691; GFX90A-NEXT:    v_bfe_u32 v9, v6, 16, 1
19692; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v2
19693; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v6
19694; GFX90A-NEXT:    v_add3_u32 v7, v7, v2, s8
19695; GFX90A-NEXT:    v_add3_u32 v9, v9, v6, s8
19696; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
19697; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
19698; GFX90A-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
19699; GFX90A-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
19700; GFX90A-NEXT:    v_perm_b32 v2, v6, v2, s9
19701; GFX90A-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19702; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19703; GFX90A-NEXT:    buffer_wbinvl1
19704; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
19705; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
19706; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
19707; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
19708; GFX90A-NEXT:    s_cbranch_execnz .LBB79_1
19709; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
19710; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
19711; GFX90A-NEXT:    s_setpc_b64 s[30:31]
19712;
19713; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
19714; GFX908:       ; %bb.0:
19715; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19716; GFX908-NEXT:    flat_load_dword v3, v[0:1]
19717; GFX908-NEXT:    s_mov_b64 s[6:7], 0
19718; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
19719; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
19720; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
19721; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
19722; GFX908-NEXT:  .LBB79_1: ; %atomicrmw.start
19723; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
19724; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19725; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
19726; GFX908-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
19727; GFX908-NEXT:    v_add_f32_e32 v2, v2, v4
19728; GFX908-NEXT:    v_add_f32_e32 v6, v6, v5
19729; GFX908-NEXT:    v_bfe_u32 v7, v2, 16, 1
19730; GFX908-NEXT:    v_bfe_u32 v9, v6, 16, 1
19731; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v2
19732; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v6
19733; GFX908-NEXT:    v_add3_u32 v7, v7, v2, s8
19734; GFX908-NEXT:    v_add3_u32 v9, v9, v6, s8
19735; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
19736; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
19737; GFX908-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
19738; GFX908-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
19739; GFX908-NEXT:    v_perm_b32 v2, v6, v2, s9
19740; GFX908-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19741; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19742; GFX908-NEXT:    buffer_wbinvl1
19743; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
19744; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
19745; GFX908-NEXT:    v_mov_b32_e32 v3, v2
19746; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
19747; GFX908-NEXT:    s_cbranch_execnz .LBB79_1
19748; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
19749; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
19750; GFX908-NEXT:    s_setpc_b64 s[30:31]
19751;
19752; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
19753; GFX8:       ; %bb.0:
19754; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19755; GFX8-NEXT:    flat_load_dword v3, v[0:1]
19756; GFX8-NEXT:    s_mov_b64 s[6:7], 0
19757; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
19758; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
19759; GFX8-NEXT:  .LBB79_1: ; %atomicrmw.start
19760; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
19761; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19762; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
19763; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
19764; GFX8-NEXT:    v_add_f32_e32 v2, v2, v4
19765; GFX8-NEXT:    v_add_f32_e32 v6, v6, v5
19766; GFX8-NEXT:    v_bfe_u32 v7, v2, 16, 1
19767; GFX8-NEXT:    v_bfe_u32 v9, v6, 16, 1
19768; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v2
19769; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v6
19770; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
19771; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
19772; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v6
19773; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
19774; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v2
19775; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
19776; GFX8-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
19777; GFX8-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
19778; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
19779; GFX8-NEXT:    v_alignbit_b32 v2, v6, v2, 16
19780; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19781; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19782; GFX8-NEXT:    buffer_wbinvl1
19783; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
19784; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
19785; GFX8-NEXT:    v_mov_b32_e32 v3, v2
19786; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
19787; GFX8-NEXT:    s_cbranch_execnz .LBB79_1
19788; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
19789; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
19790; GFX8-NEXT:    s_setpc_b64 s[30:31]
19791;
19792; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
19793; GFX7:       ; %bb.0:
19794; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19795; GFX7-NEXT:    flat_load_dword v5, v[0:1]
19796; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
19797; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
19798; GFX7-NEXT:    s_mov_b64 s[4:5], 0
19799; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
19800; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
19801; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19802; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
19803; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
19804; GFX7-NEXT:  .LBB79_1: ; %atomicrmw.start
19805; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
19806; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
19807; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
19808; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v4
19809; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v5
19810; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
19811; GFX7-NEXT:    v_add_f32_e32 v7, v7, v3
19812; GFX7-NEXT:    v_add_f32_e32 v6, v6, v2
19813; GFX7-NEXT:    v_alignbit_b32 v5, v4, v5, 16
19814; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v7
19815; GFX7-NEXT:    v_alignbit_b32 v4, v4, v6, 16
19816; GFX7-NEXT:    flat_atomic_cmpswap v6, v[0:1], v[4:5] glc
19817; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19818; GFX7-NEXT:    buffer_wbinvl1
19819; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
19820; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v6
19821; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
19822; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
19823; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
19824; GFX7-NEXT:    s_cbranch_execnz .LBB79_1
19825; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
19826; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
19827; GFX7-NEXT:    s_setpc_b64 s[30:31]
19828  %unused = atomicrmw fadd ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
19829  ret void
19830}
19831
19832attributes #0 = { nounwind } ; attribute group referenced as #0 on the test function definitions; marks them nounwind
19833attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } ; nounwind plus an explicit f32 denormal-mode string attribute; NOTE(review): no user of #1 is visible in this part of the file - confirm it is still referenced
19834
19835!0 = !{} ; empty metadata node, used as the operand of the !amdgpu.no.fine.grained.memory and !amdgpu.no.remote.memory annotations on the atomicrmw instructions
19836