xref: /llvm-project/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll (revision bfd9bc274586b0261e16e22ac50d50586a0152e2)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
3; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx940 < %s | FileCheck -check-prefix=GFX940 %s
4; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
5; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
6; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
7; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
8; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s
9; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s
10; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s
11
12; --------------------------------------------------------------------
13; float
14; --------------------------------------------------------------------
15
; Value-returning case: an agent-scope, seq_cst `atomicrmw fadd` of a float
; through a buffer fat pointer (ptr addrspace(7)) that is passed `inreg`. The
; operation carries both !amdgpu.no.fine.grained.memory and
; !amdgpu.ignore.denormal.mode, and its result is returned to the caller.
;
; What the autogenerated assertions below pin down, per run line:
;   * gfx1200, gfx940, gfx1100, gfx90a: a single buffer_atomic_add_f32 in its
;     returning form (th:TH_ATOMIC_RETURN, sc0 or glc respectively).
;   * gfx1010, gfx908, tonga, hawaii, tahiti: an initial buffer_load_dword
;     followed by a buffer_atomic_cmpswap retry loop (%atomicrmw.start).
;
; The assertion lines are generated output; regenerate them with the script
; named in the file header instead of editing them by hand.
16define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr addrspace(7) inreg %ptr, float %val) #0 {
17; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
18; GFX12:       ; %bb.0:
19; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
20; GFX12-NEXT:    s_wait_expcnt 0x0
21; GFX12-NEXT:    s_wait_samplecnt 0x0
22; GFX12-NEXT:    s_wait_bvhcnt 0x0
23; GFX12-NEXT:    s_wait_kmcnt 0x0
24; GFX12-NEXT:    v_mov_b32_e32 v1, s16
25; GFX12-NEXT:    s_wait_storecnt 0x0
26; GFX12-NEXT:    buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
27; GFX12-NEXT:    s_wait_loadcnt 0x0
28; GFX12-NEXT:    global_inv scope:SCOPE_DEV
29; GFX12-NEXT:    s_setpc_b64 s[30:31]
30;
31; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
32; GFX940:       ; %bb.0:
33; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34; GFX940-NEXT:    v_mov_b32_e32 v1, s16
35; GFX940-NEXT:    buffer_wbl2 sc1
36; GFX940-NEXT:    buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0
37; GFX940-NEXT:    s_waitcnt vmcnt(0)
38; GFX940-NEXT:    buffer_inv sc1
39; GFX940-NEXT:    s_setpc_b64 s[30:31]
40;
41; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
42; GFX11:       ; %bb.0:
43; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
44; GFX11-NEXT:    v_mov_b32_e32 v1, s16
45; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
46; GFX11-NEXT:    buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 glc
47; GFX11-NEXT:    s_waitcnt vmcnt(0)
48; GFX11-NEXT:    buffer_gl1_inv
49; GFX11-NEXT:    buffer_gl0_inv
50; GFX11-NEXT:    s_setpc_b64 s[30:31]
51;
52; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
53; GFX10:       ; %bb.0:
54; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
55; GFX10-NEXT:    v_mov_b32_e32 v2, v0
56; GFX10-NEXT:    v_mov_b32_e32 v0, s20
57; GFX10-NEXT:    s_add_i32 s4, s20, 0x400
58; GFX10-NEXT:    v_mov_b32_e32 v3, s4
59; GFX10-NEXT:    s_mov_b32 s4, 0
60; GFX10-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
61; GFX10-NEXT:  .LBB0_1: ; %atomicrmw.start
62; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
63; GFX10-NEXT:    s_waitcnt vmcnt(0)
64; GFX10-NEXT:    v_mov_b32_e32 v5, v0
65; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
66; GFX10-NEXT:    v_add_f32_e32 v4, v5, v2
67; GFX10-NEXT:    v_mov_b32_e32 v0, v4
68; GFX10-NEXT:    v_mov_b32_e32 v1, v5
69; GFX10-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
70; GFX10-NEXT:    s_waitcnt vmcnt(0)
71; GFX10-NEXT:    buffer_gl1_inv
72; GFX10-NEXT:    buffer_gl0_inv
73; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
74; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
75; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
76; GFX10-NEXT:    s_cbranch_execnz .LBB0_1
77; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
78; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
79; GFX10-NEXT:    s_setpc_b64 s[30:31]
80;
81; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
82; GFX90A:       ; %bb.0:
83; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
84; GFX90A-NEXT:    v_mov_b32_e32 v1, s20
85; GFX90A-NEXT:    buffer_atomic_add_f32 v0, v1, s[16:19], 0 offen offset:1024 glc
86; GFX90A-NEXT:    s_waitcnt vmcnt(0)
87; GFX90A-NEXT:    buffer_wbinvl1
88; GFX90A-NEXT:    s_setpc_b64 s[30:31]
89;
90; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
91; GFX908:       ; %bb.0:
92; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
93; GFX908-NEXT:    v_mov_b32_e32 v2, v0
94; GFX908-NEXT:    v_mov_b32_e32 v0, s20
95; GFX908-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
96; GFX908-NEXT:    s_add_i32 s6, s20, 0x400
97; GFX908-NEXT:    s_mov_b64 s[4:5], 0
98; GFX908-NEXT:    v_mov_b32_e32 v3, s6
99; GFX908-NEXT:  .LBB0_1: ; %atomicrmw.start
100; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
101; GFX908-NEXT:    s_waitcnt vmcnt(0)
102; GFX908-NEXT:    v_mov_b32_e32 v5, v0
103; GFX908-NEXT:    v_add_f32_e32 v4, v5, v2
104; GFX908-NEXT:    v_mov_b32_e32 v0, v4
105; GFX908-NEXT:    v_mov_b32_e32 v1, v5
106; GFX908-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
107; GFX908-NEXT:    s_waitcnt vmcnt(0)
108; GFX908-NEXT:    buffer_wbinvl1
109; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
110; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
111; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
112; GFX908-NEXT:    s_cbranch_execnz .LBB0_1
113; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
114; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
115; GFX908-NEXT:    s_setpc_b64 s[30:31]
116;
117; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
118; GFX8:       ; %bb.0:
119; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
120; GFX8-NEXT:    v_mov_b32_e32 v2, v0
121; GFX8-NEXT:    v_mov_b32_e32 v0, s20
122; GFX8-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
123; GFX8-NEXT:    s_add_i32 s6, s20, 0x400
124; GFX8-NEXT:    s_mov_b64 s[4:5], 0
125; GFX8-NEXT:    v_mov_b32_e32 v3, s6
126; GFX8-NEXT:  .LBB0_1: ; %atomicrmw.start
127; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
128; GFX8-NEXT:    s_waitcnt vmcnt(0)
129; GFX8-NEXT:    v_mov_b32_e32 v5, v0
130; GFX8-NEXT:    v_add_f32_e32 v4, v5, v2
131; GFX8-NEXT:    v_mov_b32_e32 v0, v4
132; GFX8-NEXT:    v_mov_b32_e32 v1, v5
133; GFX8-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
134; GFX8-NEXT:    s_waitcnt vmcnt(0)
135; GFX8-NEXT:    buffer_wbinvl1
136; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
137; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
138; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
139; GFX8-NEXT:    s_cbranch_execnz .LBB0_1
140; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
141; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
142; GFX8-NEXT:    s_setpc_b64 s[30:31]
143;
144; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
145; GFX7:       ; %bb.0:
146; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
147; GFX7-NEXT:    v_mov_b32_e32 v2, v0
148; GFX7-NEXT:    v_mov_b32_e32 v0, s20
149; GFX7-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
150; GFX7-NEXT:    s_add_i32 s6, s20, 0x400
151; GFX7-NEXT:    s_mov_b64 s[4:5], 0
152; GFX7-NEXT:    v_mov_b32_e32 v3, s6
153; GFX7-NEXT:  .LBB0_1: ; %atomicrmw.start
154; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
155; GFX7-NEXT:    s_waitcnt vmcnt(0)
156; GFX7-NEXT:    v_mov_b32_e32 v5, v0
157; GFX7-NEXT:    v_add_f32_e32 v4, v5, v2
158; GFX7-NEXT:    v_mov_b32_e32 v0, v4
159; GFX7-NEXT:    v_mov_b32_e32 v1, v5
160; GFX7-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
161; GFX7-NEXT:    s_waitcnt vmcnt(0)
162; GFX7-NEXT:    buffer_wbinvl1
163; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
164; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
165; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
166; GFX7-NEXT:    s_cbranch_execnz .LBB0_1
167; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
168; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
169; GFX7-NEXT:    s_setpc_b64 s[30:31]
170;
171; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
172; GFX6:       ; %bb.0:
173; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
174; GFX6-NEXT:    v_mov_b32_e32 v2, v0
175; GFX6-NEXT:    v_mov_b32_e32 v0, s20
176; GFX6-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
177; GFX6-NEXT:    s_add_i32 s6, s20, 0x400
178; GFX6-NEXT:    s_mov_b64 s[4:5], 0
179; GFX6-NEXT:    v_mov_b32_e32 v3, s6
180; GFX6-NEXT:  .LBB0_1: ; %atomicrmw.start
181; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
182; GFX6-NEXT:    s_waitcnt vmcnt(0)
183; GFX6-NEXT:    v_mov_b32_e32 v5, v0
184; GFX6-NEXT:    v_add_f32_e32 v4, v5, v2
185; GFX6-NEXT:    s_waitcnt expcnt(0)
186; GFX6-NEXT:    v_mov_b32_e32 v0, v4
187; GFX6-NEXT:    v_mov_b32_e32 v1, v5
188; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
189; GFX6-NEXT:    s_waitcnt vmcnt(0)
190; GFX6-NEXT:    buffer_wbinvl1
191; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
192; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
193; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
194; GFX6-NEXT:    s_cbranch_execnz .LBB0_1
195; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
196; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
197; GFX6-NEXT:    s_waitcnt expcnt(0)
198; GFX6-NEXT:    s_setpc_b64 s[30:31]
  ; Index 256 on a float element type is a 1024-byte displacement; this is the
  ; offset:1024 immediate and the 0x400 add that appear in the expected output.
199  %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
200  %result = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
201  ret float %result
202}
203
; No-return case: the same agent-scope, seq_cst `atomicrmw fadd` of a float
; through an `inreg` buffer fat pointer (ptr addrspace(7)), carrying
; !amdgpu.no.fine.grained.memory and !amdgpu.ignore.denormal.mode, but with the
; result left unused and the function returning void.
;
; What the autogenerated assertions below pin down, per run line:
;   * gfx1200, gfx940, gfx1100, gfx90a, gfx908: a single buffer_atomic_add_f32
;     without any return modifier (no th:TH_ATOMIC_RETURN, sc0 or glc).
;   * gfx1010, tonga, hawaii, tahiti: an initial buffer_load_dword followed by
;     a buffer_atomic_cmpswap retry loop (%atomicrmw.start).
;
; The assertion lines are generated output; regenerate them with the script
; named in the file header instead of editing them by hand.
204define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr addrspace(7) inreg %ptr, float %val) #0 {
205; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
206; GFX12:       ; %bb.0:
207; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
208; GFX12-NEXT:    s_wait_expcnt 0x0
209; GFX12-NEXT:    s_wait_samplecnt 0x0
210; GFX12-NEXT:    s_wait_bvhcnt 0x0
211; GFX12-NEXT:    s_wait_kmcnt 0x0
212; GFX12-NEXT:    v_mov_b32_e32 v1, s16
213; GFX12-NEXT:    s_wait_storecnt 0x0
214; GFX12-NEXT:    buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024
215; GFX12-NEXT:    s_wait_storecnt 0x0
216; GFX12-NEXT:    global_inv scope:SCOPE_DEV
217; GFX12-NEXT:    s_setpc_b64 s[30:31]
218;
219; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
220; GFX940:       ; %bb.0:
221; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
222; GFX940-NEXT:    v_mov_b32_e32 v1, s16
223; GFX940-NEXT:    buffer_wbl2 sc1
224; GFX940-NEXT:    buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024
225; GFX940-NEXT:    s_waitcnt vmcnt(0)
226; GFX940-NEXT:    buffer_inv sc1
227; GFX940-NEXT:    s_setpc_b64 s[30:31]
228;
229; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
230; GFX11:       ; %bb.0:
231; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
232; GFX11-NEXT:    v_mov_b32_e32 v1, s16
233; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
234; GFX11-NEXT:    buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024
235; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
236; GFX11-NEXT:    buffer_gl1_inv
237; GFX11-NEXT:    buffer_gl0_inv
238; GFX11-NEXT:    s_setpc_b64 s[30:31]
239;
240; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
241; GFX10:       ; %bb.0:
242; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
243; GFX10-NEXT:    v_mov_b32_e32 v1, s20
244; GFX10-NEXT:    s_add_i32 s4, s20, 0x400
245; GFX10-NEXT:    v_mov_b32_e32 v3, s4
246; GFX10-NEXT:    s_mov_b32 s4, 0
247; GFX10-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
248; GFX10-NEXT:  .LBB1_1: ; %atomicrmw.start
249; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
250; GFX10-NEXT:    s_waitcnt vmcnt(0)
251; GFX10-NEXT:    v_add_f32_e32 v1, v2, v0
252; GFX10-NEXT:    v_mov_b32_e32 v5, v2
253; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
254; GFX10-NEXT:    v_mov_b32_e32 v4, v1
255; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
256; GFX10-NEXT:    s_waitcnt vmcnt(0)
257; GFX10-NEXT:    buffer_gl1_inv
258; GFX10-NEXT:    buffer_gl0_inv
259; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v2
260; GFX10-NEXT:    v_mov_b32_e32 v2, v4
261; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
262; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
263; GFX10-NEXT:    s_cbranch_execnz .LBB1_1
264; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
265; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
266; GFX10-NEXT:    s_setpc_b64 s[30:31]
267;
268; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
269; GFX90A:       ; %bb.0:
270; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
271; GFX90A-NEXT:    v_mov_b32_e32 v1, s20
272; GFX90A-NEXT:    buffer_atomic_add_f32 v0, v1, s[16:19], 0 offen offset:1024
273; GFX90A-NEXT:    s_waitcnt vmcnt(0)
274; GFX90A-NEXT:    buffer_wbinvl1
275; GFX90A-NEXT:    s_setpc_b64 s[30:31]
276;
277; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
278; GFX908:       ; %bb.0:
279; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
280; GFX908-NEXT:    v_mov_b32_e32 v1, s20
281; GFX908-NEXT:    buffer_atomic_add_f32 v0, v1, s[16:19], 0 offen offset:1024
282; GFX908-NEXT:    s_waitcnt vmcnt(0)
283; GFX908-NEXT:    buffer_wbinvl1
284; GFX908-NEXT:    s_setpc_b64 s[30:31]
285;
286; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
287; GFX8:       ; %bb.0:
288; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
289; GFX8-NEXT:    v_mov_b32_e32 v1, s20
290; GFX8-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
291; GFX8-NEXT:    s_add_i32 s6, s20, 0x400
292; GFX8-NEXT:    s_mov_b64 s[4:5], 0
293; GFX8-NEXT:    v_mov_b32_e32 v3, s6
294; GFX8-NEXT:  .LBB1_1: ; %atomicrmw.start
295; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
296; GFX8-NEXT:    s_waitcnt vmcnt(0)
297; GFX8-NEXT:    v_add_f32_e32 v1, v2, v0
298; GFX8-NEXT:    v_mov_b32_e32 v5, v2
299; GFX8-NEXT:    v_mov_b32_e32 v4, v1
300; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
301; GFX8-NEXT:    s_waitcnt vmcnt(0)
302; GFX8-NEXT:    buffer_wbinvl1
303; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v2
304; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
305; GFX8-NEXT:    v_mov_b32_e32 v2, v4
306; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
307; GFX8-NEXT:    s_cbranch_execnz .LBB1_1
308; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
309; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
310; GFX8-NEXT:    s_setpc_b64 s[30:31]
311;
312; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
313; GFX7:       ; %bb.0:
314; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
315; GFX7-NEXT:    v_mov_b32_e32 v1, s20
316; GFX7-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
317; GFX7-NEXT:    s_add_i32 s6, s20, 0x400
318; GFX7-NEXT:    s_mov_b64 s[4:5], 0
319; GFX7-NEXT:    v_mov_b32_e32 v3, s6
320; GFX7-NEXT:  .LBB1_1: ; %atomicrmw.start
321; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
322; GFX7-NEXT:    s_waitcnt vmcnt(0)
323; GFX7-NEXT:    v_add_f32_e32 v1, v2, v0
324; GFX7-NEXT:    v_mov_b32_e32 v5, v2
325; GFX7-NEXT:    v_mov_b32_e32 v4, v1
326; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
327; GFX7-NEXT:    s_waitcnt vmcnt(0)
328; GFX7-NEXT:    buffer_wbinvl1
329; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v2
330; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
331; GFX7-NEXT:    v_mov_b32_e32 v2, v4
332; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
333; GFX7-NEXT:    s_cbranch_execnz .LBB1_1
334; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
335; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
336; GFX7-NEXT:    s_setpc_b64 s[30:31]
337;
338; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
339; GFX6:       ; %bb.0:
340; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
341; GFX6-NEXT:    v_mov_b32_e32 v1, s20
342; GFX6-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
343; GFX6-NEXT:    s_add_i32 s6, s20, 0x400
344; GFX6-NEXT:    s_mov_b64 s[4:5], 0
345; GFX6-NEXT:    v_mov_b32_e32 v3, s6
346; GFX6-NEXT:  .LBB1_1: ; %atomicrmw.start
347; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
348; GFX6-NEXT:    s_waitcnt vmcnt(0)
349; GFX6-NEXT:    v_add_f32_e32 v1, v2, v0
350; GFX6-NEXT:    s_waitcnt expcnt(0)
351; GFX6-NEXT:    v_mov_b32_e32 v5, v2
352; GFX6-NEXT:    v_mov_b32_e32 v4, v1
353; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
354; GFX6-NEXT:    s_waitcnt vmcnt(0)
355; GFX6-NEXT:    buffer_wbinvl1
356; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v2
357; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
358; GFX6-NEXT:    v_mov_b32_e32 v2, v4
359; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
360; GFX6-NEXT:    s_cbranch_execnz .LBB1_1
361; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
362; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
363; GFX6-NEXT:    s_waitcnt expcnt(0)
364; GFX6-NEXT:    s_setpc_b64 s[30:31]
  ; Index 256 on a float element type is a 1024-byte displacement; this is the
  ; offset:1024 immediate and the 0x400 add that appear in the expected output.
365  %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
  ; %unused is never read after this point; only the memory update is tested.
366  %unused = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
367  ret void
368}
369
370define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr addrspace(7) %ptr, float %val) #0 {
371; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
372; GFX12:       ; %bb.0:
373; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
374; GFX12-NEXT:    s_wait_expcnt 0x0
375; GFX12-NEXT:    s_wait_samplecnt 0x0
376; GFX12-NEXT:    s_wait_bvhcnt 0x0
377; GFX12-NEXT:    s_wait_kmcnt 0x0
378; GFX12-NEXT:    s_mov_b32 s1, exec_lo
379; GFX12-NEXT:    s_wait_storecnt 0x0
380; GFX12-NEXT:  .LBB2_1: ; =>This Inner Loop Header: Depth=1
381; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
382; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
383; GFX12-NEXT:    v_readfirstlane_b32 s6, v2
384; GFX12-NEXT:    v_readfirstlane_b32 s7, v3
385; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
386; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
387; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
388; GFX12-NEXT:    s_wait_alu 0xfffe
389; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
390; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
391; GFX12-NEXT:    s_wait_alu 0xfffe
392; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
393; GFX12-NEXT:    s_wait_loadcnt 0x0
394; GFX12-NEXT:    buffer_atomic_add_f32 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN
395; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
396; GFX12-NEXT:    ; implicit-def: $vgpr4
397; GFX12-NEXT:    s_wait_alu 0xfffe
398; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
399; GFX12-NEXT:    s_cbranch_execnz .LBB2_1
400; GFX12-NEXT:  ; %bb.2:
401; GFX12-NEXT:    s_mov_b32 exec_lo, s1
402; GFX12-NEXT:    s_wait_loadcnt 0x0
403; GFX12-NEXT:    v_mov_b32_e32 v0, v5
404; GFX12-NEXT:    global_inv scope:SCOPE_DEV
405; GFX12-NEXT:    s_wait_alu 0xfffe
406; GFX12-NEXT:    s_setpc_b64 s[30:31]
407;
408; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
409; GFX940:       ; %bb.0:
410; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
411; GFX940-NEXT:    s_mov_b64 s[2:3], exec
412; GFX940-NEXT:    buffer_wbl2 sc1
413; GFX940-NEXT:  .LBB2_1: ; =>This Inner Loop Header: Depth=1
414; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
415; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
416; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
417; GFX940-NEXT:    v_readfirstlane_b32 s7, v3
418; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
419; GFX940-NEXT:    s_nop 0
420; GFX940-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
421; GFX940-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
422; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
423; GFX940-NEXT:    s_waitcnt vmcnt(0)
424; GFX940-NEXT:    buffer_atomic_add_f32 v5, v4, s[4:7], 0 offen offset:1024 sc0
425; GFX940-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
426; GFX940-NEXT:    ; implicit-def: $vgpr4
427; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
428; GFX940-NEXT:    s_cbranch_execnz .LBB2_1
429; GFX940-NEXT:  ; %bb.2:
430; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
431; GFX940-NEXT:    s_waitcnt vmcnt(0)
432; GFX940-NEXT:    v_mov_b32_e32 v0, v5
433; GFX940-NEXT:    buffer_inv sc1
434; GFX940-NEXT:    s_setpc_b64 s[30:31]
435;
436; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
437; GFX11:       ; %bb.0:
438; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
439; GFX11-NEXT:    s_mov_b32 s1, exec_lo
440; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
441; GFX11-NEXT:  .LBB2_1: ; =>This Inner Loop Header: Depth=1
442; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
443; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
444; GFX11-NEXT:    v_readfirstlane_b32 s6, v2
445; GFX11-NEXT:    v_readfirstlane_b32 s7, v3
446; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
447; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
448; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
449; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
450; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
451; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
452; GFX11-NEXT:    s_waitcnt vmcnt(0)
453; GFX11-NEXT:    buffer_atomic_add_f32 v5, v4, s[4:7], 0 offen offset:1024 glc
454; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
455; GFX11-NEXT:    ; implicit-def: $vgpr4
456; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
457; GFX11-NEXT:    s_cbranch_execnz .LBB2_1
458; GFX11-NEXT:  ; %bb.2:
459; GFX11-NEXT:    s_mov_b32 exec_lo, s1
460; GFX11-NEXT:    s_waitcnt vmcnt(0)
461; GFX11-NEXT:    v_mov_b32_e32 v0, v5
462; GFX11-NEXT:    buffer_gl1_inv
463; GFX11-NEXT:    buffer_gl0_inv
464; GFX11-NEXT:    s_setpc_b64 s[30:31]
465;
466; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
467; GFX10:       ; %bb.0:
468; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
469; GFX10-NEXT:    v_add_nc_u32_e32 v9, 0x400, v4
470; GFX10-NEXT:    s_mov_b32 s5, 0
471; GFX10-NEXT:    s_mov_b32 s6, exec_lo
472; GFX10-NEXT:  .LBB2_1: ; =>This Inner Loop Header: Depth=1
473; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
474; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
475; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
476; GFX10-NEXT:    v_readfirstlane_b32 s11, v3
477; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
478; GFX10-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
479; GFX10-NEXT:    s_and_b32 s4, vcc_lo, s4
480; GFX10-NEXT:    s_and_saveexec_b32 s4, s4
481; GFX10-NEXT:    buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
482; GFX10-NEXT:    ; implicit-def: $vgpr4
483; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
484; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
485; GFX10-NEXT:    s_cbranch_execnz .LBB2_1
486; GFX10-NEXT:  ; %bb.2:
487; GFX10-NEXT:    s_mov_b32 exec_lo, s6
488; GFX10-NEXT:  .LBB2_3: ; %atomicrmw.start
489; GFX10-NEXT:    ; =>This Loop Header: Depth=1
490; GFX10-NEXT:    ; Child Loop BB2_4 Depth 2
491; GFX10-NEXT:    s_waitcnt vmcnt(0)
492; GFX10-NEXT:    v_add_f32_e32 v7, v8, v5
493; GFX10-NEXT:    s_mov_b32 s6, exec_lo
494; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
495; GFX10-NEXT:    v_mov_b32_e32 v6, v7
496; GFX10-NEXT:    v_mov_b32_e32 v7, v8
497; GFX10-NEXT:  .LBB2_4: ; Parent Loop BB2_3 Depth=1
498; GFX10-NEXT:    ; => This Inner Loop Header: Depth=2
499; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
500; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
501; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
502; GFX10-NEXT:    v_readfirstlane_b32 s11, v3
503; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
504; GFX10-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
505; GFX10-NEXT:    s_and_b32 s4, vcc_lo, s4
506; GFX10-NEXT:    s_and_saveexec_b32 s4, s4
507; GFX10-NEXT:    s_waitcnt vmcnt(0)
508; GFX10-NEXT:    buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
509; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
510; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
511; GFX10-NEXT:    s_cbranch_execnz .LBB2_4
512; GFX10-NEXT:  ; %bb.5: ; in Loop: Header=BB2_3 Depth=1
513; GFX10-NEXT:    s_mov_b32 exec_lo, s6
514; GFX10-NEXT:    s_waitcnt vmcnt(0)
515; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v6, v8
516; GFX10-NEXT:    v_mov_b32_e32 v8, v6
517; GFX10-NEXT:    buffer_gl1_inv
518; GFX10-NEXT:    buffer_gl0_inv
519; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
520; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
521; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
522; GFX10-NEXT:    s_cbranch_execnz .LBB2_3
523; GFX10-NEXT:  ; %bb.6: ; %atomicrmw.end
524; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
525; GFX10-NEXT:    v_mov_b32_e32 v0, v6
526; GFX10-NEXT:    s_setpc_b64 s[30:31]
527;
528; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
529; GFX90A:       ; %bb.0:
530; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
531; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
532; GFX90A-NEXT:  .LBB2_1: ; =>This Inner Loop Header: Depth=1
533; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
534; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
535; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
536; GFX90A-NEXT:    v_readfirstlane_b32 s11, v3
537; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
538; GFX90A-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
539; GFX90A-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
540; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
541; GFX90A-NEXT:    s_waitcnt vmcnt(0)
542; GFX90A-NEXT:    buffer_atomic_add_f32 v5, v4, s[8:11], 0 offen offset:1024 glc
543; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
544; GFX90A-NEXT:    ; implicit-def: $vgpr4
545; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
546; GFX90A-NEXT:    s_cbranch_execnz .LBB2_1
547; GFX90A-NEXT:  ; %bb.2:
548; GFX90A-NEXT:    s_mov_b64 exec, s[6:7]
549; GFX90A-NEXT:    s_waitcnt vmcnt(0)
550; GFX90A-NEXT:    v_mov_b32_e32 v0, v5
551; GFX90A-NEXT:    buffer_wbinvl1
552; GFX90A-NEXT:    s_setpc_b64 s[30:31]
553;
554; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
555; GFX908:       ; %bb.0:
556; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
557; GFX908-NEXT:    v_add_u32_e32 v9, 0x400, v4
558; GFX908-NEXT:    s_mov_b64 s[6:7], exec
559; GFX908-NEXT:  .LBB2_1: ; =>This Inner Loop Header: Depth=1
560; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
561; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
562; GFX908-NEXT:    v_readfirstlane_b32 s10, v2
563; GFX908-NEXT:    v_readfirstlane_b32 s11, v3
564; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
565; GFX908-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
566; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
567; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
568; GFX908-NEXT:    s_nop 0
569; GFX908-NEXT:    buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
570; GFX908-NEXT:    ; implicit-def: $vgpr4
571; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
572; GFX908-NEXT:    s_cbranch_execnz .LBB2_1
573; GFX908-NEXT:  ; %bb.2:
574; GFX908-NEXT:    s_mov_b64 exec, s[6:7]
575; GFX908-NEXT:    s_mov_b64 s[6:7], 0
576; GFX908-NEXT:  .LBB2_3: ; %atomicrmw.start
577; GFX908-NEXT:    ; =>This Loop Header: Depth=1
578; GFX908-NEXT:    ; Child Loop BB2_4 Depth 2
579; GFX908-NEXT:    s_waitcnt vmcnt(0)
580; GFX908-NEXT:    v_add_f32_e32 v7, v8, v5
581; GFX908-NEXT:    v_mov_b32_e32 v6, v7
582; GFX908-NEXT:    s_mov_b64 s[12:13], exec
583; GFX908-NEXT:    v_mov_b32_e32 v7, v8
584; GFX908-NEXT:  .LBB2_4: ; Parent Loop BB2_3 Depth=1
585; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
586; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
587; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
588; GFX908-NEXT:    v_readfirstlane_b32 s10, v2
589; GFX908-NEXT:    v_readfirstlane_b32 s11, v3
590; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
591; GFX908-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
592; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
593; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
594; GFX908-NEXT:    s_waitcnt vmcnt(0)
595; GFX908-NEXT:    buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
596; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
597; GFX908-NEXT:    s_cbranch_execnz .LBB2_4
598; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB2_3 Depth=1
599; GFX908-NEXT:    s_mov_b64 exec, s[12:13]
600; GFX908-NEXT:    s_waitcnt vmcnt(0)
601; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v8
602; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
603; GFX908-NEXT:    v_mov_b32_e32 v8, v6
604; GFX908-NEXT:    buffer_wbinvl1
605; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
606; GFX908-NEXT:    s_cbranch_execnz .LBB2_3
607; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.end
608; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
609; GFX908-NEXT:    v_mov_b32_e32 v0, v6
610; GFX908-NEXT:    s_setpc_b64 s[30:31]
611;
612; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
613; GFX8:       ; %bb.0:
614; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
615; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x400, v4
616; GFX8-NEXT:    s_mov_b64 s[6:7], exec
617; GFX8-NEXT:  .LBB2_1: ; =>This Inner Loop Header: Depth=1
618; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
619; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
620; GFX8-NEXT:    v_readfirstlane_b32 s10, v2
621; GFX8-NEXT:    v_readfirstlane_b32 s11, v3
622; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
623; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
624; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
625; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
626; GFX8-NEXT:    s_nop 0
627; GFX8-NEXT:    buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
628; GFX8-NEXT:    ; implicit-def: $vgpr4
629; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
630; GFX8-NEXT:    s_cbranch_execnz .LBB2_1
631; GFX8-NEXT:  ; %bb.2:
632; GFX8-NEXT:    s_mov_b64 exec, s[6:7]
633; GFX8-NEXT:    s_mov_b64 s[6:7], 0
634; GFX8-NEXT:  .LBB2_3: ; %atomicrmw.start
635; GFX8-NEXT:    ; =>This Loop Header: Depth=1
636; GFX8-NEXT:    ; Child Loop BB2_4 Depth 2
637; GFX8-NEXT:    s_waitcnt vmcnt(0)
638; GFX8-NEXT:    v_add_f32_e32 v7, v8, v5
639; GFX8-NEXT:    v_mov_b32_e32 v6, v7
640; GFX8-NEXT:    s_mov_b64 s[12:13], exec
641; GFX8-NEXT:    v_mov_b32_e32 v7, v8
642; GFX8-NEXT:  .LBB2_4: ; Parent Loop BB2_3 Depth=1
643; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
644; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
645; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
646; GFX8-NEXT:    v_readfirstlane_b32 s10, v2
647; GFX8-NEXT:    v_readfirstlane_b32 s11, v3
648; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
649; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
650; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
651; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
652; GFX8-NEXT:    s_waitcnt vmcnt(0)
653; GFX8-NEXT:    buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
654; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
655; GFX8-NEXT:    s_cbranch_execnz .LBB2_4
656; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB2_3 Depth=1
657; GFX8-NEXT:    s_mov_b64 exec, s[12:13]
658; GFX8-NEXT:    s_waitcnt vmcnt(0)
659; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v8
660; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
661; GFX8-NEXT:    v_mov_b32_e32 v8, v6
662; GFX8-NEXT:    buffer_wbinvl1
663; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
664; GFX8-NEXT:    s_cbranch_execnz .LBB2_3
665; GFX8-NEXT:  ; %bb.6: ; %atomicrmw.end
666; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
667; GFX8-NEXT:    v_mov_b32_e32 v0, v6
668; GFX8-NEXT:    s_setpc_b64 s[30:31]
669;
670; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
671; GFX7:       ; %bb.0:
672; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
673; GFX7-NEXT:    v_add_i32_e32 v9, vcc, 0x400, v4
674; GFX7-NEXT:    s_mov_b64 s[6:7], exec
675; GFX7-NEXT:  .LBB2_1: ; =>This Inner Loop Header: Depth=1
676; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
677; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
678; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
679; GFX7-NEXT:    v_readfirstlane_b32 s11, v3
680; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
681; GFX7-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
682; GFX7-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
683; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
684; GFX7-NEXT:    buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
685; GFX7-NEXT:    ; implicit-def: $vgpr4
686; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
687; GFX7-NEXT:    s_cbranch_execnz .LBB2_1
688; GFX7-NEXT:  ; %bb.2:
689; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
690; GFX7-NEXT:    s_mov_b64 s[6:7], 0
691; GFX7-NEXT:  .LBB2_3: ; %atomicrmw.start
692; GFX7-NEXT:    ; =>This Loop Header: Depth=1
693; GFX7-NEXT:    ; Child Loop BB2_4 Depth 2
694; GFX7-NEXT:    s_waitcnt vmcnt(0)
695; GFX7-NEXT:    v_add_f32_e32 v7, v8, v5
696; GFX7-NEXT:    v_mov_b32_e32 v6, v7
697; GFX7-NEXT:    s_mov_b64 s[12:13], exec
698; GFX7-NEXT:    v_mov_b32_e32 v7, v8
699; GFX7-NEXT:  .LBB2_4: ; Parent Loop BB2_3 Depth=1
700; GFX7-NEXT:    ; => This Inner Loop Header: Depth=2
701; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
702; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
703; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
704; GFX7-NEXT:    v_readfirstlane_b32 s11, v3
705; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
706; GFX7-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
707; GFX7-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
708; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
709; GFX7-NEXT:    s_waitcnt vmcnt(0)
710; GFX7-NEXT:    buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
711; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
712; GFX7-NEXT:    s_cbranch_execnz .LBB2_4
713; GFX7-NEXT:  ; %bb.5: ; in Loop: Header=BB2_3 Depth=1
714; GFX7-NEXT:    s_mov_b64 exec, s[12:13]
715; GFX7-NEXT:    s_waitcnt vmcnt(0)
716; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v8
717; GFX7-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
718; GFX7-NEXT:    v_mov_b32_e32 v8, v6
719; GFX7-NEXT:    buffer_wbinvl1
720; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
721; GFX7-NEXT:    s_cbranch_execnz .LBB2_3
722; GFX7-NEXT:  ; %bb.6: ; %atomicrmw.end
723; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
724; GFX7-NEXT:    v_mov_b32_e32 v0, v6
725; GFX7-NEXT:    s_setpc_b64 s[30:31]
726;
727; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
728; GFX6:       ; %bb.0:
729; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
730; GFX6-NEXT:    v_add_i32_e32 v9, vcc, 0x400, v4
731; GFX6-NEXT:    s_mov_b64 s[6:7], exec
732; GFX6-NEXT:  .LBB2_1: ; =>This Inner Loop Header: Depth=1
733; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
734; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
735; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
736; GFX6-NEXT:    v_readfirstlane_b32 s11, v3
737; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
738; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
739; GFX6-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
740; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
741; GFX6-NEXT:    buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
742; GFX6-NEXT:    ; implicit-def: $vgpr4
743; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
744; GFX6-NEXT:    s_cbranch_execnz .LBB2_1
745; GFX6-NEXT:  ; %bb.2:
746; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
747; GFX6-NEXT:    s_mov_b64 s[6:7], 0
748; GFX6-NEXT:  .LBB2_3: ; %atomicrmw.start
749; GFX6-NEXT:    ; =>This Loop Header: Depth=1
750; GFX6-NEXT:    ; Child Loop BB2_4 Depth 2
751; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
752; GFX6-NEXT:    v_add_f32_e32 v7, v8, v5
753; GFX6-NEXT:    v_mov_b32_e32 v6, v7
754; GFX6-NEXT:    s_mov_b64 s[12:13], exec
755; GFX6-NEXT:    v_mov_b32_e32 v7, v8
756; GFX6-NEXT:  .LBB2_4: ; Parent Loop BB2_3 Depth=1
757; GFX6-NEXT:    ; => This Inner Loop Header: Depth=2
758; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
759; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
760; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
761; GFX6-NEXT:    v_readfirstlane_b32 s11, v3
762; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
763; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
764; GFX6-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
765; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
766; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
767; GFX6-NEXT:    buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
768; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
769; GFX6-NEXT:    s_cbranch_execnz .LBB2_4
770; GFX6-NEXT:  ; %bb.5: ; in Loop: Header=BB2_3 Depth=1
771; GFX6-NEXT:    s_mov_b64 exec, s[12:13]
772; GFX6-NEXT:    s_waitcnt vmcnt(0)
773; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v8
774; GFX6-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
775; GFX6-NEXT:    v_mov_b32_e32 v8, v6
776; GFX6-NEXT:    buffer_wbinvl1
777; GFX6-NEXT:    s_andn2_b64 exec, exec, s[6:7]
778; GFX6-NEXT:    s_cbranch_execnz .LBB2_3
779; GFX6-NEXT:  ; %bb.6: ; %atomicrmw.end
780; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
781; GFX6-NEXT:    v_mov_b32_e32 v0, v6
782; GFX6-NEXT:    s_waitcnt expcnt(0)
783; GFX6-NEXT:    s_setpc_b64 s[30:31]
784  %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
785  %result = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
786  ret float %result
787}
788
; Returning f32 atomicrmw fadd (syncscope "agent", seq_cst) through a buffer
; fat pointer (addrspace(7)) at a constant offset, tagged only with
; !amdgpu.no.fine.grained.memory (no !amdgpu.ignore.denormal.mode here).
; Per the assertions in this function, the GFX12, GFX940 and GFX11 runs select
; a returning buffer_atomic_add_f32, while the GFX10, GFX90A, GFX908, GFX8,
; GFX7 and GFX6 runs expand to a buffer_load_dword followed by a
; buffer_atomic_cmpswap retry loop.
789define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 {
790; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
791; GFX12:       ; %bb.0:
792; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
793; GFX12-NEXT:    s_wait_expcnt 0x0
794; GFX12-NEXT:    s_wait_samplecnt 0x0
795; GFX12-NEXT:    s_wait_bvhcnt 0x0
796; GFX12-NEXT:    s_wait_kmcnt 0x0
797; GFX12-NEXT:    v_mov_b32_e32 v1, s16
798; GFX12-NEXT:    s_wait_storecnt 0x0
799; GFX12-NEXT:    buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
800; GFX12-NEXT:    s_wait_loadcnt 0x0
801; GFX12-NEXT:    global_inv scope:SCOPE_DEV
802; GFX12-NEXT:    s_setpc_b64 s[30:31]
803;
804; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
805; GFX940:       ; %bb.0:
806; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
807; GFX940-NEXT:    v_mov_b32_e32 v1, s16
808; GFX940-NEXT:    buffer_wbl2 sc1
809; GFX940-NEXT:    buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0
810; GFX940-NEXT:    s_waitcnt vmcnt(0)
811; GFX940-NEXT:    buffer_inv sc1
812; GFX940-NEXT:    s_setpc_b64 s[30:31]
813;
814; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
815; GFX11:       ; %bb.0:
816; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
817; GFX11-NEXT:    v_mov_b32_e32 v1, s16
818; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
819; GFX11-NEXT:    buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 glc
820; GFX11-NEXT:    s_waitcnt vmcnt(0)
821; GFX11-NEXT:    buffer_gl1_inv
822; GFX11-NEXT:    buffer_gl0_inv
823; GFX11-NEXT:    s_setpc_b64 s[30:31]
824;
825; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
826; GFX10:       ; %bb.0:
827; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
828; GFX10-NEXT:    v_mov_b32_e32 v2, v0
829; GFX10-NEXT:    v_mov_b32_e32 v0, s20
830; GFX10-NEXT:    s_add_i32 s4, s20, 0x400
831; GFX10-NEXT:    v_mov_b32_e32 v3, s4
832; GFX10-NEXT:    s_mov_b32 s4, 0
833; GFX10-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
834; GFX10-NEXT:  .LBB3_1: ; %atomicrmw.start
835; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
836; GFX10-NEXT:    s_waitcnt vmcnt(0)
837; GFX10-NEXT:    v_mov_b32_e32 v5, v0
838; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
839; GFX10-NEXT:    v_add_f32_e32 v4, v5, v2
840; GFX10-NEXT:    v_mov_b32_e32 v0, v4
841; GFX10-NEXT:    v_mov_b32_e32 v1, v5
842; GFX10-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
843; GFX10-NEXT:    s_waitcnt vmcnt(0)
844; GFX10-NEXT:    buffer_gl1_inv
845; GFX10-NEXT:    buffer_gl0_inv
846; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
847; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
848; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
849; GFX10-NEXT:    s_cbranch_execnz .LBB3_1
850; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
851; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
852; GFX10-NEXT:    s_setpc_b64 s[30:31]
853;
854; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
855; GFX90A:       ; %bb.0:
856; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
857; GFX90A-NEXT:    v_mov_b32_e32 v2, v0
858; GFX90A-NEXT:    v_mov_b32_e32 v0, s20
859; GFX90A-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
860; GFX90A-NEXT:    s_add_i32 s6, s20, 0x400
861; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
862; GFX90A-NEXT:    v_mov_b32_e32 v3, s6
863; GFX90A-NEXT:  .LBB3_1: ; %atomicrmw.start
864; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
865; GFX90A-NEXT:    s_waitcnt vmcnt(0)
866; GFX90A-NEXT:    v_mov_b32_e32 v5, v0
867; GFX90A-NEXT:    v_add_f32_e32 v4, v5, v2
868; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
869; GFX90A-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
870; GFX90A-NEXT:    s_waitcnt vmcnt(0)
871; GFX90A-NEXT:    buffer_wbinvl1
872; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
873; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
874; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
875; GFX90A-NEXT:    s_cbranch_execnz .LBB3_1
876; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
877; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
878; GFX90A-NEXT:    s_setpc_b64 s[30:31]
879;
880; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
881; GFX908:       ; %bb.0:
882; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
883; GFX908-NEXT:    v_mov_b32_e32 v2, v0
884; GFX908-NEXT:    v_mov_b32_e32 v0, s20
885; GFX908-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
886; GFX908-NEXT:    s_add_i32 s6, s20, 0x400
887; GFX908-NEXT:    s_mov_b64 s[4:5], 0
888; GFX908-NEXT:    v_mov_b32_e32 v3, s6
889; GFX908-NEXT:  .LBB3_1: ; %atomicrmw.start
890; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
891; GFX908-NEXT:    s_waitcnt vmcnt(0)
892; GFX908-NEXT:    v_mov_b32_e32 v5, v0
893; GFX908-NEXT:    v_add_f32_e32 v4, v5, v2
894; GFX908-NEXT:    v_mov_b32_e32 v0, v4
895; GFX908-NEXT:    v_mov_b32_e32 v1, v5
896; GFX908-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
897; GFX908-NEXT:    s_waitcnt vmcnt(0)
898; GFX908-NEXT:    buffer_wbinvl1
899; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
900; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
901; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
902; GFX908-NEXT:    s_cbranch_execnz .LBB3_1
903; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
904; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
905; GFX908-NEXT:    s_setpc_b64 s[30:31]
906;
907; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
908; GFX8:       ; %bb.0:
909; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
910; GFX8-NEXT:    v_mov_b32_e32 v2, v0
911; GFX8-NEXT:    v_mov_b32_e32 v0, s20
912; GFX8-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
913; GFX8-NEXT:    s_add_i32 s6, s20, 0x400
914; GFX8-NEXT:    s_mov_b64 s[4:5], 0
915; GFX8-NEXT:    v_mov_b32_e32 v3, s6
916; GFX8-NEXT:  .LBB3_1: ; %atomicrmw.start
917; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
918; GFX8-NEXT:    s_waitcnt vmcnt(0)
919; GFX8-NEXT:    v_mov_b32_e32 v5, v0
920; GFX8-NEXT:    v_add_f32_e32 v4, v5, v2
921; GFX8-NEXT:    v_mov_b32_e32 v0, v4
922; GFX8-NEXT:    v_mov_b32_e32 v1, v5
923; GFX8-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
924; GFX8-NEXT:    s_waitcnt vmcnt(0)
925; GFX8-NEXT:    buffer_wbinvl1
926; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
927; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
928; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
929; GFX8-NEXT:    s_cbranch_execnz .LBB3_1
930; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
931; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
932; GFX8-NEXT:    s_setpc_b64 s[30:31]
933;
934; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
935; GFX7:       ; %bb.0:
936; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
937; GFX7-NEXT:    v_mov_b32_e32 v2, v0
938; GFX7-NEXT:    v_mov_b32_e32 v0, s20
939; GFX7-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
940; GFX7-NEXT:    s_add_i32 s6, s20, 0x400
941; GFX7-NEXT:    s_mov_b64 s[4:5], 0
942; GFX7-NEXT:    v_mov_b32_e32 v3, s6
943; GFX7-NEXT:  .LBB3_1: ; %atomicrmw.start
944; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
945; GFX7-NEXT:    s_waitcnt vmcnt(0)
946; GFX7-NEXT:    v_mov_b32_e32 v5, v0
947; GFX7-NEXT:    v_add_f32_e32 v4, v5, v2
948; GFX7-NEXT:    v_mov_b32_e32 v0, v4
949; GFX7-NEXT:    v_mov_b32_e32 v1, v5
950; GFX7-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
951; GFX7-NEXT:    s_waitcnt vmcnt(0)
952; GFX7-NEXT:    buffer_wbinvl1
953; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
954; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
955; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
956; GFX7-NEXT:    s_cbranch_execnz .LBB3_1
957; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
958; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
959; GFX7-NEXT:    s_setpc_b64 s[30:31]
960;
961; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
962; GFX6:       ; %bb.0:
963; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
964; GFX6-NEXT:    v_mov_b32_e32 v2, v0
965; GFX6-NEXT:    v_mov_b32_e32 v0, s20
966; GFX6-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
967; GFX6-NEXT:    s_add_i32 s6, s20, 0x400
968; GFX6-NEXT:    s_mov_b64 s[4:5], 0
969; GFX6-NEXT:    v_mov_b32_e32 v3, s6
970; GFX6-NEXT:  .LBB3_1: ; %atomicrmw.start
971; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
972; GFX6-NEXT:    s_waitcnt vmcnt(0)
973; GFX6-NEXT:    v_mov_b32_e32 v5, v0
974; GFX6-NEXT:    v_add_f32_e32 v4, v5, v2
975; GFX6-NEXT:    s_waitcnt expcnt(0)
976; GFX6-NEXT:    v_mov_b32_e32 v0, v4
977; GFX6-NEXT:    v_mov_b32_e32 v1, v5
978; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
979; GFX6-NEXT:    s_waitcnt vmcnt(0)
980; GFX6-NEXT:    buffer_wbinvl1
981; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
982; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
983; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
984; GFX6-NEXT:    s_cbranch_execnz .LBB3_1
985; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
986; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
987; GFX6-NEXT:    s_waitcnt expcnt(0)
988; GFX6-NEXT:    s_setpc_b64 s[30:31]
  ; Element index 256 of a float array is byte offset 1024; this is the
  ; "offset:1024" / "0x400" that the assertions in this function match.
989  %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
  ; The fetched old value is returned, so the returning form of the atomic
  ; (or the cmpswap loop result) must reach v0.
990  %result = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
991  ret float %result
992}
993
; f32 atomicrmw fadd (syncscope "agent", seq_cst) through a buffer fat pointer
; (addrspace(7)) at a constant offset whose result is never used; tagged only
; with !amdgpu.no.fine.grained.memory.
; Per the assertions in this function, the GFX12, GFX940 and GFX11 runs select
; buffer_atomic_add_f32 without a return modifier, while the GFX10, GFX90A,
; GFX908, GFX8, GFX7 and GFX6 runs expand to a buffer_load_dword followed by a
; buffer_atomic_cmpswap retry loop.
994define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 {
995; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory:
996; GFX12:       ; %bb.0:
997; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
998; GFX12-NEXT:    s_wait_expcnt 0x0
999; GFX12-NEXT:    s_wait_samplecnt 0x0
1000; GFX12-NEXT:    s_wait_bvhcnt 0x0
1001; GFX12-NEXT:    s_wait_kmcnt 0x0
1002; GFX12-NEXT:    v_mov_b32_e32 v1, s16
1003; GFX12-NEXT:    s_wait_storecnt 0x0
1004; GFX12-NEXT:    buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024
1005; GFX12-NEXT:    s_wait_storecnt 0x0
1006; GFX12-NEXT:    global_inv scope:SCOPE_DEV
1007; GFX12-NEXT:    s_setpc_b64 s[30:31]
1008;
1009; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory:
1010; GFX940:       ; %bb.0:
1011; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1012; GFX940-NEXT:    v_mov_b32_e32 v1, s16
1013; GFX940-NEXT:    buffer_wbl2 sc1
1014; GFX940-NEXT:    buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024
1015; GFX940-NEXT:    s_waitcnt vmcnt(0)
1016; GFX940-NEXT:    buffer_inv sc1
1017; GFX940-NEXT:    s_setpc_b64 s[30:31]
1018;
1019; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory:
1020; GFX11:       ; %bb.0:
1021; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1022; GFX11-NEXT:    v_mov_b32_e32 v1, s16
1023; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1024; GFX11-NEXT:    buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024
1025; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1026; GFX11-NEXT:    buffer_gl1_inv
1027; GFX11-NEXT:    buffer_gl0_inv
1028; GFX11-NEXT:    s_setpc_b64 s[30:31]
1029;
1030; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory:
1031; GFX10:       ; %bb.0:
1032; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1033; GFX10-NEXT:    v_mov_b32_e32 v1, s20
1034; GFX10-NEXT:    s_add_i32 s4, s20, 0x400
1035; GFX10-NEXT:    v_mov_b32_e32 v3, s4
1036; GFX10-NEXT:    s_mov_b32 s4, 0
1037; GFX10-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
1038; GFX10-NEXT:  .LBB4_1: ; %atomicrmw.start
1039; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
1040; GFX10-NEXT:    s_waitcnt vmcnt(0)
1041; GFX10-NEXT:    v_add_f32_e32 v1, v2, v0
1042; GFX10-NEXT:    v_mov_b32_e32 v5, v2
1043; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1044; GFX10-NEXT:    v_mov_b32_e32 v4, v1
1045; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
1046; GFX10-NEXT:    s_waitcnt vmcnt(0)
1047; GFX10-NEXT:    buffer_gl1_inv
1048; GFX10-NEXT:    buffer_gl0_inv
1049; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v2
1050; GFX10-NEXT:    v_mov_b32_e32 v2, v4
1051; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
1052; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
1053; GFX10-NEXT:    s_cbranch_execnz .LBB4_1
1054; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
1055; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1056; GFX10-NEXT:    s_setpc_b64 s[30:31]
1057;
1058; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory:
1059; GFX90A:       ; %bb.0:
1060; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1061; GFX90A-NEXT:    v_mov_b32_e32 v1, s20
1062; GFX90A-NEXT:    buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024
1063; GFX90A-NEXT:    s_add_i32 s6, s20, 0x400
1064; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
1065; GFX90A-NEXT:    v_mov_b32_e32 v1, s6
1066; GFX90A-NEXT:  .LBB4_1: ; %atomicrmw.start
1067; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
1068; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1069; GFX90A-NEXT:    v_add_f32_e32 v2, v3, v0
1070; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
1071; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc
1072; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1073; GFX90A-NEXT:    buffer_wbinvl1
1074; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
1075; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1076; GFX90A-NEXT:    v_mov_b32_e32 v3, v4
1077; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1078; GFX90A-NEXT:    s_cbranch_execnz .LBB4_1
1079; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
1080; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
1081; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1082;
1083; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory:
1084; GFX908:       ; %bb.0:
1085; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1086; GFX908-NEXT:    v_mov_b32_e32 v1, s20
1087; GFX908-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
1088; GFX908-NEXT:    s_add_i32 s6, s20, 0x400
1089; GFX908-NEXT:    s_mov_b64 s[4:5], 0
1090; GFX908-NEXT:    v_mov_b32_e32 v3, s6
1091; GFX908-NEXT:  .LBB4_1: ; %atomicrmw.start
1092; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
1093; GFX908-NEXT:    s_waitcnt vmcnt(0)
1094; GFX908-NEXT:    v_add_f32_e32 v1, v2, v0
1095; GFX908-NEXT:    v_mov_b32_e32 v5, v2
1096; GFX908-NEXT:    v_mov_b32_e32 v4, v1
1097; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
1098; GFX908-NEXT:    s_waitcnt vmcnt(0)
1099; GFX908-NEXT:    buffer_wbinvl1
1100; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v2
1101; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1102; GFX908-NEXT:    v_mov_b32_e32 v2, v4
1103; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1104; GFX908-NEXT:    s_cbranch_execnz .LBB4_1
1105; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
1106; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
1107; GFX908-NEXT:    s_setpc_b64 s[30:31]
1108;
1109; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory:
1110; GFX8:       ; %bb.0:
1111; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1112; GFX8-NEXT:    v_mov_b32_e32 v1, s20
1113; GFX8-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
1114; GFX8-NEXT:    s_add_i32 s6, s20, 0x400
1115; GFX8-NEXT:    s_mov_b64 s[4:5], 0
1116; GFX8-NEXT:    v_mov_b32_e32 v3, s6
1117; GFX8-NEXT:  .LBB4_1: ; %atomicrmw.start
1118; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
1119; GFX8-NEXT:    s_waitcnt vmcnt(0)
1120; GFX8-NEXT:    v_add_f32_e32 v1, v2, v0
1121; GFX8-NEXT:    v_mov_b32_e32 v5, v2
1122; GFX8-NEXT:    v_mov_b32_e32 v4, v1
1123; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
1124; GFX8-NEXT:    s_waitcnt vmcnt(0)
1125; GFX8-NEXT:    buffer_wbinvl1
1126; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v2
1127; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1128; GFX8-NEXT:    v_mov_b32_e32 v2, v4
1129; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1130; GFX8-NEXT:    s_cbranch_execnz .LBB4_1
1131; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
1132; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1133; GFX8-NEXT:    s_setpc_b64 s[30:31]
1134;
1135; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory:
1136; GFX7:       ; %bb.0:
1137; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1138; GFX7-NEXT:    v_mov_b32_e32 v1, s20
1139; GFX7-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
1140; GFX7-NEXT:    s_add_i32 s6, s20, 0x400
1141; GFX7-NEXT:    s_mov_b64 s[4:5], 0
1142; GFX7-NEXT:    v_mov_b32_e32 v3, s6
1143; GFX7-NEXT:  .LBB4_1: ; %atomicrmw.start
1144; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
1145; GFX7-NEXT:    s_waitcnt vmcnt(0)
1146; GFX7-NEXT:    v_add_f32_e32 v1, v2, v0
1147; GFX7-NEXT:    v_mov_b32_e32 v5, v2
1148; GFX7-NEXT:    v_mov_b32_e32 v4, v1
1149; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
1150; GFX7-NEXT:    s_waitcnt vmcnt(0)
1151; GFX7-NEXT:    buffer_wbinvl1
1152; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v2
1153; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1154; GFX7-NEXT:    v_mov_b32_e32 v2, v4
1155; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1156; GFX7-NEXT:    s_cbranch_execnz .LBB4_1
1157; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
1158; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
1159; GFX7-NEXT:    s_setpc_b64 s[30:31]
1160;
1161; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory:
1162; GFX6:       ; %bb.0:
1163; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1164; GFX6-NEXT:    v_mov_b32_e32 v1, s20
1165; GFX6-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
1166; GFX6-NEXT:    s_add_i32 s6, s20, 0x400
1167; GFX6-NEXT:    s_mov_b64 s[4:5], 0
1168; GFX6-NEXT:    v_mov_b32_e32 v3, s6
1169; GFX6-NEXT:  .LBB4_1: ; %atomicrmw.start
1170; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
1171; GFX6-NEXT:    s_waitcnt vmcnt(0)
1172; GFX6-NEXT:    v_add_f32_e32 v1, v2, v0
1173; GFX6-NEXT:    s_waitcnt expcnt(0)
1174; GFX6-NEXT:    v_mov_b32_e32 v5, v2
1175; GFX6-NEXT:    v_mov_b32_e32 v4, v1
1176; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
1177; GFX6-NEXT:    s_waitcnt vmcnt(0)
1178; GFX6-NEXT:    buffer_wbinvl1
1179; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v2
1180; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1181; GFX6-NEXT:    v_mov_b32_e32 v2, v4
1182; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1183; GFX6-NEXT:    s_cbranch_execnz .LBB4_1
1184; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
1185; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
1186; GFX6-NEXT:    s_waitcnt expcnt(0)
1187; GFX6-NEXT:    s_setpc_b64 s[30:31]
  ; Element index 256 of a float array is byte offset 1024; this is the
  ; "offset:1024" / "0x400" that the assertions in this function match.
1188  %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
  ; The atomic's result is never read: the function returns void.
1189  %unused = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
1190  ret void
1191}
1192
1193define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) {
1194; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset:
1195; GFX12:       ; %bb.0:
1196; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1197; GFX12-NEXT:    s_wait_expcnt 0x0
1198; GFX12-NEXT:    s_wait_samplecnt 0x0
1199; GFX12-NEXT:    s_wait_bvhcnt 0x0
1200; GFX12-NEXT:    s_wait_kmcnt 0x0
1201; GFX12-NEXT:    v_mov_b32_e32 v1, s16
1202; GFX12-NEXT:    s_wait_storecnt 0x0
1203; GFX12-NEXT:    buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
1204; GFX12-NEXT:    s_wait_loadcnt 0x0
1205; GFX12-NEXT:    global_inv scope:SCOPE_DEV
1206; GFX12-NEXT:    s_setpc_b64 s[30:31]
1207;
1208; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset:
1209; GFX940:       ; %bb.0:
1210; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1211; GFX940-NEXT:    v_mov_b32_e32 v1, s16
1212; GFX940-NEXT:    buffer_wbl2 sc1
1213; GFX940-NEXT:    buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0
1214; GFX940-NEXT:    s_waitcnt vmcnt(0)
1215; GFX940-NEXT:    buffer_inv sc1
1216; GFX940-NEXT:    s_setpc_b64 s[30:31]
1217;
1218; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset:
1219; GFX11:       ; %bb.0:
1220; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1221; GFX11-NEXT:    s_add_i32 s4, s16, 0x400
1222; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1223; GFX11-NEXT:    v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4
1224; GFX11-NEXT:    v_mov_b32_e32 v0, s16
1225; GFX11-NEXT:    s_mov_b32 s4, 0
1226; GFX11-NEXT:    buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
1227; GFX11-NEXT:  .LBB5_1: ; %atomicrmw.start
1228; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
1229; GFX11-NEXT:    s_waitcnt vmcnt(0)
1230; GFX11-NEXT:    v_mov_b32_e32 v5, v0
1231; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1232; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1233; GFX11-NEXT:    v_add_f32_e32 v4, v5, v2
1234; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
1235; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
1236; GFX11-NEXT:    s_waitcnt vmcnt(0)
1237; GFX11-NEXT:    buffer_gl1_inv
1238; GFX11-NEXT:    buffer_gl0_inv
1239; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
1240; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
1241; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1242; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
1243; GFX11-NEXT:    s_cbranch_execnz .LBB5_1
1244; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
1245; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1246; GFX11-NEXT:    s_setpc_b64 s[30:31]
1247;
1248; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset:
1249; GFX10:       ; %bb.0:
1250; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1251; GFX10-NEXT:    v_mov_b32_e32 v2, v0
1252; GFX10-NEXT:    v_mov_b32_e32 v0, s20
1253; GFX10-NEXT:    s_add_i32 s4, s20, 0x400
1254; GFX10-NEXT:    v_mov_b32_e32 v3, s4
1255; GFX10-NEXT:    s_mov_b32 s4, 0
1256; GFX10-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
1257; GFX10-NEXT:  .LBB5_1: ; %atomicrmw.start
1258; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
1259; GFX10-NEXT:    s_waitcnt vmcnt(0)
1260; GFX10-NEXT:    v_mov_b32_e32 v5, v0
1261; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1262; GFX10-NEXT:    v_add_f32_e32 v4, v5, v2
1263; GFX10-NEXT:    v_mov_b32_e32 v0, v4
1264; GFX10-NEXT:    v_mov_b32_e32 v1, v5
1265; GFX10-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
1266; GFX10-NEXT:    s_waitcnt vmcnt(0)
1267; GFX10-NEXT:    buffer_gl1_inv
1268; GFX10-NEXT:    buffer_gl0_inv
1269; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
1270; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
1271; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
1272; GFX10-NEXT:    s_cbranch_execnz .LBB5_1
1273; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
1274; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1275; GFX10-NEXT:    s_setpc_b64 s[30:31]
1276;
1277; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset:
1278; GFX90A:       ; %bb.0:
1279; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1280; GFX90A-NEXT:    v_mov_b32_e32 v2, v0
1281; GFX90A-NEXT:    v_mov_b32_e32 v0, s20
1282; GFX90A-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
1283; GFX90A-NEXT:    s_add_i32 s6, s20, 0x400
1284; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
1285; GFX90A-NEXT:    v_mov_b32_e32 v3, s6
1286; GFX90A-NEXT:  .LBB5_1: ; %atomicrmw.start
1287; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
1288; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1289; GFX90A-NEXT:    v_mov_b32_e32 v5, v0
1290; GFX90A-NEXT:    v_add_f32_e32 v4, v5, v2
1291; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
1292; GFX90A-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
1293; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1294; GFX90A-NEXT:    buffer_wbinvl1
1295; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
1296; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1297; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1298; GFX90A-NEXT:    s_cbranch_execnz .LBB5_1
1299; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
1300; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
1301; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1302;
1303; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset:
1304; GFX908:       ; %bb.0:
1305; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1306; GFX908-NEXT:    v_mov_b32_e32 v2, v0
1307; GFX908-NEXT:    v_mov_b32_e32 v0, s20
1308; GFX908-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
1309; GFX908-NEXT:    s_add_i32 s6, s20, 0x400
1310; GFX908-NEXT:    s_mov_b64 s[4:5], 0
1311; GFX908-NEXT:    v_mov_b32_e32 v3, s6
1312; GFX908-NEXT:  .LBB5_1: ; %atomicrmw.start
1313; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
1314; GFX908-NEXT:    s_waitcnt vmcnt(0)
1315; GFX908-NEXT:    v_mov_b32_e32 v5, v0
1316; GFX908-NEXT:    v_add_f32_e32 v4, v5, v2
1317; GFX908-NEXT:    v_mov_b32_e32 v0, v4
1318; GFX908-NEXT:    v_mov_b32_e32 v1, v5
1319; GFX908-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
1320; GFX908-NEXT:    s_waitcnt vmcnt(0)
1321; GFX908-NEXT:    buffer_wbinvl1
1322; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
1323; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1324; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1325; GFX908-NEXT:    s_cbranch_execnz .LBB5_1
1326; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
1327; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
1328; GFX908-NEXT:    s_setpc_b64 s[30:31]
1329;
1330; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset:
1331; GFX8:       ; %bb.0:
1332; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1333; GFX8-NEXT:    v_mov_b32_e32 v2, v0
1334; GFX8-NEXT:    v_mov_b32_e32 v0, s20
1335; GFX8-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
1336; GFX8-NEXT:    s_add_i32 s6, s20, 0x400
1337; GFX8-NEXT:    s_mov_b64 s[4:5], 0
1338; GFX8-NEXT:    v_mov_b32_e32 v3, s6
1339; GFX8-NEXT:  .LBB5_1: ; %atomicrmw.start
1340; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
1341; GFX8-NEXT:    s_waitcnt vmcnt(0)
1342; GFX8-NEXT:    v_mov_b32_e32 v5, v0
1343; GFX8-NEXT:    v_add_f32_e32 v4, v5, v2
1344; GFX8-NEXT:    v_mov_b32_e32 v0, v4
1345; GFX8-NEXT:    v_mov_b32_e32 v1, v5
1346; GFX8-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
1347; GFX8-NEXT:    s_waitcnt vmcnt(0)
1348; GFX8-NEXT:    buffer_wbinvl1
1349; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
1350; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1351; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1352; GFX8-NEXT:    s_cbranch_execnz .LBB5_1
1353; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
1354; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1355; GFX8-NEXT:    s_setpc_b64 s[30:31]
1356;
1357; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset:
1358; GFX7:       ; %bb.0:
1359; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1360; GFX7-NEXT:    v_mov_b32_e32 v2, v0
1361; GFX7-NEXT:    v_mov_b32_e32 v0, s20
1362; GFX7-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
1363; GFX7-NEXT:    s_add_i32 s6, s20, 0x400
1364; GFX7-NEXT:    s_mov_b64 s[4:5], 0
1365; GFX7-NEXT:    v_mov_b32_e32 v3, s6
1366; GFX7-NEXT:  .LBB5_1: ; %atomicrmw.start
1367; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
1368; GFX7-NEXT:    s_waitcnt vmcnt(0)
1369; GFX7-NEXT:    v_mov_b32_e32 v5, v0
1370; GFX7-NEXT:    v_add_f32_e32 v4, v5, v2
1371; GFX7-NEXT:    v_mov_b32_e32 v0, v4
1372; GFX7-NEXT:    v_mov_b32_e32 v1, v5
1373; GFX7-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
1374; GFX7-NEXT:    s_waitcnt vmcnt(0)
1375; GFX7-NEXT:    buffer_wbinvl1
1376; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
1377; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1378; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1379; GFX7-NEXT:    s_cbranch_execnz .LBB5_1
1380; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
1381; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
1382; GFX7-NEXT:    s_setpc_b64 s[30:31]
1383;
1384; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset:
1385; GFX6:       ; %bb.0:
1386; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1387; GFX6-NEXT:    v_mov_b32_e32 v2, v0
1388; GFX6-NEXT:    v_mov_b32_e32 v0, s20
1389; GFX6-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
1390; GFX6-NEXT:    s_add_i32 s6, s20, 0x400
1391; GFX6-NEXT:    s_mov_b64 s[4:5], 0
1392; GFX6-NEXT:    v_mov_b32_e32 v3, s6
1393; GFX6-NEXT:  .LBB5_1: ; %atomicrmw.start
1394; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
1395; GFX6-NEXT:    s_waitcnt vmcnt(0)
1396; GFX6-NEXT:    v_mov_b32_e32 v5, v0
1397; GFX6-NEXT:    v_add_f32_e32 v4, v5, v2
1398; GFX6-NEXT:    s_waitcnt expcnt(0)
1399; GFX6-NEXT:    v_mov_b32_e32 v0, v4
1400; GFX6-NEXT:    v_mov_b32_e32 v1, v5
1401; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
1402; GFX6-NEXT:    s_waitcnt vmcnt(0)
1403; GFX6-NEXT:    buffer_wbinvl1
1404; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
1405; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1406; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1407; GFX6-NEXT:    s_cbranch_execnz .LBB5_1
1408; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
1409; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
1410; GFX6-NEXT:    s_waitcnt expcnt(0)
1411; GFX6-NEXT:    s_setpc_b64 s[30:31]
1412  %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
1413  %result = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst
1414  ret float %result
1415}
1416
; Returning f32 atomicrmw fadd (agent scope, seq_cst) through a buffer fat
; pointer, annotated only with !amdgpu.no.remote.memory.
; Per the autogenerated checks below, the gfx1200 and gfx940 runs expect a
; single returning buffer_atomic_add_f32, while every other tested target
; expects an initial buffer load followed by a buffer_atomic_cmpswap retry loop.
1417define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, float %val) #0 {
1418; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory:
1419; GFX12:       ; %bb.0:
1420; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1421; GFX12-NEXT:    s_wait_expcnt 0x0
1422; GFX12-NEXT:    s_wait_samplecnt 0x0
1423; GFX12-NEXT:    s_wait_bvhcnt 0x0
1424; GFX12-NEXT:    s_wait_kmcnt 0x0
1425; GFX12-NEXT:    v_mov_b32_e32 v1, s16
1426; GFX12-NEXT:    s_wait_storecnt 0x0
1427; GFX12-NEXT:    buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
1428; GFX12-NEXT:    s_wait_loadcnt 0x0
1429; GFX12-NEXT:    global_inv scope:SCOPE_DEV
1430; GFX12-NEXT:    s_setpc_b64 s[30:31]
1431;
1432; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory:
1433; GFX940:       ; %bb.0:
1434; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1435; GFX940-NEXT:    v_mov_b32_e32 v1, s16
1436; GFX940-NEXT:    buffer_wbl2 sc1
1437; GFX940-NEXT:    buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0
1438; GFX940-NEXT:    s_waitcnt vmcnt(0)
1439; GFX940-NEXT:    buffer_inv sc1
1440; GFX940-NEXT:    s_setpc_b64 s[30:31]
1441;
1442; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory:
1443; GFX11:       ; %bb.0:
1444; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1445; GFX11-NEXT:    s_add_i32 s4, s16, 0x400
1446; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1447; GFX11-NEXT:    v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4
1448; GFX11-NEXT:    v_mov_b32_e32 v0, s16
1449; GFX11-NEXT:    s_mov_b32 s4, 0
1450; GFX11-NEXT:    buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
1451; GFX11-NEXT:  .LBB6_1: ; %atomicrmw.start
1452; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
1453; GFX11-NEXT:    s_waitcnt vmcnt(0)
1454; GFX11-NEXT:    v_mov_b32_e32 v5, v0
1455; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1456; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1457; GFX11-NEXT:    v_add_f32_e32 v4, v5, v2
1458; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
1459; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
1460; GFX11-NEXT:    s_waitcnt vmcnt(0)
1461; GFX11-NEXT:    buffer_gl1_inv
1462; GFX11-NEXT:    buffer_gl0_inv
1463; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
1464; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
1465; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1466; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
1467; GFX11-NEXT:    s_cbranch_execnz .LBB6_1
1468; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
1469; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1470; GFX11-NEXT:    s_setpc_b64 s[30:31]
1471;
1472; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory:
1473; GFX10:       ; %bb.0:
1474; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1475; GFX10-NEXT:    v_mov_b32_e32 v2, v0
1476; GFX10-NEXT:    v_mov_b32_e32 v0, s20
1477; GFX10-NEXT:    s_add_i32 s4, s20, 0x400
1478; GFX10-NEXT:    v_mov_b32_e32 v3, s4
1479; GFX10-NEXT:    s_mov_b32 s4, 0
1480; GFX10-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
1481; GFX10-NEXT:  .LBB6_1: ; %atomicrmw.start
1482; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
1483; GFX10-NEXT:    s_waitcnt vmcnt(0)
1484; GFX10-NEXT:    v_mov_b32_e32 v5, v0
1485; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1486; GFX10-NEXT:    v_add_f32_e32 v4, v5, v2
1487; GFX10-NEXT:    v_mov_b32_e32 v0, v4
1488; GFX10-NEXT:    v_mov_b32_e32 v1, v5
1489; GFX10-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
1490; GFX10-NEXT:    s_waitcnt vmcnt(0)
1491; GFX10-NEXT:    buffer_gl1_inv
1492; GFX10-NEXT:    buffer_gl0_inv
1493; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
1494; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
1495; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
1496; GFX10-NEXT:    s_cbranch_execnz .LBB6_1
1497; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
1498; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1499; GFX10-NEXT:    s_setpc_b64 s[30:31]
1500;
1501; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory:
1502; GFX90A:       ; %bb.0:
1503; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1504; GFX90A-NEXT:    v_mov_b32_e32 v2, v0
1505; GFX90A-NEXT:    v_mov_b32_e32 v0, s20
1506; GFX90A-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
1507; GFX90A-NEXT:    s_add_i32 s6, s20, 0x400
1508; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
1509; GFX90A-NEXT:    v_mov_b32_e32 v3, s6
1510; GFX90A-NEXT:  .LBB6_1: ; %atomicrmw.start
1511; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
1512; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1513; GFX90A-NEXT:    v_mov_b32_e32 v5, v0
1514; GFX90A-NEXT:    v_add_f32_e32 v4, v5, v2
1515; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
1516; GFX90A-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
1517; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1518; GFX90A-NEXT:    buffer_wbinvl1
1519; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
1520; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1521; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1522; GFX90A-NEXT:    s_cbranch_execnz .LBB6_1
1523; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
1524; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
1525; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1526;
1527; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory:
1528; GFX908:       ; %bb.0:
1529; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1530; GFX908-NEXT:    v_mov_b32_e32 v2, v0
1531; GFX908-NEXT:    v_mov_b32_e32 v0, s20
1532; GFX908-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
1533; GFX908-NEXT:    s_add_i32 s6, s20, 0x400
1534; GFX908-NEXT:    s_mov_b64 s[4:5], 0
1535; GFX908-NEXT:    v_mov_b32_e32 v3, s6
1536; GFX908-NEXT:  .LBB6_1: ; %atomicrmw.start
1537; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
1538; GFX908-NEXT:    s_waitcnt vmcnt(0)
1539; GFX908-NEXT:    v_mov_b32_e32 v5, v0
1540; GFX908-NEXT:    v_add_f32_e32 v4, v5, v2
1541; GFX908-NEXT:    v_mov_b32_e32 v0, v4
1542; GFX908-NEXT:    v_mov_b32_e32 v1, v5
1543; GFX908-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
1544; GFX908-NEXT:    s_waitcnt vmcnt(0)
1545; GFX908-NEXT:    buffer_wbinvl1
1546; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
1547; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1548; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1549; GFX908-NEXT:    s_cbranch_execnz .LBB6_1
1550; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
1551; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
1552; GFX908-NEXT:    s_setpc_b64 s[30:31]
1553;
1554; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory:
1555; GFX8:       ; %bb.0:
1556; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1557; GFX8-NEXT:    v_mov_b32_e32 v2, v0
1558; GFX8-NEXT:    v_mov_b32_e32 v0, s20
1559; GFX8-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
1560; GFX8-NEXT:    s_add_i32 s6, s20, 0x400
1561; GFX8-NEXT:    s_mov_b64 s[4:5], 0
1562; GFX8-NEXT:    v_mov_b32_e32 v3, s6
1563; GFX8-NEXT:  .LBB6_1: ; %atomicrmw.start
1564; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
1565; GFX8-NEXT:    s_waitcnt vmcnt(0)
1566; GFX8-NEXT:    v_mov_b32_e32 v5, v0
1567; GFX8-NEXT:    v_add_f32_e32 v4, v5, v2
1568; GFX8-NEXT:    v_mov_b32_e32 v0, v4
1569; GFX8-NEXT:    v_mov_b32_e32 v1, v5
1570; GFX8-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
1571; GFX8-NEXT:    s_waitcnt vmcnt(0)
1572; GFX8-NEXT:    buffer_wbinvl1
1573; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
1574; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1575; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1576; GFX8-NEXT:    s_cbranch_execnz .LBB6_1
1577; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
1578; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1579; GFX8-NEXT:    s_setpc_b64 s[30:31]
1580;
1581; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory:
1582; GFX7:       ; %bb.0:
1583; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1584; GFX7-NEXT:    v_mov_b32_e32 v2, v0
1585; GFX7-NEXT:    v_mov_b32_e32 v0, s20
1586; GFX7-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
1587; GFX7-NEXT:    s_add_i32 s6, s20, 0x400
1588; GFX7-NEXT:    s_mov_b64 s[4:5], 0
1589; GFX7-NEXT:    v_mov_b32_e32 v3, s6
1590; GFX7-NEXT:  .LBB6_1: ; %atomicrmw.start
1591; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
1592; GFX7-NEXT:    s_waitcnt vmcnt(0)
1593; GFX7-NEXT:    v_mov_b32_e32 v5, v0
1594; GFX7-NEXT:    v_add_f32_e32 v4, v5, v2
1595; GFX7-NEXT:    v_mov_b32_e32 v0, v4
1596; GFX7-NEXT:    v_mov_b32_e32 v1, v5
1597; GFX7-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
1598; GFX7-NEXT:    s_waitcnt vmcnt(0)
1599; GFX7-NEXT:    buffer_wbinvl1
1600; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
1601; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1602; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1603; GFX7-NEXT:    s_cbranch_execnz .LBB6_1
1604; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
1605; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
1606; GFX7-NEXT:    s_setpc_b64 s[30:31]
1607;
1608; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory:
1609; GFX6:       ; %bb.0:
1610; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1611; GFX6-NEXT:    v_mov_b32_e32 v2, v0
1612; GFX6-NEXT:    v_mov_b32_e32 v0, s20
1613; GFX6-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
1614; GFX6-NEXT:    s_add_i32 s6, s20, 0x400
1615; GFX6-NEXT:    s_mov_b64 s[4:5], 0
1616; GFX6-NEXT:    v_mov_b32_e32 v3, s6
1617; GFX6-NEXT:  .LBB6_1: ; %atomicrmw.start
1618; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
1619; GFX6-NEXT:    s_waitcnt vmcnt(0)
1620; GFX6-NEXT:    v_mov_b32_e32 v5, v0
1621; GFX6-NEXT:    v_add_f32_e32 v4, v5, v2
1622; GFX6-NEXT:    s_waitcnt expcnt(0)
1623; GFX6-NEXT:    v_mov_b32_e32 v0, v4
1624; GFX6-NEXT:    v_mov_b32_e32 v1, v5
1625; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
1626; GFX6-NEXT:    s_waitcnt vmcnt(0)
1627; GFX6-NEXT:    buffer_wbinvl1
1628; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
1629; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1630; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1631; GFX6-NEXT:    s_cbranch_execnz .LBB6_1
1632; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
1633; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
1634; GFX6-NEXT:    s_waitcnt expcnt(0)
1635; GFX6-NEXT:    s_setpc_b64 s[30:31]
; The GEP steps 256 floats past %ptr, i.e. 1024 bytes (0x400); this is the
; constant that shows up as offset 1024 / 0x400 in the expected code above.
1636  %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
1637  %result = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
1638  ret float %result
1639}
1640
; Returning f32 atomicrmw fadd (agent scope, seq_cst) through a buffer fat
; pointer, annotated with both !amdgpu.no.remote.memory and
; !amdgpu.ignore.denormal.mode.
; Per the autogenerated checks below, the gfx1200 and gfx940 runs expect a
; single returning buffer_atomic_add_f32, while every other tested target
; expects an initial buffer load followed by a buffer_atomic_cmpswap retry loop.
1641define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode(ptr addrspace(7) inreg %ptr, float %val) #0 {
1642; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
1643; GFX12:       ; %bb.0:
1644; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1645; GFX12-NEXT:    s_wait_expcnt 0x0
1646; GFX12-NEXT:    s_wait_samplecnt 0x0
1647; GFX12-NEXT:    s_wait_bvhcnt 0x0
1648; GFX12-NEXT:    s_wait_kmcnt 0x0
1649; GFX12-NEXT:    v_mov_b32_e32 v1, s16
1650; GFX12-NEXT:    s_wait_storecnt 0x0
1651; GFX12-NEXT:    buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
1652; GFX12-NEXT:    s_wait_loadcnt 0x0
1653; GFX12-NEXT:    global_inv scope:SCOPE_DEV
1654; GFX12-NEXT:    s_setpc_b64 s[30:31]
1655;
1656; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
1657; GFX940:       ; %bb.0:
1658; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1659; GFX940-NEXT:    v_mov_b32_e32 v1, s16
1660; GFX940-NEXT:    buffer_wbl2 sc1
1661; GFX940-NEXT:    buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0
1662; GFX940-NEXT:    s_waitcnt vmcnt(0)
1663; GFX940-NEXT:    buffer_inv sc1
1664; GFX940-NEXT:    s_setpc_b64 s[30:31]
1665;
1666; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
1667; GFX11:       ; %bb.0:
1668; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1669; GFX11-NEXT:    s_add_i32 s4, s16, 0x400
1670; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1671; GFX11-NEXT:    v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4
1672; GFX11-NEXT:    v_mov_b32_e32 v0, s16
1673; GFX11-NEXT:    s_mov_b32 s4, 0
1674; GFX11-NEXT:    buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
1675; GFX11-NEXT:  .LBB7_1: ; %atomicrmw.start
1676; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
1677; GFX11-NEXT:    s_waitcnt vmcnt(0)
1678; GFX11-NEXT:    v_mov_b32_e32 v5, v0
1679; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1680; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1681; GFX11-NEXT:    v_add_f32_e32 v4, v5, v2
1682; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
1683; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
1684; GFX11-NEXT:    s_waitcnt vmcnt(0)
1685; GFX11-NEXT:    buffer_gl1_inv
1686; GFX11-NEXT:    buffer_gl0_inv
1687; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
1688; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
1689; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1690; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
1691; GFX11-NEXT:    s_cbranch_execnz .LBB7_1
1692; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
1693; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1694; GFX11-NEXT:    s_setpc_b64 s[30:31]
1695;
1696; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
1697; GFX10:       ; %bb.0:
1698; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1699; GFX10-NEXT:    v_mov_b32_e32 v2, v0
1700; GFX10-NEXT:    v_mov_b32_e32 v0, s20
1701; GFX10-NEXT:    s_add_i32 s4, s20, 0x400
1702; GFX10-NEXT:    v_mov_b32_e32 v3, s4
1703; GFX10-NEXT:    s_mov_b32 s4, 0
1704; GFX10-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
1705; GFX10-NEXT:  .LBB7_1: ; %atomicrmw.start
1706; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
1707; GFX10-NEXT:    s_waitcnt vmcnt(0)
1708; GFX10-NEXT:    v_mov_b32_e32 v5, v0
1709; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1710; GFX10-NEXT:    v_add_f32_e32 v4, v5, v2
1711; GFX10-NEXT:    v_mov_b32_e32 v0, v4
1712; GFX10-NEXT:    v_mov_b32_e32 v1, v5
1713; GFX10-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
1714; GFX10-NEXT:    s_waitcnt vmcnt(0)
1715; GFX10-NEXT:    buffer_gl1_inv
1716; GFX10-NEXT:    buffer_gl0_inv
1717; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
1718; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
1719; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
1720; GFX10-NEXT:    s_cbranch_execnz .LBB7_1
1721; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
1722; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1723; GFX10-NEXT:    s_setpc_b64 s[30:31]
1724;
1725; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
1726; GFX90A:       ; %bb.0:
1727; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1728; GFX90A-NEXT:    v_mov_b32_e32 v2, v0
1729; GFX90A-NEXT:    v_mov_b32_e32 v0, s20
1730; GFX90A-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
1731; GFX90A-NEXT:    s_add_i32 s6, s20, 0x400
1732; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
1733; GFX90A-NEXT:    v_mov_b32_e32 v3, s6
1734; GFX90A-NEXT:  .LBB7_1: ; %atomicrmw.start
1735; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
1736; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1737; GFX90A-NEXT:    v_mov_b32_e32 v5, v0
1738; GFX90A-NEXT:    v_add_f32_e32 v4, v5, v2
1739; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
1740; GFX90A-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
1741; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1742; GFX90A-NEXT:    buffer_wbinvl1
1743; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
1744; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1745; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1746; GFX90A-NEXT:    s_cbranch_execnz .LBB7_1
1747; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
1748; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
1749; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1750;
1751; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
1752; GFX908:       ; %bb.0:
1753; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1754; GFX908-NEXT:    v_mov_b32_e32 v2, v0
1755; GFX908-NEXT:    v_mov_b32_e32 v0, s20
1756; GFX908-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
1757; GFX908-NEXT:    s_add_i32 s6, s20, 0x400
1758; GFX908-NEXT:    s_mov_b64 s[4:5], 0
1759; GFX908-NEXT:    v_mov_b32_e32 v3, s6
1760; GFX908-NEXT:  .LBB7_1: ; %atomicrmw.start
1761; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
1762; GFX908-NEXT:    s_waitcnt vmcnt(0)
1763; GFX908-NEXT:    v_mov_b32_e32 v5, v0
1764; GFX908-NEXT:    v_add_f32_e32 v4, v5, v2
1765; GFX908-NEXT:    v_mov_b32_e32 v0, v4
1766; GFX908-NEXT:    v_mov_b32_e32 v1, v5
1767; GFX908-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
1768; GFX908-NEXT:    s_waitcnt vmcnt(0)
1769; GFX908-NEXT:    buffer_wbinvl1
1770; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
1771; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1772; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1773; GFX908-NEXT:    s_cbranch_execnz .LBB7_1
1774; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
1775; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
1776; GFX908-NEXT:    s_setpc_b64 s[30:31]
1777;
1778; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
1779; GFX8:       ; %bb.0:
1780; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1781; GFX8-NEXT:    v_mov_b32_e32 v2, v0
1782; GFX8-NEXT:    v_mov_b32_e32 v0, s20
1783; GFX8-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
1784; GFX8-NEXT:    s_add_i32 s6, s20, 0x400
1785; GFX8-NEXT:    s_mov_b64 s[4:5], 0
1786; GFX8-NEXT:    v_mov_b32_e32 v3, s6
1787; GFX8-NEXT:  .LBB7_1: ; %atomicrmw.start
1788; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
1789; GFX8-NEXT:    s_waitcnt vmcnt(0)
1790; GFX8-NEXT:    v_mov_b32_e32 v5, v0
1791; GFX8-NEXT:    v_add_f32_e32 v4, v5, v2
1792; GFX8-NEXT:    v_mov_b32_e32 v0, v4
1793; GFX8-NEXT:    v_mov_b32_e32 v1, v5
1794; GFX8-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
1795; GFX8-NEXT:    s_waitcnt vmcnt(0)
1796; GFX8-NEXT:    buffer_wbinvl1
1797; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
1798; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1799; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1800; GFX8-NEXT:    s_cbranch_execnz .LBB7_1
1801; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
1802; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1803; GFX8-NEXT:    s_setpc_b64 s[30:31]
1804;
1805; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
1806; GFX7:       ; %bb.0:
1807; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1808; GFX7-NEXT:    v_mov_b32_e32 v2, v0
1809; GFX7-NEXT:    v_mov_b32_e32 v0, s20
1810; GFX7-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
1811; GFX7-NEXT:    s_add_i32 s6, s20, 0x400
1812; GFX7-NEXT:    s_mov_b64 s[4:5], 0
1813; GFX7-NEXT:    v_mov_b32_e32 v3, s6
1814; GFX7-NEXT:  .LBB7_1: ; %atomicrmw.start
1815; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
1816; GFX7-NEXT:    s_waitcnt vmcnt(0)
1817; GFX7-NEXT:    v_mov_b32_e32 v5, v0
1818; GFX7-NEXT:    v_add_f32_e32 v4, v5, v2
1819; GFX7-NEXT:    v_mov_b32_e32 v0, v4
1820; GFX7-NEXT:    v_mov_b32_e32 v1, v5
1821; GFX7-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
1822; GFX7-NEXT:    s_waitcnt vmcnt(0)
1823; GFX7-NEXT:    buffer_wbinvl1
1824; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
1825; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1826; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1827; GFX7-NEXT:    s_cbranch_execnz .LBB7_1
1828; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
1829; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
1830; GFX7-NEXT:    s_setpc_b64 s[30:31]
1831;
1832; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
1833; GFX6:       ; %bb.0:
1834; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1835; GFX6-NEXT:    v_mov_b32_e32 v2, v0
1836; GFX6-NEXT:    v_mov_b32_e32 v0, s20
1837; GFX6-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
1838; GFX6-NEXT:    s_add_i32 s6, s20, 0x400
1839; GFX6-NEXT:    s_mov_b64 s[4:5], 0
1840; GFX6-NEXT:    v_mov_b32_e32 v3, s6
1841; GFX6-NEXT:  .LBB7_1: ; %atomicrmw.start
1842; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
1843; GFX6-NEXT:    s_waitcnt vmcnt(0)
1844; GFX6-NEXT:    v_mov_b32_e32 v5, v0
1845; GFX6-NEXT:    v_add_f32_e32 v4, v5, v2
1846; GFX6-NEXT:    s_waitcnt expcnt(0)
1847; GFX6-NEXT:    v_mov_b32_e32 v0, v4
1848; GFX6-NEXT:    v_mov_b32_e32 v1, v5
1849; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
1850; GFX6-NEXT:    s_waitcnt vmcnt(0)
1851; GFX6-NEXT:    buffer_wbinvl1
1852; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
1853; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1854; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1855; GFX6-NEXT:    s_cbranch_execnz .LBB7_1
1856; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
1857; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
1858; GFX6-NEXT:    s_waitcnt expcnt(0)
1859; GFX6-NEXT:    s_setpc_b64 s[30:31]
; The GEP steps 256 floats past %ptr, i.e. 1024 bytes (0x400); this is the
; constant that shows up as offset 1024 / 0x400 in the expected code above.
1860  %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
1861  %result = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
1862  ret float %result
1863}
1864
1865; --------------------------------------------------------------------
1866; double
1867; --------------------------------------------------------------------
1868
1869define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, double %val) #0 {
1870; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory:
1871; GFX12:       ; %bb.0:
1872; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1873; GFX12-NEXT:    s_wait_expcnt 0x0
1874; GFX12-NEXT:    s_wait_samplecnt 0x0
1875; GFX12-NEXT:    s_wait_bvhcnt 0x0
1876; GFX12-NEXT:    s_wait_kmcnt 0x0
1877; GFX12-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
1878; GFX12-NEXT:    v_mov_b32_e32 v0, s16
1879; GFX12-NEXT:    s_add_co_i32 s4, s16, 0x800
1880; GFX12-NEXT:    s_wait_alu 0xfffe
1881; GFX12-NEXT:    v_mov_b32_e32 v6, s4
1882; GFX12-NEXT:    s_mov_b32 s4, 0
1883; GFX12-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
1884; GFX12-NEXT:  .LBB8_1: ; %atomicrmw.start
1885; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
1886; GFX12-NEXT:    s_wait_loadcnt 0x0
1887; GFX12-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
1888; GFX12-NEXT:    s_wait_storecnt 0x0
1889; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1890; GFX12-NEXT:    v_add_f64_e32 v[7:8], v[9:10], v[4:5]
1891; GFX12-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
1892; GFX12-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
1893; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
1894; GFX12-NEXT:    s_wait_loadcnt 0x0
1895; GFX12-NEXT:    global_inv scope:SCOPE_DEV
1896; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
1897; GFX12-NEXT:    s_wait_alu 0xfffe
1898; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
1899; GFX12-NEXT:    s_wait_alu 0xfffe
1900; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
1901; GFX12-NEXT:    s_cbranch_execnz .LBB8_1
1902; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
1903; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1904; GFX12-NEXT:    s_wait_alu 0xfffe
1905; GFX12-NEXT:    s_setpc_b64 s[30:31]
1906;
1907; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory:
1908; GFX940:       ; %bb.0:
1909; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1910; GFX940-NEXT:    v_mov_b32_e32 v2, s16
1911; GFX940-NEXT:    buffer_wbl2 sc1
1912; GFX940-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0
1913; GFX940-NEXT:    s_waitcnt vmcnt(0)
1914; GFX940-NEXT:    buffer_inv sc1
1915; GFX940-NEXT:    s_setpc_b64 s[30:31]
1916;
1917; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory:
1918; GFX11:       ; %bb.0:
1919; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1920; GFX11-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
1921; GFX11-NEXT:    v_mov_b32_e32 v0, s16
1922; GFX11-NEXT:    s_add_i32 s4, s16, 0x800
1923; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1924; GFX11-NEXT:    v_mov_b32_e32 v6, s4
1925; GFX11-NEXT:    s_mov_b32 s4, 0
1926; GFX11-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
1927; GFX11-NEXT:  .LBB8_1: ; %atomicrmw.start
1928; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
1929; GFX11-NEXT:    s_waitcnt vmcnt(0)
1930; GFX11-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
1931; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1932; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1933; GFX11-NEXT:    v_add_f64 v[7:8], v[9:10], v[4:5]
1934; GFX11-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
1935; GFX11-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
1936; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
1937; GFX11-NEXT:    s_waitcnt vmcnt(0)
1938; GFX11-NEXT:    buffer_gl1_inv
1939; GFX11-NEXT:    buffer_gl0_inv
1940; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
1941; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
1942; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1943; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
1944; GFX11-NEXT:    s_cbranch_execnz .LBB8_1
1945; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
1946; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1947; GFX11-NEXT:    s_setpc_b64 s[30:31]
1948;
1949; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory:
1950; GFX10:       ; %bb.0:
1951; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1952; GFX10-NEXT:    v_mov_b32_e32 v4, v0
1953; GFX10-NEXT:    v_mov_b32_e32 v0, s20
1954; GFX10-NEXT:    v_mov_b32_e32 v5, v1
1955; GFX10-NEXT:    s_add_i32 s4, s20, 0x800
1956; GFX10-NEXT:    v_mov_b32_e32 v6, s4
1957; GFX10-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
1958; GFX10-NEXT:    s_mov_b32 s4, 0
1959; GFX10-NEXT:  .LBB8_1: ; %atomicrmw.start
1960; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
1961; GFX10-NEXT:    s_waitcnt vmcnt(0)
1962; GFX10-NEXT:    v_mov_b32_e32 v10, v1
1963; GFX10-NEXT:    v_mov_b32_e32 v9, v0
1964; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1965; GFX10-NEXT:    v_add_f64 v[7:8], v[9:10], v[4:5]
1966; GFX10-NEXT:    v_mov_b32_e32 v0, v7
1967; GFX10-NEXT:    v_mov_b32_e32 v1, v8
1968; GFX10-NEXT:    v_mov_b32_e32 v2, v9
1969; GFX10-NEXT:    v_mov_b32_e32 v3, v10
1970; GFX10-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
1971; GFX10-NEXT:    s_waitcnt vmcnt(0)
1972; GFX10-NEXT:    buffer_gl1_inv
1973; GFX10-NEXT:    buffer_gl0_inv
1974; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
1975; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
1976; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
1977; GFX10-NEXT:    s_cbranch_execnz .LBB8_1
1978; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
1979; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1980; GFX10-NEXT:    s_setpc_b64 s[30:31]
1981;
1982; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory:
1983; GFX90A:       ; %bb.0:
1984; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1985; GFX90A-NEXT:    v_mov_b32_e32 v2, s20
1986; GFX90A-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[16:19], 0 offen offset:2048 glc
1987; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1988; GFX90A-NEXT:    buffer_wbinvl1
1989; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1990;
1991; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory:
1992; GFX908:       ; %bb.0:
1993; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1994; GFX908-NEXT:    v_mov_b32_e32 v4, v0
1995; GFX908-NEXT:    v_mov_b32_e32 v0, s20
1996; GFX908-NEXT:    v_mov_b32_e32 v5, v1
1997; GFX908-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
1998; GFX908-NEXT:    s_add_i32 s6, s20, 0x800
1999; GFX908-NEXT:    s_mov_b64 s[4:5], 0
2000; GFX908-NEXT:    v_mov_b32_e32 v6, s6
2001; GFX908-NEXT:  .LBB8_1: ; %atomicrmw.start
2002; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
2003; GFX908-NEXT:    s_waitcnt vmcnt(0)
2004; GFX908-NEXT:    v_mov_b32_e32 v10, v1
2005; GFX908-NEXT:    v_mov_b32_e32 v9, v0
2006; GFX908-NEXT:    v_add_f64 v[7:8], v[9:10], v[4:5]
2007; GFX908-NEXT:    v_mov_b32_e32 v0, v7
2008; GFX908-NEXT:    v_mov_b32_e32 v1, v8
2009; GFX908-NEXT:    v_mov_b32_e32 v2, v9
2010; GFX908-NEXT:    v_mov_b32_e32 v3, v10
2011; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
2012; GFX908-NEXT:    s_waitcnt vmcnt(0)
2013; GFX908-NEXT:    buffer_wbinvl1
2014; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
2015; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2016; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2017; GFX908-NEXT:    s_cbranch_execnz .LBB8_1
2018; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
2019; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
2020; GFX908-NEXT:    s_setpc_b64 s[30:31]
2021;
2022; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory:
2023; GFX8:       ; %bb.0:
2024; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2025; GFX8-NEXT:    v_mov_b32_e32 v4, v0
2026; GFX8-NEXT:    v_mov_b32_e32 v0, s20
2027; GFX8-NEXT:    v_mov_b32_e32 v5, v1
2028; GFX8-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
2029; GFX8-NEXT:    s_add_i32 s6, s20, 0x800
2030; GFX8-NEXT:    s_mov_b64 s[4:5], 0
2031; GFX8-NEXT:    v_mov_b32_e32 v6, s6
2032; GFX8-NEXT:  .LBB8_1: ; %atomicrmw.start
2033; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
2034; GFX8-NEXT:    s_waitcnt vmcnt(0)
2035; GFX8-NEXT:    v_mov_b32_e32 v10, v1
2036; GFX8-NEXT:    v_mov_b32_e32 v9, v0
2037; GFX8-NEXT:    v_add_f64 v[7:8], v[9:10], v[4:5]
2038; GFX8-NEXT:    v_mov_b32_e32 v0, v7
2039; GFX8-NEXT:    v_mov_b32_e32 v1, v8
2040; GFX8-NEXT:    v_mov_b32_e32 v2, v9
2041; GFX8-NEXT:    v_mov_b32_e32 v3, v10
2042; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
2043; GFX8-NEXT:    s_waitcnt vmcnt(0)
2044; GFX8-NEXT:    buffer_wbinvl1
2045; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
2046; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2047; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2048; GFX8-NEXT:    s_cbranch_execnz .LBB8_1
2049; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
2050; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2051; GFX8-NEXT:    s_setpc_b64 s[30:31]
2052;
2053; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory:
2054; GFX7:       ; %bb.0:
2055; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2056; GFX7-NEXT:    v_mov_b32_e32 v4, v0
2057; GFX7-NEXT:    v_mov_b32_e32 v0, s20
2058; GFX7-NEXT:    v_mov_b32_e32 v5, v1
2059; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
2060; GFX7-NEXT:    s_add_i32 s6, s20, 0x800
2061; GFX7-NEXT:    s_mov_b64 s[4:5], 0
2062; GFX7-NEXT:    v_mov_b32_e32 v6, s6
2063; GFX7-NEXT:  .LBB8_1: ; %atomicrmw.start
2064; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
2065; GFX7-NEXT:    s_waitcnt vmcnt(0)
2066; GFX7-NEXT:    v_mov_b32_e32 v10, v1
2067; GFX7-NEXT:    v_mov_b32_e32 v9, v0
2068; GFX7-NEXT:    v_add_f64 v[7:8], v[9:10], v[4:5]
2069; GFX7-NEXT:    v_mov_b32_e32 v0, v7
2070; GFX7-NEXT:    v_mov_b32_e32 v1, v8
2071; GFX7-NEXT:    v_mov_b32_e32 v2, v9
2072; GFX7-NEXT:    v_mov_b32_e32 v3, v10
2073; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
2074; GFX7-NEXT:    s_waitcnt vmcnt(0)
2075; GFX7-NEXT:    buffer_wbinvl1
2076; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
2077; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2078; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2079; GFX7-NEXT:    s_cbranch_execnz .LBB8_1
2080; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
2081; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
2082; GFX7-NEXT:    s_setpc_b64 s[30:31]
2083;
2084; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory:
2085; GFX6:       ; %bb.0:
2086; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2087; GFX6-NEXT:    v_mov_b32_e32 v4, v0
2088; GFX6-NEXT:    v_mov_b32_e32 v0, s20
2089; GFX6-NEXT:    v_mov_b32_e32 v5, v1
2090; GFX6-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
2091; GFX6-NEXT:    s_add_i32 s6, s20, 0x800
2092; GFX6-NEXT:    s_mov_b64 s[4:5], 0
2093; GFX6-NEXT:    v_mov_b32_e32 v6, s6
2094; GFX6-NEXT:  .LBB8_1: ; %atomicrmw.start
2095; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
2096; GFX6-NEXT:    s_waitcnt vmcnt(0)
2097; GFX6-NEXT:    v_mov_b32_e32 v10, v1
2098; GFX6-NEXT:    v_mov_b32_e32 v9, v0
2099; GFX6-NEXT:    v_add_f64 v[7:8], v[9:10], v[4:5]
2100; GFX6-NEXT:    s_waitcnt expcnt(0)
2101; GFX6-NEXT:    v_mov_b32_e32 v0, v7
2102; GFX6-NEXT:    v_mov_b32_e32 v1, v8
2103; GFX6-NEXT:    v_mov_b32_e32 v2, v9
2104; GFX6-NEXT:    v_mov_b32_e32 v3, v10
2105; GFX6-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
2106; GFX6-NEXT:    s_waitcnt vmcnt(0)
2107; GFX6-NEXT:    buffer_wbinvl1
2108; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
2109; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2110; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2111; GFX6-NEXT:    s_cbranch_execnz .LBB8_1
2112; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
2113; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
2114; GFX6-NEXT:    s_waitcnt expcnt(0)
2115; GFX6-NEXT:    s_setpc_b64 s[30:31]
2116  %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
2117  %result = atomicrmw fadd ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
2118  ret double %result
2119}
2120
; No-return f64 case: an agent-scope, seq_cst atomicrmw fadd through a buffer
; fat pointer (ptr addrspace(7), passed inreg) whose result is discarded; the
; function returns void. The GEP advances by 256 doubles, i.e. 2048 bytes,
; which appears as offset:2048 / 0x800 in the expected code below.
; Per the checks below, gfx940 and gfx90a expect a single
; buffer_atomic_add_f64, while every other target expects an initial buffer
; load followed by a v_add_f64 + buffer_atomic_cmpswap retry loop.
2121define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, double %val) #0 {
2122; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory:
2123; GFX12:       ; %bb.0:
2124; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2125; GFX12-NEXT:    s_wait_expcnt 0x0
2126; GFX12-NEXT:    s_wait_samplecnt 0x0
2127; GFX12-NEXT:    s_wait_bvhcnt 0x0
2128; GFX12-NEXT:    s_wait_kmcnt 0x0
2129; GFX12-NEXT:    v_mov_b32_e32 v2, s16
2130; GFX12-NEXT:    s_add_co_i32 s4, s16, 0x800
2131; GFX12-NEXT:    s_wait_alu 0xfffe
2132; GFX12-NEXT:    v_mov_b32_e32 v6, s4
2133; GFX12-NEXT:    s_mov_b32 s4, 0
2134; GFX12-NEXT:    buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048
2135; GFX12-NEXT:  .LBB9_1: ; %atomicrmw.start
2136; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
2137; GFX12-NEXT:    s_wait_loadcnt 0x0
2138; GFX12-NEXT:    v_add_f64_e32 v[2:3], v[4:5], v[0:1]
2139; GFX12-NEXT:    v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4
2140; GFX12-NEXT:    s_wait_storecnt 0x0
2141; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2142; GFX12-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
2143; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
2144; GFX12-NEXT:    s_wait_loadcnt 0x0
2145; GFX12-NEXT:    global_inv scope:SCOPE_DEV
2146; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5]
2147; GFX12-NEXT:    v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8
2148; GFX12-NEXT:    s_wait_alu 0xfffe
2149; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
2150; GFX12-NEXT:    s_wait_alu 0xfffe
2151; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
2152; GFX12-NEXT:    s_cbranch_execnz .LBB9_1
2153; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
2154; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2155; GFX12-NEXT:    s_wait_alu 0xfffe
2156; GFX12-NEXT:    s_setpc_b64 s[30:31]
2157;
2158; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory:
2159; GFX940:       ; %bb.0:
2160; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2161; GFX940-NEXT:    v_mov_b32_e32 v2, s16
2162; GFX940-NEXT:    buffer_wbl2 sc1
2163; GFX940-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen offset:2048
2164; GFX940-NEXT:    s_waitcnt vmcnt(0)
2165; GFX940-NEXT:    buffer_inv sc1
2166; GFX940-NEXT:    s_setpc_b64 s[30:31]
2167;
2168; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory:
2169; GFX11:       ; %bb.0:
2170; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2171; GFX11-NEXT:    v_mov_b32_e32 v2, s16
2172; GFX11-NEXT:    s_add_i32 s4, s16, 0x800
2173; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2174; GFX11-NEXT:    v_mov_b32_e32 v6, s4
2175; GFX11-NEXT:    s_mov_b32 s4, 0
2176; GFX11-NEXT:    buffer_load_b64 v[4:5], v2, s[0:3], 0 offen offset:2048
2177; GFX11-NEXT:  .LBB9_1: ; %atomicrmw.start
2178; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
2179; GFX11-NEXT:    s_waitcnt vmcnt(0)
2180; GFX11-NEXT:    v_add_f64 v[2:3], v[4:5], v[0:1]
2181; GFX11-NEXT:    v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4
2182; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2183; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2184; GFX11-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
2185; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc
2186; GFX11-NEXT:    s_waitcnt vmcnt(0)
2187; GFX11-NEXT:    buffer_gl1_inv
2188; GFX11-NEXT:    buffer_gl0_inv
2189; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5]
2190; GFX11-NEXT:    v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8
2191; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
2192; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2193; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
2194; GFX11-NEXT:    s_cbranch_execnz .LBB9_1
2195; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
2196; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2197; GFX11-NEXT:    s_setpc_b64 s[30:31]
2198;
2199; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory:
2200; GFX10:       ; %bb.0:
2201; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2202; GFX10-NEXT:    v_mov_b32_e32 v2, s20
2203; GFX10-NEXT:    s_add_i32 s4, s20, 0x800
2204; GFX10-NEXT:    v_mov_b32_e32 v6, s4
2205; GFX10-NEXT:    s_mov_b32 s4, 0
2206; GFX10-NEXT:    buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048
2207; GFX10-NEXT:  .LBB9_1: ; %atomicrmw.start
2208; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
2209; GFX10-NEXT:    s_waitcnt vmcnt(0)
2210; GFX10-NEXT:    v_add_f64 v[2:3], v[4:5], v[0:1]
2211; GFX10-NEXT:    v_mov_b32_e32 v10, v5
2212; GFX10-NEXT:    v_mov_b32_e32 v9, v4
2213; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2214; GFX10-NEXT:    v_mov_b32_e32 v8, v3
2215; GFX10-NEXT:    v_mov_b32_e32 v7, v2
2216; GFX10-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
2217; GFX10-NEXT:    s_waitcnt vmcnt(0)
2218; GFX10-NEXT:    buffer_gl1_inv
2219; GFX10-NEXT:    buffer_gl0_inv
2220; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5]
2221; GFX10-NEXT:    v_mov_b32_e32 v4, v7
2222; GFX10-NEXT:    v_mov_b32_e32 v5, v8
2223; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
2224; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
2225; GFX10-NEXT:    s_cbranch_execnz .LBB9_1
2226; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
2227; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2228; GFX10-NEXT:    s_setpc_b64 s[30:31]
2229;
2230; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory:
2231; GFX90A:       ; %bb.0:
2232; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2233; GFX90A-NEXT:    v_mov_b32_e32 v2, s20
2234; GFX90A-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[16:19], 0 offen offset:2048
2235; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2236; GFX90A-NEXT:    buffer_wbinvl1
2237; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2238;
2239; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory:
2240; GFX908:       ; %bb.0:
2241; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2242; GFX908-NEXT:    v_mov_b32_e32 v2, s20
2243; GFX908-NEXT:    buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048
2244; GFX908-NEXT:    s_add_i32 s6, s20, 0x800
2245; GFX908-NEXT:    s_mov_b64 s[4:5], 0
2246; GFX908-NEXT:    v_mov_b32_e32 v6, s6
2247; GFX908-NEXT:  .LBB9_1: ; %atomicrmw.start
2248; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
2249; GFX908-NEXT:    s_waitcnt vmcnt(0)
2250; GFX908-NEXT:    v_add_f64 v[2:3], v[4:5], v[0:1]
2251; GFX908-NEXT:    v_mov_b32_e32 v10, v5
2252; GFX908-NEXT:    v_mov_b32_e32 v9, v4
2253; GFX908-NEXT:    v_mov_b32_e32 v8, v3
2254; GFX908-NEXT:    v_mov_b32_e32 v7, v2
2255; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
2256; GFX908-NEXT:    s_waitcnt vmcnt(0)
2257; GFX908-NEXT:    buffer_wbinvl1
2258; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5]
2259; GFX908-NEXT:    v_mov_b32_e32 v4, v7
2260; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2261; GFX908-NEXT:    v_mov_b32_e32 v5, v8
2262; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2263; GFX908-NEXT:    s_cbranch_execnz .LBB9_1
2264; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
2265; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
2266; GFX908-NEXT:    s_setpc_b64 s[30:31]
2267;
2268; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory:
2269; GFX8:       ; %bb.0:
2270; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2271; GFX8-NEXT:    v_mov_b32_e32 v2, s20
2272; GFX8-NEXT:    buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048
2273; GFX8-NEXT:    s_add_i32 s6, s20, 0x800
2274; GFX8-NEXT:    s_mov_b64 s[4:5], 0
2275; GFX8-NEXT:    v_mov_b32_e32 v6, s6
2276; GFX8-NEXT:  .LBB9_1: ; %atomicrmw.start
2277; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
2278; GFX8-NEXT:    s_waitcnt vmcnt(0)
2279; GFX8-NEXT:    v_add_f64 v[2:3], v[4:5], v[0:1]
2280; GFX8-NEXT:    v_mov_b32_e32 v10, v5
2281; GFX8-NEXT:    v_mov_b32_e32 v9, v4
2282; GFX8-NEXT:    v_mov_b32_e32 v8, v3
2283; GFX8-NEXT:    v_mov_b32_e32 v7, v2
2284; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
2285; GFX8-NEXT:    s_waitcnt vmcnt(0)
2286; GFX8-NEXT:    buffer_wbinvl1
2287; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5]
2288; GFX8-NEXT:    v_mov_b32_e32 v4, v7
2289; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2290; GFX8-NEXT:    v_mov_b32_e32 v5, v8
2291; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2292; GFX8-NEXT:    s_cbranch_execnz .LBB9_1
2293; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
2294; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2295; GFX8-NEXT:    s_setpc_b64 s[30:31]
2296;
2297; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory:
2298; GFX7:       ; %bb.0:
2299; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2300; GFX7-NEXT:    v_mov_b32_e32 v2, s20
2301; GFX7-NEXT:    buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048
2302; GFX7-NEXT:    s_add_i32 s6, s20, 0x800
2303; GFX7-NEXT:    s_mov_b64 s[4:5], 0
2304; GFX7-NEXT:    v_mov_b32_e32 v6, s6
2305; GFX7-NEXT:  .LBB9_1: ; %atomicrmw.start
2306; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
2307; GFX7-NEXT:    s_waitcnt vmcnt(0)
2308; GFX7-NEXT:    v_add_f64 v[2:3], v[4:5], v[0:1]
2309; GFX7-NEXT:    v_mov_b32_e32 v10, v5
2310; GFX7-NEXT:    v_mov_b32_e32 v9, v4
2311; GFX7-NEXT:    v_mov_b32_e32 v8, v3
2312; GFX7-NEXT:    v_mov_b32_e32 v7, v2
2313; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
2314; GFX7-NEXT:    s_waitcnt vmcnt(0)
2315; GFX7-NEXT:    buffer_wbinvl1
2316; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5]
2317; GFX7-NEXT:    v_mov_b32_e32 v4, v7
2318; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2319; GFX7-NEXT:    v_mov_b32_e32 v5, v8
2320; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2321; GFX7-NEXT:    s_cbranch_execnz .LBB9_1
2322; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
2323; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
2324; GFX7-NEXT:    s_setpc_b64 s[30:31]
2325;
2326; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory:
2327; GFX6:       ; %bb.0:
2328; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2329; GFX6-NEXT:    v_mov_b32_e32 v2, s20
2330; GFX6-NEXT:    buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048
2331; GFX6-NEXT:    s_add_i32 s6, s20, 0x800
2332; GFX6-NEXT:    s_mov_b64 s[4:5], 0
2333; GFX6-NEXT:    v_mov_b32_e32 v6, s6
2334; GFX6-NEXT:  .LBB9_1: ; %atomicrmw.start
2335; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
2336; GFX6-NEXT:    s_waitcnt vmcnt(0)
2337; GFX6-NEXT:    v_add_f64 v[2:3], v[4:5], v[0:1]
2338; GFX6-NEXT:    s_waitcnt expcnt(0)
2339; GFX6-NEXT:    v_mov_b32_e32 v10, v5
2340; GFX6-NEXT:    v_mov_b32_e32 v9, v4
2341; GFX6-NEXT:    v_mov_b32_e32 v8, v3
2342; GFX6-NEXT:    v_mov_b32_e32 v7, v2
2343; GFX6-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
2344; GFX6-NEXT:    s_waitcnt vmcnt(0)
2345; GFX6-NEXT:    buffer_wbinvl1
2346; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5]
2347; GFX6-NEXT:    v_mov_b32_e32 v4, v7
2348; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2349; GFX6-NEXT:    v_mov_b32_e32 v5, v8
2350; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2351; GFX6-NEXT:    s_cbranch_execnz .LBB9_1
2352; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
2353; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
2354; GFX6-NEXT:    s_waitcnt expcnt(0)
2355; GFX6-NEXT:    s_setpc_b64 s[30:31]
2356  %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
2357  %unused = atomicrmw fadd ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
2358  ret void
2359}
2360
2361define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, double %val) #0 {
2362; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
2363; GFX12:       ; %bb.0:
2364; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2365; GFX12-NEXT:    s_wait_expcnt 0x0
2366; GFX12-NEXT:    s_wait_samplecnt 0x0
2367; GFX12-NEXT:    s_wait_bvhcnt 0x0
2368; GFX12-NEXT:    s_wait_kmcnt 0x0
2369; GFX12-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
2370; GFX12-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
2371; GFX12-NEXT:    v_add_nc_u32_e32 v15, 0x800, v4
2372; GFX12-NEXT:    s_mov_b32 s1, exec_lo
2373; GFX12-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
2374; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
2375; GFX12-NEXT:    v_readfirstlane_b32 s4, v9
2376; GFX12-NEXT:    v_readfirstlane_b32 s5, v10
2377; GFX12-NEXT:    v_readfirstlane_b32 s6, v7
2378; GFX12-NEXT:    v_readfirstlane_b32 s7, v8
2379; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
2380; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
2381; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
2382; GFX12-NEXT:    s_wait_alu 0xfffe
2383; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2384; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
2385; GFX12-NEXT:    s_wait_alu 0xfffe
2386; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
2387; GFX12-NEXT:    s_wait_loadcnt 0x0
2388; GFX12-NEXT:    buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048
2389; GFX12-NEXT:    ; implicit-def: $vgpr4
2390; GFX12-NEXT:    s_wait_alu 0xfffe
2391; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
2392; GFX12-NEXT:    s_cbranch_execnz .LBB10_1
2393; GFX12-NEXT:  ; %bb.2:
2394; GFX12-NEXT:    s_mov_b32 exec_lo, s1
2395; GFX12-NEXT:    s_mov_b32 s1, 0
2396; GFX12-NEXT:  .LBB10_3: ; %atomicrmw.start
2397; GFX12-NEXT:    ; =>This Loop Header: Depth=1
2398; GFX12-NEXT:    ; Child Loop BB10_4 Depth 2
2399; GFX12-NEXT:    s_wait_loadcnt 0x0
2400; GFX12-NEXT:    v_add_f64_e32 v[11:12], v[13:14], v[5:6]
2401; GFX12-NEXT:    s_mov_b32 s2, exec_lo
2402; GFX12-NEXT:    s_wait_storecnt 0x0
2403; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2404; GFX12-NEXT:    v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
2405; GFX12-NEXT:    v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
2406; GFX12-NEXT:  .LBB10_4: ; Parent Loop BB10_3 Depth=1
2407; GFX12-NEXT:    ; => This Inner Loop Header: Depth=2
2408; GFX12-NEXT:    v_readfirstlane_b32 s4, v9
2409; GFX12-NEXT:    v_readfirstlane_b32 s5, v10
2410; GFX12-NEXT:    v_readfirstlane_b32 s6, v7
2411; GFX12-NEXT:    v_readfirstlane_b32 s7, v8
2412; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
2413; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
2414; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
2415; GFX12-NEXT:    s_wait_alu 0xfffe
2416; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2417; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
2418; GFX12-NEXT:    s_wait_alu 0xfffe
2419; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
2420; GFX12-NEXT:    s_wait_loadcnt 0x0
2421; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN
2422; GFX12-NEXT:    s_wait_alu 0xfffe
2423; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
2424; GFX12-NEXT:    s_cbranch_execnz .LBB10_4
2425; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB10_3 Depth=1
2426; GFX12-NEXT:    s_mov_b32 exec_lo, s2
2427; GFX12-NEXT:    s_wait_loadcnt 0x0
2428; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14]
2429; GFX12-NEXT:    v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0
2430; GFX12-NEXT:    global_inv scope:SCOPE_DEV
2431; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
2432; GFX12-NEXT:    s_wait_alu 0xfffe
2433; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
2434; GFX12-NEXT:    s_cbranch_execnz .LBB10_3
2435; GFX12-NEXT:  ; %bb.6: ; %atomicrmw.end
2436; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
2437; GFX12-NEXT:    s_wait_alu 0xfffe
2438; GFX12-NEXT:    s_setpc_b64 s[30:31]
2439;
2440; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
2441; GFX940:       ; %bb.0:
2442; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2443; GFX940-NEXT:    v_mov_b32_e32 v7, v6
2444; GFX940-NEXT:    v_mov_b32_e32 v6, v5
2445; GFX940-NEXT:    s_mov_b64 s[2:3], exec
2446; GFX940-NEXT:    buffer_wbl2 sc1
2447; GFX940-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
2448; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
2449; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
2450; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
2451; GFX940-NEXT:    v_readfirstlane_b32 s7, v3
2452; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
2453; GFX940-NEXT:    s_nop 0
2454; GFX940-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
2455; GFX940-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
2456; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
2457; GFX940-NEXT:    s_waitcnt vmcnt(0)
2458; GFX940-NEXT:    buffer_atomic_add_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0
2459; GFX940-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
2460; GFX940-NEXT:    ; implicit-def: $vgpr4
2461; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
2462; GFX940-NEXT:    s_cbranch_execnz .LBB10_1
2463; GFX940-NEXT:  ; %bb.2:
2464; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
2465; GFX940-NEXT:    s_waitcnt vmcnt(0)
2466; GFX940-NEXT:    v_mov_b32_e32 v0, v6
2467; GFX940-NEXT:    v_mov_b32_e32 v1, v7
2468; GFX940-NEXT:    buffer_inv sc1
2469; GFX940-NEXT:    s_setpc_b64 s[30:31]
2470;
2471; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
2472; GFX11:       ; %bb.0:
2473; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2474; GFX11-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
2475; GFX11-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
2476; GFX11-NEXT:    v_add_nc_u32_e32 v15, 0x800, v4
2477; GFX11-NEXT:    s_mov_b32 s1, 0
2478; GFX11-NEXT:    s_mov_b32 s2, exec_lo
2479; GFX11-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
2480; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
2481; GFX11-NEXT:    v_readfirstlane_b32 s4, v9
2482; GFX11-NEXT:    v_readfirstlane_b32 s5, v10
2483; GFX11-NEXT:    v_readfirstlane_b32 s6, v7
2484; GFX11-NEXT:    v_readfirstlane_b32 s7, v8
2485; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
2486; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
2487; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
2488; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
2489; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2490; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
2491; GFX11-NEXT:    buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048
2492; GFX11-NEXT:    ; implicit-def: $vgpr4
2493; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
2494; GFX11-NEXT:    s_cbranch_execnz .LBB10_1
2495; GFX11-NEXT:  ; %bb.2:
2496; GFX11-NEXT:    s_mov_b32 exec_lo, s2
2497; GFX11-NEXT:    .p2align 6
2498; GFX11-NEXT:  .LBB10_3: ; %atomicrmw.start
2499; GFX11-NEXT:    ; =>This Loop Header: Depth=1
2500; GFX11-NEXT:    ; Child Loop BB10_4 Depth 2
2501; GFX11-NEXT:    s_waitcnt vmcnt(0)
2502; GFX11-NEXT:    v_add_f64 v[11:12], v[13:14], v[5:6]
2503; GFX11-NEXT:    s_mov_b32 s2, exec_lo
2504; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2505; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2506; GFX11-NEXT:    v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
2507; GFX11-NEXT:    v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
2508; GFX11-NEXT:  .LBB10_4: ; Parent Loop BB10_3 Depth=1
2509; GFX11-NEXT:    ; => This Inner Loop Header: Depth=2
2510; GFX11-NEXT:    v_readfirstlane_b32 s4, v9
2511; GFX11-NEXT:    v_readfirstlane_b32 s5, v10
2512; GFX11-NEXT:    v_readfirstlane_b32 s6, v7
2513; GFX11-NEXT:    v_readfirstlane_b32 s7, v8
2514; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
2515; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
2516; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
2517; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
2518; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
2519; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
2520; GFX11-NEXT:    s_waitcnt vmcnt(0)
2521; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc
2522; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
2523; GFX11-NEXT:    s_cbranch_execnz .LBB10_4
2524; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB10_3 Depth=1
2525; GFX11-NEXT:    s_mov_b32 exec_lo, s2
2526; GFX11-NEXT:    s_waitcnt vmcnt(0)
2527; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14]
2528; GFX11-NEXT:    v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0
2529; GFX11-NEXT:    buffer_gl1_inv
2530; GFX11-NEXT:    buffer_gl0_inv
2531; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
2532; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2533; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
2534; GFX11-NEXT:    s_cbranch_execnz .LBB10_3
2535; GFX11-NEXT:  ; %bb.6: ; %atomicrmw.end
2536; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
2537; GFX11-NEXT:    s_setpc_b64 s[30:31]
2538;
2539; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
2540; GFX10:       ; %bb.0:
2541; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2542; GFX10-NEXT:    v_mov_b32_e32 v8, v3
2543; GFX10-NEXT:    v_mov_b32_e32 v7, v2
2544; GFX10-NEXT:    v_mov_b32_e32 v10, v1
2545; GFX10-NEXT:    v_mov_b32_e32 v9, v0
2546; GFX10-NEXT:    v_add_nc_u32_e32 v15, 0x800, v4
2547; GFX10-NEXT:    s_mov_b32 s5, 0
2548; GFX10-NEXT:    s_mov_b32 s6, exec_lo
2549; GFX10-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
2550; GFX10-NEXT:    v_readfirstlane_b32 s8, v9
2551; GFX10-NEXT:    v_readfirstlane_b32 s9, v10
2552; GFX10-NEXT:    v_readfirstlane_b32 s10, v7
2553; GFX10-NEXT:    v_readfirstlane_b32 s11, v8
2554; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[9:10]
2555; GFX10-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[7:8]
2556; GFX10-NEXT:    s_and_b32 s4, vcc_lo, s4
2557; GFX10-NEXT:    s_and_saveexec_b32 s4, s4
2558; GFX10-NEXT:    buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
2559; GFX10-NEXT:    ; implicit-def: $vgpr4
2560; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
2561; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
2562; GFX10-NEXT:    s_cbranch_execnz .LBB10_1
2563; GFX10-NEXT:  ; %bb.2:
2564; GFX10-NEXT:    s_mov_b32 exec_lo, s6
2565; GFX10-NEXT:  .LBB10_3: ; %atomicrmw.start
2566; GFX10-NEXT:    ; =>This Loop Header: Depth=1
2567; GFX10-NEXT:    ; Child Loop BB10_4 Depth 2
2568; GFX10-NEXT:    s_waitcnt vmcnt(0)
2569; GFX10-NEXT:    v_add_f64 v[11:12], v[13:14], v[5:6]
2570; GFX10-NEXT:    s_mov_b32 s6, exec_lo
2571; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2572; GFX10-NEXT:    v_mov_b32_e32 v0, v11
2573; GFX10-NEXT:    v_mov_b32_e32 v1, v12
2574; GFX10-NEXT:    v_mov_b32_e32 v2, v13
2575; GFX10-NEXT:    v_mov_b32_e32 v3, v14
2576; GFX10-NEXT:  .LBB10_4: ; Parent Loop BB10_3 Depth=1
2577; GFX10-NEXT:    ; => This Inner Loop Header: Depth=2
2578; GFX10-NEXT:    v_readfirstlane_b32 s8, v9
2579; GFX10-NEXT:    v_readfirstlane_b32 s9, v10
2580; GFX10-NEXT:    v_readfirstlane_b32 s10, v7
2581; GFX10-NEXT:    v_readfirstlane_b32 s11, v8
2582; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[9:10]
2583; GFX10-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[7:8]
2584; GFX10-NEXT:    s_and_b32 s4, vcc_lo, s4
2585; GFX10-NEXT:    s_and_saveexec_b32 s4, s4
2586; GFX10-NEXT:    s_waitcnt vmcnt(0)
2587; GFX10-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
2588; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
2589; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
2590; GFX10-NEXT:    s_cbranch_execnz .LBB10_4
2591; GFX10-NEXT:  ; %bb.5: ; in Loop: Header=BB10_3 Depth=1
2592; GFX10-NEXT:    s_mov_b32 exec_lo, s6
2593; GFX10-NEXT:    s_waitcnt vmcnt(0)
2594; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14]
2595; GFX10-NEXT:    v_mov_b32_e32 v14, v1
2596; GFX10-NEXT:    v_mov_b32_e32 v13, v0
2597; GFX10-NEXT:    buffer_gl1_inv
2598; GFX10-NEXT:    buffer_gl0_inv
2599; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
2600; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
2601; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
2602; GFX10-NEXT:    s_cbranch_execnz .LBB10_3
2603; GFX10-NEXT:  ; %bb.6: ; %atomicrmw.end
2604; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
2605; GFX10-NEXT:    s_setpc_b64 s[30:31]
2606;
2607; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
2608; GFX90A:       ; %bb.0:
2609; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2610; GFX90A-NEXT:    v_mov_b32_e32 v7, v6
2611; GFX90A-NEXT:    v_mov_b32_e32 v6, v5
2612; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
2613; GFX90A-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
2614; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
2615; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
2616; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
2617; GFX90A-NEXT:    v_readfirstlane_b32 s11, v3
2618; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
2619; GFX90A-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
2620; GFX90A-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
2621; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
2622; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2623; GFX90A-NEXT:    buffer_atomic_add_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc
2624; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
2625; GFX90A-NEXT:    ; implicit-def: $vgpr4
2626; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
2627; GFX90A-NEXT:    s_cbranch_execnz .LBB10_1
2628; GFX90A-NEXT:  ; %bb.2:
2629; GFX90A-NEXT:    s_mov_b64 exec, s[6:7]
2630; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2631; GFX90A-NEXT:    v_mov_b32_e32 v0, v6
2632; GFX90A-NEXT:    v_mov_b32_e32 v1, v7
2633; GFX90A-NEXT:    buffer_wbinvl1
2634; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2635;
2636; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
2637; GFX908:       ; %bb.0:
2638; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2639; GFX908-NEXT:    v_mov_b32_e32 v8, v3
2640; GFX908-NEXT:    v_mov_b32_e32 v7, v2
2641; GFX908-NEXT:    v_mov_b32_e32 v10, v1
2642; GFX908-NEXT:    v_mov_b32_e32 v9, v0
2643; GFX908-NEXT:    v_add_u32_e32 v15, 0x800, v4
2644; GFX908-NEXT:    s_mov_b64 s[6:7], exec
2645; GFX908-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
2646; GFX908-NEXT:    v_readfirstlane_b32 s8, v9
2647; GFX908-NEXT:    v_readfirstlane_b32 s9, v10
2648; GFX908-NEXT:    v_readfirstlane_b32 s10, v7
2649; GFX908-NEXT:    v_readfirstlane_b32 s11, v8
2650; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
2651; GFX908-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
2652; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
2653; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
2654; GFX908-NEXT:    s_nop 0
2655; GFX908-NEXT:    buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
2656; GFX908-NEXT:    ; implicit-def: $vgpr4
2657; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
2658; GFX908-NEXT:    s_cbranch_execnz .LBB10_1
2659; GFX908-NEXT:  ; %bb.2:
2660; GFX908-NEXT:    s_mov_b64 exec, s[6:7]
2661; GFX908-NEXT:    s_mov_b64 s[6:7], 0
2662; GFX908-NEXT:  .LBB10_3: ; %atomicrmw.start
2663; GFX908-NEXT:    ; =>This Loop Header: Depth=1
2664; GFX908-NEXT:    ; Child Loop BB10_4 Depth 2
2665; GFX908-NEXT:    s_waitcnt vmcnt(0)
2666; GFX908-NEXT:    v_add_f64 v[11:12], v[13:14], v[5:6]
2667; GFX908-NEXT:    s_mov_b64 s[12:13], exec
2668; GFX908-NEXT:    v_mov_b32_e32 v0, v11
2669; GFX908-NEXT:    v_mov_b32_e32 v1, v12
2670; GFX908-NEXT:    v_mov_b32_e32 v2, v13
2671; GFX908-NEXT:    v_mov_b32_e32 v3, v14
2672; GFX908-NEXT:  .LBB10_4: ; Parent Loop BB10_3 Depth=1
2673; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
2674; GFX908-NEXT:    v_readfirstlane_b32 s8, v9
2675; GFX908-NEXT:    v_readfirstlane_b32 s9, v10
2676; GFX908-NEXT:    v_readfirstlane_b32 s10, v7
2677; GFX908-NEXT:    v_readfirstlane_b32 s11, v8
2678; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
2679; GFX908-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
2680; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
2681; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
2682; GFX908-NEXT:    s_waitcnt vmcnt(0)
2683; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
2684; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
2685; GFX908-NEXT:    s_cbranch_execnz .LBB10_4
2686; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB10_3 Depth=1
2687; GFX908-NEXT:    s_mov_b64 exec, s[12:13]
2688; GFX908-NEXT:    s_waitcnt vmcnt(0)
2689; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14]
2690; GFX908-NEXT:    v_mov_b32_e32 v14, v1
2691; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
2692; GFX908-NEXT:    v_mov_b32_e32 v13, v0
2693; GFX908-NEXT:    buffer_wbinvl1
2694; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
2695; GFX908-NEXT:    s_cbranch_execnz .LBB10_3
2696; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.end
2697; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
2698; GFX908-NEXT:    s_setpc_b64 s[30:31]
2699;
2700; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
2701; GFX8:       ; %bb.0:
2702; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2703; GFX8-NEXT:    v_mov_b32_e32 v8, v3
2704; GFX8-NEXT:    v_mov_b32_e32 v7, v2
2705; GFX8-NEXT:    v_mov_b32_e32 v10, v1
2706; GFX8-NEXT:    v_mov_b32_e32 v9, v0
2707; GFX8-NEXT:    v_add_u32_e32 v15, vcc, 0x800, v4
2708; GFX8-NEXT:    s_mov_b64 s[6:7], exec
2709; GFX8-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
2710; GFX8-NEXT:    v_readfirstlane_b32 s8, v9
2711; GFX8-NEXT:    v_readfirstlane_b32 s9, v10
2712; GFX8-NEXT:    v_readfirstlane_b32 s10, v7
2713; GFX8-NEXT:    v_readfirstlane_b32 s11, v8
2714; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
2715; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
2716; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
2717; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
2718; GFX8-NEXT:    s_nop 0
2719; GFX8-NEXT:    buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
2720; GFX8-NEXT:    ; implicit-def: $vgpr4
2721; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
2722; GFX8-NEXT:    s_cbranch_execnz .LBB10_1
2723; GFX8-NEXT:  ; %bb.2:
2724; GFX8-NEXT:    s_mov_b64 exec, s[6:7]
2725; GFX8-NEXT:    s_mov_b64 s[6:7], 0
2726; GFX8-NEXT:  .LBB10_3: ; %atomicrmw.start
2727; GFX8-NEXT:    ; =>This Loop Header: Depth=1
2728; GFX8-NEXT:    ; Child Loop BB10_4 Depth 2
2729; GFX8-NEXT:    s_waitcnt vmcnt(0)
2730; GFX8-NEXT:    v_add_f64 v[11:12], v[13:14], v[5:6]
2731; GFX8-NEXT:    s_mov_b64 s[12:13], exec
2732; GFX8-NEXT:    v_mov_b32_e32 v0, v11
2733; GFX8-NEXT:    v_mov_b32_e32 v1, v12
2734; GFX8-NEXT:    v_mov_b32_e32 v2, v13
2735; GFX8-NEXT:    v_mov_b32_e32 v3, v14
2736; GFX8-NEXT:  .LBB10_4: ; Parent Loop BB10_3 Depth=1
2737; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
2738; GFX8-NEXT:    v_readfirstlane_b32 s8, v9
2739; GFX8-NEXT:    v_readfirstlane_b32 s9, v10
2740; GFX8-NEXT:    v_readfirstlane_b32 s10, v7
2741; GFX8-NEXT:    v_readfirstlane_b32 s11, v8
2742; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
2743; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
2744; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
2745; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
2746; GFX8-NEXT:    s_waitcnt vmcnt(0)
2747; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
2748; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
2749; GFX8-NEXT:    s_cbranch_execnz .LBB10_4
2750; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB10_3 Depth=1
2751; GFX8-NEXT:    s_mov_b64 exec, s[12:13]
2752; GFX8-NEXT:    s_waitcnt vmcnt(0)
2753; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14]
2754; GFX8-NEXT:    v_mov_b32_e32 v14, v1
2755; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
2756; GFX8-NEXT:    v_mov_b32_e32 v13, v0
2757; GFX8-NEXT:    buffer_wbinvl1
2758; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
2759; GFX8-NEXT:    s_cbranch_execnz .LBB10_3
2760; GFX8-NEXT:  ; %bb.6: ; %atomicrmw.end
2761; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
2762; GFX8-NEXT:    s_setpc_b64 s[30:31]
2763;
2764; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
2765; GFX7:       ; %bb.0:
2766; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2767; GFX7-NEXT:    v_mov_b32_e32 v8, v3
2768; GFX7-NEXT:    v_mov_b32_e32 v7, v2
2769; GFX7-NEXT:    v_mov_b32_e32 v10, v1
2770; GFX7-NEXT:    v_mov_b32_e32 v9, v0
2771; GFX7-NEXT:    v_add_i32_e32 v15, vcc, 0x800, v4
2772; GFX7-NEXT:    s_mov_b64 s[6:7], exec
2773; GFX7-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
2774; GFX7-NEXT:    v_readfirstlane_b32 s8, v9
2775; GFX7-NEXT:    v_readfirstlane_b32 s9, v10
2776; GFX7-NEXT:    v_readfirstlane_b32 s10, v7
2777; GFX7-NEXT:    v_readfirstlane_b32 s11, v8
2778; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
2779; GFX7-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
2780; GFX7-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
2781; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
2782; GFX7-NEXT:    buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
2783; GFX7-NEXT:    ; implicit-def: $vgpr4
2784; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
2785; GFX7-NEXT:    s_cbranch_execnz .LBB10_1
2786; GFX7-NEXT:  ; %bb.2:
2787; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
2788; GFX7-NEXT:    s_mov_b64 s[6:7], 0
2789; GFX7-NEXT:  .LBB10_3: ; %atomicrmw.start
2790; GFX7-NEXT:    ; =>This Loop Header: Depth=1
2791; GFX7-NEXT:    ; Child Loop BB10_4 Depth 2
2792; GFX7-NEXT:    s_waitcnt vmcnt(0)
2793; GFX7-NEXT:    v_add_f64 v[11:12], v[13:14], v[5:6]
2794; GFX7-NEXT:    s_mov_b64 s[12:13], exec
2795; GFX7-NEXT:    v_mov_b32_e32 v0, v11
2796; GFX7-NEXT:    v_mov_b32_e32 v1, v12
2797; GFX7-NEXT:    v_mov_b32_e32 v2, v13
2798; GFX7-NEXT:    v_mov_b32_e32 v3, v14
2799; GFX7-NEXT:  .LBB10_4: ; Parent Loop BB10_3 Depth=1
2800; GFX7-NEXT:    ; => This Inner Loop Header: Depth=2
2801; GFX7-NEXT:    v_readfirstlane_b32 s8, v9
2802; GFX7-NEXT:    v_readfirstlane_b32 s9, v10
2803; GFX7-NEXT:    v_readfirstlane_b32 s10, v7
2804; GFX7-NEXT:    v_readfirstlane_b32 s11, v8
2805; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
2806; GFX7-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
2807; GFX7-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
2808; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
2809; GFX7-NEXT:    s_waitcnt vmcnt(0)
2810; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
2811; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
2812; GFX7-NEXT:    s_cbranch_execnz .LBB10_4
2813; GFX7-NEXT:  ; %bb.5: ; in Loop: Header=BB10_3 Depth=1
2814; GFX7-NEXT:    s_mov_b64 exec, s[12:13]
2815; GFX7-NEXT:    s_waitcnt vmcnt(0)
2816; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14]
2817; GFX7-NEXT:    v_mov_b32_e32 v14, v1
2818; GFX7-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
2819; GFX7-NEXT:    v_mov_b32_e32 v13, v0
2820; GFX7-NEXT:    buffer_wbinvl1
2821; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
2822; GFX7-NEXT:    s_cbranch_execnz .LBB10_3
2823; GFX7-NEXT:  ; %bb.6: ; %atomicrmw.end
2824; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
2825; GFX7-NEXT:    s_setpc_b64 s[30:31]
2826;
2827; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
2828; GFX6:       ; %bb.0:
2829; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2830; GFX6-NEXT:    v_mov_b32_e32 v8, v3
2831; GFX6-NEXT:    v_mov_b32_e32 v7, v2
2832; GFX6-NEXT:    v_mov_b32_e32 v10, v1
2833; GFX6-NEXT:    v_mov_b32_e32 v9, v0
2834; GFX6-NEXT:    v_add_i32_e32 v15, vcc, 0x800, v4
2835; GFX6-NEXT:    s_mov_b64 s[6:7], exec
2836; GFX6-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
2837; GFX6-NEXT:    v_readfirstlane_b32 s8, v9
2838; GFX6-NEXT:    v_readfirstlane_b32 s9, v10
2839; GFX6-NEXT:    v_readfirstlane_b32 s10, v7
2840; GFX6-NEXT:    v_readfirstlane_b32 s11, v8
2841; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
2842; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
2843; GFX6-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
2844; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
2845; GFX6-NEXT:    buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
2846; GFX6-NEXT:    ; implicit-def: $vgpr4
2847; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
2848; GFX6-NEXT:    s_cbranch_execnz .LBB10_1
2849; GFX6-NEXT:  ; %bb.2:
2850; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
2851; GFX6-NEXT:    s_mov_b64 s[6:7], 0
2852; GFX6-NEXT:  .LBB10_3: ; %atomicrmw.start
2853; GFX6-NEXT:    ; =>This Loop Header: Depth=1
2854; GFX6-NEXT:    ; Child Loop BB10_4 Depth 2
2855; GFX6-NEXT:    s_waitcnt vmcnt(0)
2856; GFX6-NEXT:    v_add_f64 v[11:12], v[13:14], v[5:6]
2857; GFX6-NEXT:    s_mov_b64 s[12:13], exec
2858; GFX6-NEXT:    s_waitcnt expcnt(0)
2859; GFX6-NEXT:    v_mov_b32_e32 v0, v11
2860; GFX6-NEXT:    v_mov_b32_e32 v1, v12
2861; GFX6-NEXT:    v_mov_b32_e32 v2, v13
2862; GFX6-NEXT:    v_mov_b32_e32 v3, v14
2863; GFX6-NEXT:  .LBB10_4: ; Parent Loop BB10_3 Depth=1
2864; GFX6-NEXT:    ; => This Inner Loop Header: Depth=2
2865; GFX6-NEXT:    v_readfirstlane_b32 s8, v9
2866; GFX6-NEXT:    v_readfirstlane_b32 s9, v10
2867; GFX6-NEXT:    v_readfirstlane_b32 s10, v7
2868; GFX6-NEXT:    v_readfirstlane_b32 s11, v8
2869; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
2870; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
2871; GFX6-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
2872; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
2873; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
2874; GFX6-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
2875; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
2876; GFX6-NEXT:    s_cbranch_execnz .LBB10_4
2877; GFX6-NEXT:  ; %bb.5: ; in Loop: Header=BB10_3 Depth=1
2878; GFX6-NEXT:    s_mov_b64 exec, s[12:13]
2879; GFX6-NEXT:    s_waitcnt vmcnt(0)
2880; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14]
2881; GFX6-NEXT:    v_mov_b32_e32 v14, v1
2882; GFX6-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
2883; GFX6-NEXT:    v_mov_b32_e32 v13, v0
2884; GFX6-NEXT:    buffer_wbinvl1
2885; GFX6-NEXT:    s_andn2_b64 exec, exec, s[6:7]
2886; GFX6-NEXT:    s_cbranch_execnz .LBB10_3
2887; GFX6-NEXT:  ; %bb.6: ; %atomicrmw.end
2888; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
2889; GFX6-NEXT:    s_waitcnt expcnt(0)
2890; GFX6-NEXT:    s_setpc_b64 s[30:31]
2891  %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
2892  %result = atomicrmw fadd ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
2893  ret double %result
2894}
2895
; Test case: agent-scope, seq_cst `atomicrmw fadd` of a double through a buffer
; fat pointer (addrspace 7), where the atomic carries only the
; !amdgpu.no.remote.memory annotation. The pre-add value is returned.
;
; What the autogenerated checks below currently pin down:
;   * -mcpu=gfx940 selects one returning buffer_atomic_add_f64 with the 2048-byte
;     offset folded into the instruction.
;   * Every other run line (gfx1200, gfx1100, gfx1010, gfx90a, gfx908, tonga,
;     hawaii, tahiti) instead loads the old value and retries in the
;     %atomicrmw.start loop with a 64-bit buffer cmpswap until the returned value
;     matches the expected one.
;
; The assertions are produced by utils/update_llc_test_checks.py; regenerate
; them with that script rather than editing them by hand.
2896define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, double %val) #0 {
2897; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory:
2898; GFX12:       ; %bb.0:
2899; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2900; GFX12-NEXT:    s_wait_expcnt 0x0
2901; GFX12-NEXT:    s_wait_samplecnt 0x0
2902; GFX12-NEXT:    s_wait_bvhcnt 0x0
2903; GFX12-NEXT:    s_wait_kmcnt 0x0
2904; GFX12-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
2905; GFX12-NEXT:    v_mov_b32_e32 v0, s16
2906; GFX12-NEXT:    s_add_co_i32 s4, s16, 0x800
2907; GFX12-NEXT:    s_wait_alu 0xfffe
2908; GFX12-NEXT:    v_mov_b32_e32 v6, s4
2909; GFX12-NEXT:    s_mov_b32 s4, 0
2910; GFX12-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
2911; GFX12-NEXT:  .LBB11_1: ; %atomicrmw.start
2912; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
2913; GFX12-NEXT:    s_wait_loadcnt 0x0
2914; GFX12-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
2915; GFX12-NEXT:    s_wait_storecnt 0x0
2916; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2917; GFX12-NEXT:    v_add_f64_e32 v[7:8], v[9:10], v[4:5]
2918; GFX12-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
2919; GFX12-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
2920; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
2921; GFX12-NEXT:    s_wait_loadcnt 0x0
2922; GFX12-NEXT:    global_inv scope:SCOPE_DEV
2923; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
2924; GFX12-NEXT:    s_wait_alu 0xfffe
2925; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
2926; GFX12-NEXT:    s_wait_alu 0xfffe
2927; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
2928; GFX12-NEXT:    s_cbranch_execnz .LBB11_1
2929; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
2930; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2931; GFX12-NEXT:    s_wait_alu 0xfffe
2932; GFX12-NEXT:    s_setpc_b64 s[30:31]
2933;
2934; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory:
2935; GFX940:       ; %bb.0:
2936; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2937; GFX940-NEXT:    v_mov_b32_e32 v2, s16
2938; GFX940-NEXT:    buffer_wbl2 sc1
2939; GFX940-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0
2940; GFX940-NEXT:    s_waitcnt vmcnt(0)
2941; GFX940-NEXT:    buffer_inv sc1
2942; GFX940-NEXT:    s_setpc_b64 s[30:31]
2943;
2944; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory:
2945; GFX11:       ; %bb.0:
2946; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2947; GFX11-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
2948; GFX11-NEXT:    v_mov_b32_e32 v0, s16
2949; GFX11-NEXT:    s_add_i32 s4, s16, 0x800
2950; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2951; GFX11-NEXT:    v_mov_b32_e32 v6, s4
2952; GFX11-NEXT:    s_mov_b32 s4, 0
2953; GFX11-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
2954; GFX11-NEXT:  .LBB11_1: ; %atomicrmw.start
2955; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
2956; GFX11-NEXT:    s_waitcnt vmcnt(0)
2957; GFX11-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
2958; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2959; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2960; GFX11-NEXT:    v_add_f64 v[7:8], v[9:10], v[4:5]
2961; GFX11-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
2962; GFX11-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
2963; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
2964; GFX11-NEXT:    s_waitcnt vmcnt(0)
2965; GFX11-NEXT:    buffer_gl1_inv
2966; GFX11-NEXT:    buffer_gl0_inv
2967; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
2968; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
2969; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2970; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
2971; GFX11-NEXT:    s_cbranch_execnz .LBB11_1
2972; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
2973; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2974; GFX11-NEXT:    s_setpc_b64 s[30:31]
2975;
2976; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory:
2977; GFX10:       ; %bb.0:
2978; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2979; GFX10-NEXT:    v_mov_b32_e32 v4, v0
2980; GFX10-NEXT:    v_mov_b32_e32 v0, s20
2981; GFX10-NEXT:    v_mov_b32_e32 v5, v1
2982; GFX10-NEXT:    s_add_i32 s4, s20, 0x800
2983; GFX10-NEXT:    v_mov_b32_e32 v6, s4
2984; GFX10-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
2985; GFX10-NEXT:    s_mov_b32 s4, 0
2986; GFX10-NEXT:  .LBB11_1: ; %atomicrmw.start
2987; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
2988; GFX10-NEXT:    s_waitcnt vmcnt(0)
2989; GFX10-NEXT:    v_mov_b32_e32 v10, v1
2990; GFX10-NEXT:    v_mov_b32_e32 v9, v0
2991; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2992; GFX10-NEXT:    v_add_f64 v[7:8], v[9:10], v[4:5]
2993; GFX10-NEXT:    v_mov_b32_e32 v0, v7
2994; GFX10-NEXT:    v_mov_b32_e32 v1, v8
2995; GFX10-NEXT:    v_mov_b32_e32 v2, v9
2996; GFX10-NEXT:    v_mov_b32_e32 v3, v10
2997; GFX10-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
2998; GFX10-NEXT:    s_waitcnt vmcnt(0)
2999; GFX10-NEXT:    buffer_gl1_inv
3000; GFX10-NEXT:    buffer_gl0_inv
3001; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
3002; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
3003; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
3004; GFX10-NEXT:    s_cbranch_execnz .LBB11_1
3005; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
3006; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3007; GFX10-NEXT:    s_setpc_b64 s[30:31]
3008;
3009; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory:
3010; GFX90A:       ; %bb.0:
3011; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3012; GFX90A-NEXT:    v_mov_b32_e32 v4, v0
3013; GFX90A-NEXT:    v_mov_b32_e32 v0, s20
3014; GFX90A-NEXT:    v_mov_b32_e32 v5, v1
3015; GFX90A-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
3016; GFX90A-NEXT:    s_add_i32 s6, s20, 0x800
3017; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
3018; GFX90A-NEXT:    v_mov_b32_e32 v6, s6
3019; GFX90A-NEXT:  .LBB11_1: ; %atomicrmw.start
3020; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
3021; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3022; GFX90A-NEXT:    v_pk_mov_b32 v[10:11], v[0:1], v[0:1] op_sel:[0,1]
3023; GFX90A-NEXT:    v_add_f64 v[8:9], v[10:11], v[4:5]
3024; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1]
3025; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1]
3026; GFX90A-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
3027; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3028; GFX90A-NEXT:    buffer_wbinvl1
3029; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
3030; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3031; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3032; GFX90A-NEXT:    s_cbranch_execnz .LBB11_1
3033; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
3034; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
3035; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3036;
3037; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory:
3038; GFX908:       ; %bb.0:
3039; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3040; GFX908-NEXT:    v_mov_b32_e32 v4, v0
3041; GFX908-NEXT:    v_mov_b32_e32 v0, s20
3042; GFX908-NEXT:    v_mov_b32_e32 v5, v1
3043; GFX908-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
3044; GFX908-NEXT:    s_add_i32 s6, s20, 0x800
3045; GFX908-NEXT:    s_mov_b64 s[4:5], 0
3046; GFX908-NEXT:    v_mov_b32_e32 v6, s6
3047; GFX908-NEXT:  .LBB11_1: ; %atomicrmw.start
3048; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
3049; GFX908-NEXT:    s_waitcnt vmcnt(0)
3050; GFX908-NEXT:    v_mov_b32_e32 v10, v1
3051; GFX908-NEXT:    v_mov_b32_e32 v9, v0
3052; GFX908-NEXT:    v_add_f64 v[7:8], v[9:10], v[4:5]
3053; GFX908-NEXT:    v_mov_b32_e32 v0, v7
3054; GFX908-NEXT:    v_mov_b32_e32 v1, v8
3055; GFX908-NEXT:    v_mov_b32_e32 v2, v9
3056; GFX908-NEXT:    v_mov_b32_e32 v3, v10
3057; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
3058; GFX908-NEXT:    s_waitcnt vmcnt(0)
3059; GFX908-NEXT:    buffer_wbinvl1
3060; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
3061; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3062; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3063; GFX908-NEXT:    s_cbranch_execnz .LBB11_1
3064; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
3065; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
3066; GFX908-NEXT:    s_setpc_b64 s[30:31]
3067;
3068; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory:
3069; GFX8:       ; %bb.0:
3070; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3071; GFX8-NEXT:    v_mov_b32_e32 v4, v0
3072; GFX8-NEXT:    v_mov_b32_e32 v0, s20
3073; GFX8-NEXT:    v_mov_b32_e32 v5, v1
3074; GFX8-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
3075; GFX8-NEXT:    s_add_i32 s6, s20, 0x800
3076; GFX8-NEXT:    s_mov_b64 s[4:5], 0
3077; GFX8-NEXT:    v_mov_b32_e32 v6, s6
3078; GFX8-NEXT:  .LBB11_1: ; %atomicrmw.start
3079; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
3080; GFX8-NEXT:    s_waitcnt vmcnt(0)
3081; GFX8-NEXT:    v_mov_b32_e32 v10, v1
3082; GFX8-NEXT:    v_mov_b32_e32 v9, v0
3083; GFX8-NEXT:    v_add_f64 v[7:8], v[9:10], v[4:5]
3084; GFX8-NEXT:    v_mov_b32_e32 v0, v7
3085; GFX8-NEXT:    v_mov_b32_e32 v1, v8
3086; GFX8-NEXT:    v_mov_b32_e32 v2, v9
3087; GFX8-NEXT:    v_mov_b32_e32 v3, v10
3088; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
3089; GFX8-NEXT:    s_waitcnt vmcnt(0)
3090; GFX8-NEXT:    buffer_wbinvl1
3091; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
3092; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3093; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3094; GFX8-NEXT:    s_cbranch_execnz .LBB11_1
3095; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
3096; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
3097; GFX8-NEXT:    s_setpc_b64 s[30:31]
3098;
3099; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory:
3100; GFX7:       ; %bb.0:
3101; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3102; GFX7-NEXT:    v_mov_b32_e32 v4, v0
3103; GFX7-NEXT:    v_mov_b32_e32 v0, s20
3104; GFX7-NEXT:    v_mov_b32_e32 v5, v1
3105; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
3106; GFX7-NEXT:    s_add_i32 s6, s20, 0x800
3107; GFX7-NEXT:    s_mov_b64 s[4:5], 0
3108; GFX7-NEXT:    v_mov_b32_e32 v6, s6
3109; GFX7-NEXT:  .LBB11_1: ; %atomicrmw.start
3110; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
3111; GFX7-NEXT:    s_waitcnt vmcnt(0)
3112; GFX7-NEXT:    v_mov_b32_e32 v10, v1
3113; GFX7-NEXT:    v_mov_b32_e32 v9, v0
3114; GFX7-NEXT:    v_add_f64 v[7:8], v[9:10], v[4:5]
3115; GFX7-NEXT:    v_mov_b32_e32 v0, v7
3116; GFX7-NEXT:    v_mov_b32_e32 v1, v8
3117; GFX7-NEXT:    v_mov_b32_e32 v2, v9
3118; GFX7-NEXT:    v_mov_b32_e32 v3, v10
3119; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
3120; GFX7-NEXT:    s_waitcnt vmcnt(0)
3121; GFX7-NEXT:    buffer_wbinvl1
3122; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
3123; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3124; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3125; GFX7-NEXT:    s_cbranch_execnz .LBB11_1
3126; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
3127; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
3128; GFX7-NEXT:    s_setpc_b64 s[30:31]
3129;
3130; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory:
3131; GFX6:       ; %bb.0:
3132; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3133; GFX6-NEXT:    v_mov_b32_e32 v4, v0
3134; GFX6-NEXT:    v_mov_b32_e32 v0, s20
3135; GFX6-NEXT:    v_mov_b32_e32 v5, v1
3136; GFX6-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
3137; GFX6-NEXT:    s_add_i32 s6, s20, 0x800
3138; GFX6-NEXT:    s_mov_b64 s[4:5], 0
3139; GFX6-NEXT:    v_mov_b32_e32 v6, s6
3140; GFX6-NEXT:  .LBB11_1: ; %atomicrmw.start
3141; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
3142; GFX6-NEXT:    s_waitcnt vmcnt(0)
3143; GFX6-NEXT:    v_mov_b32_e32 v10, v1
3144; GFX6-NEXT:    v_mov_b32_e32 v9, v0
3145; GFX6-NEXT:    v_add_f64 v[7:8], v[9:10], v[4:5]
3146; GFX6-NEXT:    s_waitcnt expcnt(0)
3147; GFX6-NEXT:    v_mov_b32_e32 v0, v7
3148; GFX6-NEXT:    v_mov_b32_e32 v1, v8
3149; GFX6-NEXT:    v_mov_b32_e32 v2, v9
3150; GFX6-NEXT:    v_mov_b32_e32 v3, v10
3151; GFX6-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
3152; GFX6-NEXT:    s_waitcnt vmcnt(0)
3153; GFX6-NEXT:    buffer_wbinvl1
3154; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
3155; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3156; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3157; GFX6-NEXT:    s_cbranch_execnz .LBB11_1
3158; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
3159; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
3160; GFX6-NEXT:    s_waitcnt expcnt(0)
3161; GFX6-NEXT:    s_setpc_b64 s[30:31]
; Index 256 on a double pointer is 256 * 8 = 2048 bytes, which is the
; offset 2048 / 0x800 that appears in the expected instructions above.
3162  %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
3163  %result = atomicrmw fadd ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
3164  ret double %result
3165}
3166
3167define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, double %val) #0 {
3168; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
3169; GFX12:       ; %bb.0:
3170; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
3171; GFX12-NEXT:    s_wait_expcnt 0x0
3172; GFX12-NEXT:    s_wait_samplecnt 0x0
3173; GFX12-NEXT:    s_wait_bvhcnt 0x0
3174; GFX12-NEXT:    s_wait_kmcnt 0x0
3175; GFX12-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
3176; GFX12-NEXT:    v_mov_b32_e32 v0, s16
3177; GFX12-NEXT:    s_add_co_i32 s4, s16, 0x800
3178; GFX12-NEXT:    s_wait_alu 0xfffe
3179; GFX12-NEXT:    v_mov_b32_e32 v6, s4
3180; GFX12-NEXT:    s_mov_b32 s4, 0
3181; GFX12-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
3182; GFX12-NEXT:  .LBB12_1: ; %atomicrmw.start
3183; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
3184; GFX12-NEXT:    s_wait_loadcnt 0x0
3185; GFX12-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
3186; GFX12-NEXT:    s_wait_storecnt 0x0
3187; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3188; GFX12-NEXT:    v_add_f64_e32 v[7:8], v[9:10], v[4:5]
3189; GFX12-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
3190; GFX12-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
3191; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
3192; GFX12-NEXT:    s_wait_loadcnt 0x0
3193; GFX12-NEXT:    global_inv scope:SCOPE_DEV
3194; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
3195; GFX12-NEXT:    s_wait_alu 0xfffe
3196; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
3197; GFX12-NEXT:    s_wait_alu 0xfffe
3198; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
3199; GFX12-NEXT:    s_cbranch_execnz .LBB12_1
3200; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
3201; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3202; GFX12-NEXT:    s_wait_alu 0xfffe
3203; GFX12-NEXT:    s_setpc_b64 s[30:31]
3204;
3205; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
3206; GFX940:       ; %bb.0:
3207; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3208; GFX940-NEXT:    v_mov_b32_e32 v2, s16
3209; GFX940-NEXT:    buffer_wbl2 sc1
3210; GFX940-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0
3211; GFX940-NEXT:    s_waitcnt vmcnt(0)
3212; GFX940-NEXT:    buffer_inv sc1
3213; GFX940-NEXT:    s_setpc_b64 s[30:31]
3214;
3215; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
3216; GFX11:       ; %bb.0:
3217; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3218; GFX11-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
3219; GFX11-NEXT:    v_mov_b32_e32 v0, s16
3220; GFX11-NEXT:    s_add_i32 s4, s16, 0x800
3221; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
3222; GFX11-NEXT:    v_mov_b32_e32 v6, s4
3223; GFX11-NEXT:    s_mov_b32 s4, 0
3224; GFX11-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
3225; GFX11-NEXT:  .LBB12_1: ; %atomicrmw.start
3226; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
3227; GFX11-NEXT:    s_waitcnt vmcnt(0)
3228; GFX11-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
3229; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3230; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3231; GFX11-NEXT:    v_add_f64 v[7:8], v[9:10], v[4:5]
3232; GFX11-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
3233; GFX11-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
3234; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
3235; GFX11-NEXT:    s_waitcnt vmcnt(0)
3236; GFX11-NEXT:    buffer_gl1_inv
3237; GFX11-NEXT:    buffer_gl0_inv
3238; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
3239; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
3240; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
3241; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
3242; GFX11-NEXT:    s_cbranch_execnz .LBB12_1
3243; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
3244; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3245; GFX11-NEXT:    s_setpc_b64 s[30:31]
3246;
3247; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
3248; GFX10:       ; %bb.0:
3249; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3250; GFX10-NEXT:    v_mov_b32_e32 v4, v0
3251; GFX10-NEXT:    v_mov_b32_e32 v0, s20
3252; GFX10-NEXT:    v_mov_b32_e32 v5, v1
3253; GFX10-NEXT:    s_add_i32 s4, s20, 0x800
3254; GFX10-NEXT:    v_mov_b32_e32 v6, s4
3255; GFX10-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
3256; GFX10-NEXT:    s_mov_b32 s4, 0
3257; GFX10-NEXT:  .LBB12_1: ; %atomicrmw.start
3258; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
3259; GFX10-NEXT:    s_waitcnt vmcnt(0)
3260; GFX10-NEXT:    v_mov_b32_e32 v10, v1
3261; GFX10-NEXT:    v_mov_b32_e32 v9, v0
3262; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3263; GFX10-NEXT:    v_add_f64 v[7:8], v[9:10], v[4:5]
3264; GFX10-NEXT:    v_mov_b32_e32 v0, v7
3265; GFX10-NEXT:    v_mov_b32_e32 v1, v8
3266; GFX10-NEXT:    v_mov_b32_e32 v2, v9
3267; GFX10-NEXT:    v_mov_b32_e32 v3, v10
3268; GFX10-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
3269; GFX10-NEXT:    s_waitcnt vmcnt(0)
3270; GFX10-NEXT:    buffer_gl1_inv
3271; GFX10-NEXT:    buffer_gl0_inv
3272; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
3273; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
3274; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
3275; GFX10-NEXT:    s_cbranch_execnz .LBB12_1
3276; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
3277; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3278; GFX10-NEXT:    s_setpc_b64 s[30:31]
3279;
3280; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
3281; GFX90A:       ; %bb.0:
3282; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3283; GFX90A-NEXT:    v_mov_b32_e32 v2, s20
3284; GFX90A-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[16:19], 0 offen offset:2048 glc
3285; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3286; GFX90A-NEXT:    buffer_wbinvl1
3287; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3288;
3289; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
3290; GFX908:       ; %bb.0:
3291; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3292; GFX908-NEXT:    v_mov_b32_e32 v4, v0
3293; GFX908-NEXT:    v_mov_b32_e32 v0, s20
3294; GFX908-NEXT:    v_mov_b32_e32 v5, v1
3295; GFX908-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
3296; GFX908-NEXT:    s_add_i32 s6, s20, 0x800
3297; GFX908-NEXT:    s_mov_b64 s[4:5], 0
3298; GFX908-NEXT:    v_mov_b32_e32 v6, s6
3299; GFX908-NEXT:  .LBB12_1: ; %atomicrmw.start
3300; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
3301; GFX908-NEXT:    s_waitcnt vmcnt(0)
3302; GFX908-NEXT:    v_mov_b32_e32 v10, v1
3303; GFX908-NEXT:    v_mov_b32_e32 v9, v0
3304; GFX908-NEXT:    v_add_f64 v[7:8], v[9:10], v[4:5]
3305; GFX908-NEXT:    v_mov_b32_e32 v0, v7
3306; GFX908-NEXT:    v_mov_b32_e32 v1, v8
3307; GFX908-NEXT:    v_mov_b32_e32 v2, v9
3308; GFX908-NEXT:    v_mov_b32_e32 v3, v10
3309; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
3310; GFX908-NEXT:    s_waitcnt vmcnt(0)
3311; GFX908-NEXT:    buffer_wbinvl1
3312; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
3313; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3314; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3315; GFX908-NEXT:    s_cbranch_execnz .LBB12_1
3316; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
3317; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
3318; GFX908-NEXT:    s_setpc_b64 s[30:31]
3319;
3320; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
3321; GFX8:       ; %bb.0:
3322; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3323; GFX8-NEXT:    v_mov_b32_e32 v4, v0
3324; GFX8-NEXT:    v_mov_b32_e32 v0, s20
3325; GFX8-NEXT:    v_mov_b32_e32 v5, v1
3326; GFX8-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
3327; GFX8-NEXT:    s_add_i32 s6, s20, 0x800
3328; GFX8-NEXT:    s_mov_b64 s[4:5], 0
3329; GFX8-NEXT:    v_mov_b32_e32 v6, s6
3330; GFX8-NEXT:  .LBB12_1: ; %atomicrmw.start
3331; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
3332; GFX8-NEXT:    s_waitcnt vmcnt(0)
3333; GFX8-NEXT:    v_mov_b32_e32 v10, v1
3334; GFX8-NEXT:    v_mov_b32_e32 v9, v0
3335; GFX8-NEXT:    v_add_f64 v[7:8], v[9:10], v[4:5]
3336; GFX8-NEXT:    v_mov_b32_e32 v0, v7
3337; GFX8-NEXT:    v_mov_b32_e32 v1, v8
3338; GFX8-NEXT:    v_mov_b32_e32 v2, v9
3339; GFX8-NEXT:    v_mov_b32_e32 v3, v10
3340; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
3341; GFX8-NEXT:    s_waitcnt vmcnt(0)
3342; GFX8-NEXT:    buffer_wbinvl1
3343; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
3344; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3345; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3346; GFX8-NEXT:    s_cbranch_execnz .LBB12_1
3347; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
3348; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
3349; GFX8-NEXT:    s_setpc_b64 s[30:31]
3350;
3351; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
3352; GFX7:       ; %bb.0:
3353; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3354; GFX7-NEXT:    v_mov_b32_e32 v4, v0
3355; GFX7-NEXT:    v_mov_b32_e32 v0, s20
3356; GFX7-NEXT:    v_mov_b32_e32 v5, v1
3357; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
3358; GFX7-NEXT:    s_add_i32 s6, s20, 0x800
3359; GFX7-NEXT:    s_mov_b64 s[4:5], 0
3360; GFX7-NEXT:    v_mov_b32_e32 v6, s6
3361; GFX7-NEXT:  .LBB12_1: ; %atomicrmw.start
3362; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
3363; GFX7-NEXT:    s_waitcnt vmcnt(0)
3364; GFX7-NEXT:    v_mov_b32_e32 v10, v1
3365; GFX7-NEXT:    v_mov_b32_e32 v9, v0
3366; GFX7-NEXT:    v_add_f64 v[7:8], v[9:10], v[4:5]
3367; GFX7-NEXT:    v_mov_b32_e32 v0, v7
3368; GFX7-NEXT:    v_mov_b32_e32 v1, v8
3369; GFX7-NEXT:    v_mov_b32_e32 v2, v9
3370; GFX7-NEXT:    v_mov_b32_e32 v3, v10
3371; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
3372; GFX7-NEXT:    s_waitcnt vmcnt(0)
3373; GFX7-NEXT:    buffer_wbinvl1
3374; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
3375; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3376; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3377; GFX7-NEXT:    s_cbranch_execnz .LBB12_1
3378; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
3379; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
3380; GFX7-NEXT:    s_setpc_b64 s[30:31]
3381;
3382; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
3383; GFX6:       ; %bb.0:
3384; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3385; GFX6-NEXT:    v_mov_b32_e32 v4, v0
3386; GFX6-NEXT:    v_mov_b32_e32 v0, s20
3387; GFX6-NEXT:    v_mov_b32_e32 v5, v1
3388; GFX6-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
3389; GFX6-NEXT:    s_add_i32 s6, s20, 0x800
3390; GFX6-NEXT:    s_mov_b64 s[4:5], 0
3391; GFX6-NEXT:    v_mov_b32_e32 v6, s6
3392; GFX6-NEXT:  .LBB12_1: ; %atomicrmw.start
3393; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
3394; GFX6-NEXT:    s_waitcnt vmcnt(0)
3395; GFX6-NEXT:    v_mov_b32_e32 v10, v1
3396; GFX6-NEXT:    v_mov_b32_e32 v9, v0
3397; GFX6-NEXT:    v_add_f64 v[7:8], v[9:10], v[4:5]
3398; GFX6-NEXT:    s_waitcnt expcnt(0)
3399; GFX6-NEXT:    v_mov_b32_e32 v0, v7
3400; GFX6-NEXT:    v_mov_b32_e32 v1, v8
3401; GFX6-NEXT:    v_mov_b32_e32 v2, v9
3402; GFX6-NEXT:    v_mov_b32_e32 v3, v10
3403; GFX6-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
3404; GFX6-NEXT:    s_waitcnt vmcnt(0)
3405; GFX6-NEXT:    buffer_wbinvl1
3406; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
3407; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3408; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3409; GFX6-NEXT:    s_cbranch_execnz .LBB12_1
3410; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
3411; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
3412; GFX6-NEXT:    s_waitcnt expcnt(0)
3413; GFX6-NEXT:    s_setpc_b64 s[30:31]
3414  %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
3415  %result = atomicrmw fadd ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
3416  ret double %result
3417}
3418
3419; --------------------------------------------------------------------
3420; half
3421; --------------------------------------------------------------------
3422
; Value-returning f16 atomicrmw fadd through a buffer fat pointer
; (ptr addrspace(7), passed inreg), agent syncscope, seq_cst ordering, tagged
; with !amdgpu.no.fine.grained.memory. The access is 256 halves (0x200 bytes)
; past %ptr, and the previous memory value is returned.
;
; In the expected output below, every tested target expands the operation into
; a 32-bit compare-and-swap loop on the dword that contains the half:
;   * the byte offset is rounded down with "& -4" to address that dword,
;   * (offset & 3) * 8 is the bit shift of the half inside the dword,
;   * ~(0xffff << shift) keeps the neighbouring bits when the new value is merged,
;   * the loop repeats until the value returned by cmpswap equals the value
;     that was expected, and the result is shifted back down afterwards.
; The tahiti and hawaii runs perform the addition in f32 (v_cvt_f32_f16,
; v_add_f32, v_cvt_f16_f32); the remaining runs use v_add_f16 directly.
;
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing by hand.
3423define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 {
3424; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
3425; GFX12:       ; %bb.0:
3426; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
3427; GFX12-NEXT:    s_wait_expcnt 0x0
3428; GFX12-NEXT:    s_wait_samplecnt 0x0
3429; GFX12-NEXT:    s_wait_bvhcnt 0x0
3430; GFX12-NEXT:    s_wait_kmcnt 0x0
3431; GFX12-NEXT:    s_addk_co_i32 s16, 0x200
3432; GFX12-NEXT:    s_wait_alu 0xfffe
3433; GFX12-NEXT:    s_and_b32 s4, s16, -4
3434; GFX12-NEXT:    s_wait_alu 0xfffe
3435; GFX12-NEXT:    v_mov_b32_e32 v5, s4
3436; GFX12-NEXT:    s_and_b32 s4, s16, 3
3437; GFX12-NEXT:    s_wait_alu 0xfffe
3438; GFX12-NEXT:    s_lshl_b32 s4, s4, 3
3439; GFX12-NEXT:    s_wait_alu 0xfffe
3440; GFX12-NEXT:    s_lshl_b32 s5, 0xffff, s4
3441; GFX12-NEXT:    buffer_load_b32 v2, v5, s[0:3], null offen
3442; GFX12-NEXT:    s_wait_alu 0xfffe
3443; GFX12-NEXT:    s_not_b32 s6, s5
3444; GFX12-NEXT:    s_mov_b32 s5, 0
3445; GFX12-NEXT:  .LBB13_1: ; %atomicrmw.start
3446; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
3447; GFX12-NEXT:    s_wait_loadcnt 0x0
3448; GFX12-NEXT:    v_lshrrev_b32_e32 v1, s4, v2
3449; GFX12-NEXT:    s_wait_storecnt 0x0
3450; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3451; GFX12-NEXT:    v_add_f16_e32 v1, v1, v0
3452; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v1
3453; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
3454; GFX12-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
3455; GFX12-NEXT:    s_wait_alu 0xfffe
3456; GFX12-NEXT:    v_and_or_b32 v1, v2, s6, v1
3457; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3458; GFX12-NEXT:    v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
3459; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
3460; GFX12-NEXT:    s_wait_loadcnt 0x0
3461; GFX12-NEXT:    global_inv scope:SCOPE_DEV
3462; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v2
3463; GFX12-NEXT:    v_mov_b32_e32 v2, v3
3464; GFX12-NEXT:    s_or_b32 s5, vcc_lo, s5
3465; GFX12-NEXT:    s_wait_alu 0xfffe
3466; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
3467; GFX12-NEXT:    s_cbranch_execnz .LBB13_1
3468; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
3469; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s5
3470; GFX12-NEXT:    v_lshrrev_b32_e32 v0, s4, v3
3471; GFX12-NEXT:    s_wait_alu 0xfffe
3472; GFX12-NEXT:    s_setpc_b64 s[30:31]
3473;
3474; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
3475; GFX940:       ; %bb.0:
3476; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3477; GFX940-NEXT:    s_addk_i32 s16, 0x200
3478; GFX940-NEXT:    s_and_b32 s4, s16, -4
3479; GFX940-NEXT:    v_mov_b32_e32 v1, s4
3480; GFX940-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen
3481; GFX940-NEXT:    s_and_b32 s4, s16, 3
3482; GFX940-NEXT:    s_lshl_b32 s6, s4, 3
3483; GFX940-NEXT:    s_lshl_b32 s4, 0xffff, s6
3484; GFX940-NEXT:    s_not_b32 s7, s4
3485; GFX940-NEXT:    s_mov_b64 s[4:5], 0
3486; GFX940-NEXT:  .LBB13_1: ; %atomicrmw.start
3487; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
3488; GFX940-NEXT:    s_waitcnt vmcnt(0)
3489; GFX940-NEXT:    v_lshrrev_b32_e32 v2, s6, v3
3490; GFX940-NEXT:    v_add_f16_e32 v2, v2, v0
3491; GFX940-NEXT:    v_lshlrev_b32_e32 v2, s6, v2
3492; GFX940-NEXT:    v_and_or_b32 v2, v3, s7, v2
3493; GFX940-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
3494; GFX940-NEXT:    buffer_wbl2 sc1
3495; GFX940-NEXT:    buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0
3496; GFX940-NEXT:    s_waitcnt vmcnt(0)
3497; GFX940-NEXT:    buffer_inv sc1
3498; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
3499; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3500; GFX940-NEXT:    v_mov_b32_e32 v3, v4
3501; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3502; GFX940-NEXT:    s_cbranch_execnz .LBB13_1
3503; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
3504; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
3505; GFX940-NEXT:    v_lshrrev_b32_e32 v0, s6, v4
3506; GFX940-NEXT:    s_setpc_b64 s[30:31]
3507;
3508; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
3509; GFX11:       ; %bb.0:
3510; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3511; GFX11-NEXT:    s_addk_i32 s16, 0x200
3512; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
3513; GFX11-NEXT:    s_and_b32 s4, s16, -4
3514; GFX11-NEXT:    v_mov_b32_e32 v5, s4
3515; GFX11-NEXT:    s_and_b32 s4, s16, 3
3516; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
3517; GFX11-NEXT:    s_lshl_b32 s4, s4, 3
3518; GFX11-NEXT:    s_lshl_b32 s5, 0xffff, s4
3519; GFX11-NEXT:    buffer_load_b32 v2, v5, s[0:3], 0 offen
3520; GFX11-NEXT:    s_not_b32 s6, s5
3521; GFX11-NEXT:    s_mov_b32 s5, 0
3522; GFX11-NEXT:  .LBB13_1: ; %atomicrmw.start
3523; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
3524; GFX11-NEXT:    s_waitcnt vmcnt(0)
3525; GFX11-NEXT:    v_lshrrev_b32_e32 v1, s4, v2
3526; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3527; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3528; GFX11-NEXT:    v_add_f16_e32 v1, v1, v0
3529; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
3530; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3531; GFX11-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
3532; GFX11-NEXT:    v_and_or_b32 v1, v2, s6, v1
3533; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3534; GFX11-NEXT:    v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
3535; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
3536; GFX11-NEXT:    s_waitcnt vmcnt(0)
3537; GFX11-NEXT:    buffer_gl1_inv
3538; GFX11-NEXT:    buffer_gl0_inv
3539; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v2
3540; GFX11-NEXT:    v_mov_b32_e32 v2, v3
3541; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
3542; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
3543; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
3544; GFX11-NEXT:    s_cbranch_execnz .LBB13_1
3545; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
3546; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s5
3547; GFX11-NEXT:    v_lshrrev_b32_e32 v0, s4, v3
3548; GFX11-NEXT:    s_setpc_b64 s[30:31]
3549;
3550; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
3551; GFX10:       ; %bb.0:
3552; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3553; GFX10-NEXT:    s_addk_i32 s20, 0x200
3554; GFX10-NEXT:    s_and_b32 s4, s20, -4
3555; GFX10-NEXT:    v_mov_b32_e32 v5, s4
3556; GFX10-NEXT:    s_and_b32 s4, s20, 3
3557; GFX10-NEXT:    s_lshl_b32 s4, s4, 3
3558; GFX10-NEXT:    s_lshl_b32 s5, 0xffff, s4
3559; GFX10-NEXT:    buffer_load_dword v2, v5, s[16:19], 0 offen
3560; GFX10-NEXT:    s_not_b32 s6, s5
3561; GFX10-NEXT:    s_mov_b32 s5, 0
3562; GFX10-NEXT:  .LBB13_1: ; %atomicrmw.start
3563; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
3564; GFX10-NEXT:    s_waitcnt vmcnt(0)
3565; GFX10-NEXT:    v_lshrrev_b32_e32 v1, s4, v2
3566; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3567; GFX10-NEXT:    v_add_f16_e32 v1, v1, v0
3568; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
3569; GFX10-NEXT:    v_and_or_b32 v1, v2, s6, v1
3570; GFX10-NEXT:    v_mov_b32_e32 v4, v2
3571; GFX10-NEXT:    v_mov_b32_e32 v3, v1
3572; GFX10-NEXT:    buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
3573; GFX10-NEXT:    s_waitcnt vmcnt(0)
3574; GFX10-NEXT:    buffer_gl1_inv
3575; GFX10-NEXT:    buffer_gl0_inv
3576; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v2
3577; GFX10-NEXT:    v_mov_b32_e32 v2, v3
3578; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
3579; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
3580; GFX10-NEXT:    s_cbranch_execnz .LBB13_1
3581; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
3582; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
3583; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s4, v3
3584; GFX10-NEXT:    s_setpc_b64 s[30:31]
3585;
3586; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
3587; GFX90A:       ; %bb.0:
3588; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3589; GFX90A-NEXT:    s_addk_i32 s20, 0x200
3590; GFX90A-NEXT:    s_and_b32 s4, s20, -4
3591; GFX90A-NEXT:    v_mov_b32_e32 v1, s4
3592; GFX90A-NEXT:    buffer_load_dword v3, v1, s[16:19], 0 offen
3593; GFX90A-NEXT:    s_and_b32 s4, s20, 3
3594; GFX90A-NEXT:    s_lshl_b32 s6, s4, 3
3595; GFX90A-NEXT:    s_lshl_b32 s4, 0xffff, s6
3596; GFX90A-NEXT:    s_not_b32 s7, s4
3597; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
3598; GFX90A-NEXT:  .LBB13_1: ; %atomicrmw.start
3599; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
3600; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3601; GFX90A-NEXT:    v_lshrrev_b32_e32 v2, s6, v3
3602; GFX90A-NEXT:    v_add_f16_e32 v2, v2, v0
3603; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, s6, v2
3604; GFX90A-NEXT:    v_and_or_b32 v2, v3, s7, v2
3605; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
3606; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc
3607; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3608; GFX90A-NEXT:    buffer_wbinvl1
3609; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
3610; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3611; GFX90A-NEXT:    v_mov_b32_e32 v3, v4
3612; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3613; GFX90A-NEXT:    s_cbranch_execnz .LBB13_1
3614; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
3615; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
3616; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, s6, v4
3617; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3618;
3619; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
3620; GFX908:       ; %bb.0:
3621; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3622; GFX908-NEXT:    s_addk_i32 s20, 0x200
3623; GFX908-NEXT:    s_and_b32 s4, s20, -4
3624; GFX908-NEXT:    v_mov_b32_e32 v5, s4
3625; GFX908-NEXT:    buffer_load_dword v2, v5, s[16:19], 0 offen
3626; GFX908-NEXT:    s_and_b32 s4, s20, 3
3627; GFX908-NEXT:    s_lshl_b32 s6, s4, 3
3628; GFX908-NEXT:    s_lshl_b32 s4, 0xffff, s6
3629; GFX908-NEXT:    s_not_b32 s7, s4
3630; GFX908-NEXT:    s_mov_b64 s[4:5], 0
3631; GFX908-NEXT:  .LBB13_1: ; %atomicrmw.start
3632; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
3633; GFX908-NEXT:    s_waitcnt vmcnt(0)
3634; GFX908-NEXT:    v_lshrrev_b32_e32 v1, s6, v2
3635; GFX908-NEXT:    v_add_f16_e32 v1, v1, v0
3636; GFX908-NEXT:    v_lshlrev_b32_e32 v1, s6, v1
3637; GFX908-NEXT:    v_and_or_b32 v1, v2, s7, v1
3638; GFX908-NEXT:    v_mov_b32_e32 v4, v2
3639; GFX908-NEXT:    v_mov_b32_e32 v3, v1
3640; GFX908-NEXT:    buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
3641; GFX908-NEXT:    s_waitcnt vmcnt(0)
3642; GFX908-NEXT:    buffer_wbinvl1
3643; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
3644; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3645; GFX908-NEXT:    v_mov_b32_e32 v2, v3
3646; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3647; GFX908-NEXT:    s_cbranch_execnz .LBB13_1
3648; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
3649; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
3650; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s6, v3
3651; GFX908-NEXT:    s_setpc_b64 s[30:31]
3652;
3653; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
3654; GFX8:       ; %bb.0:
3655; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3656; GFX8-NEXT:    s_addk_i32 s20, 0x200
3657; GFX8-NEXT:    s_and_b32 s4, s20, -4
3658; GFX8-NEXT:    v_mov_b32_e32 v5, s4
3659; GFX8-NEXT:    buffer_load_dword v2, v5, s[16:19], 0 offen
3660; GFX8-NEXT:    s_and_b32 s4, s20, 3
3661; GFX8-NEXT:    s_lshl_b32 s6, s4, 3
3662; GFX8-NEXT:    s_lshl_b32 s4, 0xffff, s6
3663; GFX8-NEXT:    s_not_b32 s7, s4
3664; GFX8-NEXT:    s_mov_b64 s[4:5], 0
3665; GFX8-NEXT:  .LBB13_1: ; %atomicrmw.start
3666; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
3667; GFX8-NEXT:    s_waitcnt vmcnt(0)
3668; GFX8-NEXT:    v_lshrrev_b32_e32 v1, s6, v2
3669; GFX8-NEXT:    v_add_f16_e32 v1, v1, v0
3670; GFX8-NEXT:    v_and_b32_e32 v3, s7, v2
3671; GFX8-NEXT:    v_lshlrev_b32_e32 v1, s6, v1
3672; GFX8-NEXT:    v_or_b32_e32 v1, v3, v1
3673; GFX8-NEXT:    v_mov_b32_e32 v4, v2
3674; GFX8-NEXT:    v_mov_b32_e32 v3, v1
3675; GFX8-NEXT:    buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
3676; GFX8-NEXT:    s_waitcnt vmcnt(0)
3677; GFX8-NEXT:    buffer_wbinvl1
3678; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
3679; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3680; GFX8-NEXT:    v_mov_b32_e32 v2, v3
3681; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3682; GFX8-NEXT:    s_cbranch_execnz .LBB13_1
3683; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
3684; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
3685; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s6, v3
3686; GFX8-NEXT:    s_setpc_b64 s[30:31]
3687;
3688; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
3689; GFX7:       ; %bb.0:
3690; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3691; GFX7-NEXT:    s_addk_i32 s20, 0x200
3692; GFX7-NEXT:    s_and_b32 s4, s20, -4
3693; GFX7-NEXT:    v_mov_b32_e32 v4, s4
3694; GFX7-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
3695; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
3696; GFX7-NEXT:    s_and_b32 s4, s20, 3
3697; GFX7-NEXT:    s_lshl_b32 s6, s4, 3
3698; GFX7-NEXT:    s_lshl_b32 s4, 0xffff, s6
3699; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v0
3700; GFX7-NEXT:    s_not_b32 s7, s4
3701; GFX7-NEXT:    s_mov_b64 s[4:5], 0
3702; GFX7-NEXT:  .LBB13_1: ; %atomicrmw.start
3703; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
3704; GFX7-NEXT:    s_waitcnt vmcnt(0)
3705; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
3706; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
3707; GFX7-NEXT:    v_and_b32_e32 v2, s7, v1
3708; GFX7-NEXT:    v_add_f32_e32 v0, v0, v5
3709; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
3710; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
3711; GFX7-NEXT:    v_or_b32_e32 v0, v2, v0
3712; GFX7-NEXT:    v_mov_b32_e32 v3, v1
3713; GFX7-NEXT:    v_mov_b32_e32 v2, v0
3714; GFX7-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
3715; GFX7-NEXT:    s_waitcnt vmcnt(0)
3716; GFX7-NEXT:    buffer_wbinvl1
3717; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
3718; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3719; GFX7-NEXT:    v_mov_b32_e32 v1, v2
3720; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3721; GFX7-NEXT:    s_cbranch_execnz .LBB13_1
3722; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
3723; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
3724; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
3725; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
3726; GFX7-NEXT:    s_setpc_b64 s[30:31]
3727;
3728; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
3729; GFX6:       ; %bb.0:
3730; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3731; GFX6-NEXT:    s_addk_i32 s20, 0x200
3732; GFX6-NEXT:    s_and_b32 s4, s20, -4
3733; GFX6-NEXT:    v_mov_b32_e32 v4, s4
3734; GFX6-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
3735; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
3736; GFX6-NEXT:    s_and_b32 s4, s20, 3
3737; GFX6-NEXT:    s_lshl_b32 s6, s4, 3
3738; GFX6-NEXT:    s_lshl_b32 s4, 0xffff, s6
3739; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v0
3740; GFX6-NEXT:    s_not_b32 s7, s4
3741; GFX6-NEXT:    s_mov_b64 s[4:5], 0
3742; GFX6-NEXT:  .LBB13_1: ; %atomicrmw.start
3743; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
3744; GFX6-NEXT:    s_waitcnt vmcnt(0)
3745; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
3746; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
3747; GFX6-NEXT:    s_waitcnt expcnt(0)
3748; GFX6-NEXT:    v_and_b32_e32 v2, s7, v1
3749; GFX6-NEXT:    v_add_f32_e32 v0, v0, v5
3750; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
3751; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
3752; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
3753; GFX6-NEXT:    v_mov_b32_e32 v3, v1
3754; GFX6-NEXT:    v_mov_b32_e32 v2, v0
3755; GFX6-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
3756; GFX6-NEXT:    s_waitcnt vmcnt(0)
3757; GFX6-NEXT:    buffer_wbinvl1
3758; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
3759; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3760; GFX6-NEXT:    v_mov_b32_e32 v1, v2
3761; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3762; GFX6-NEXT:    s_cbranch_execnz .LBB13_1
3763; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
3764; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
3765; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
3766; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
3767; GFX6-NEXT:    s_waitcnt expcnt(0)
3768; GFX6-NEXT:    s_setpc_b64 s[30:31]
3769  %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256
3770  %result = atomicrmw fadd ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
3771  ret half %result
3772}
3773
3774define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 {
3775; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory:
3776; GFX12:       ; %bb.0:
3777; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
3778; GFX12-NEXT:    s_wait_expcnt 0x0
3779; GFX12-NEXT:    s_wait_samplecnt 0x0
3780; GFX12-NEXT:    s_wait_bvhcnt 0x0
3781; GFX12-NEXT:    s_wait_kmcnt 0x0
3782; GFX12-NEXT:    s_addk_co_i32 s16, 0x200
3783; GFX12-NEXT:    s_wait_alu 0xfffe
3784; GFX12-NEXT:    s_and_b32 s4, s16, -4
3785; GFX12-NEXT:    s_wait_alu 0xfffe
3786; GFX12-NEXT:    v_mov_b32_e32 v3, s4
3787; GFX12-NEXT:    s_and_b32 s4, s16, 3
3788; GFX12-NEXT:    s_wait_alu 0xfffe
3789; GFX12-NEXT:    s_lshl_b32 s4, s4, 3
3790; GFX12-NEXT:    s_wait_alu 0xfffe
3791; GFX12-NEXT:    s_lshl_b32 s5, 0xffff, s4
3792; GFX12-NEXT:    buffer_load_b32 v2, v3, s[0:3], null offen
3793; GFX12-NEXT:    s_wait_alu 0xfffe
3794; GFX12-NEXT:    s_not_b32 s6, s5
3795; GFX12-NEXT:    s_mov_b32 s5, 0
3796; GFX12-NEXT:  .LBB14_1: ; %atomicrmw.start
3797; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
3798; GFX12-NEXT:    s_wait_loadcnt 0x0
3799; GFX12-NEXT:    v_lshrrev_b32_e32 v1, s4, v2
3800; GFX12-NEXT:    s_wait_storecnt 0x0
3801; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3802; GFX12-NEXT:    v_add_f16_e32 v1, v1, v0
3803; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v1
3804; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
3805; GFX12-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
3806; GFX12-NEXT:    s_wait_alu 0xfffe
3807; GFX12-NEXT:    v_and_or_b32 v1, v2, s6, v1
3808; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3809; GFX12-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
3810; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
3811; GFX12-NEXT:    s_wait_loadcnt 0x0
3812; GFX12-NEXT:    global_inv scope:SCOPE_DEV
3813; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v2
3814; GFX12-NEXT:    v_mov_b32_e32 v2, v4
3815; GFX12-NEXT:    s_or_b32 s5, vcc_lo, s5
3816; GFX12-NEXT:    s_wait_alu 0xfffe
3817; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
3818; GFX12-NEXT:    s_cbranch_execnz .LBB14_1
3819; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
3820; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s5
3821; GFX12-NEXT:    s_wait_alu 0xfffe
3822; GFX12-NEXT:    s_setpc_b64 s[30:31]
3823;
3824; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory:
3825; GFX940:       ; %bb.0:
3826; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3827; GFX940-NEXT:    s_addk_i32 s16, 0x200
3828; GFX940-NEXT:    s_and_b32 s4, s16, -4
3829; GFX940-NEXT:    v_mov_b32_e32 v1, s4
3830; GFX940-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen
3831; GFX940-NEXT:    s_and_b32 s4, s16, 3
3832; GFX940-NEXT:    s_lshl_b32 s6, s4, 3
3833; GFX940-NEXT:    s_lshl_b32 s4, 0xffff, s6
3834; GFX940-NEXT:    s_not_b32 s7, s4
3835; GFX940-NEXT:    s_mov_b64 s[4:5], 0
3836; GFX940-NEXT:  .LBB14_1: ; %atomicrmw.start
3837; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
3838; GFX940-NEXT:    s_waitcnt vmcnt(0)
3839; GFX940-NEXT:    v_lshrrev_b32_e32 v2, s6, v3
3840; GFX940-NEXT:    v_add_f16_e32 v2, v2, v0
3841; GFX940-NEXT:    v_lshlrev_b32_e32 v2, s6, v2
3842; GFX940-NEXT:    v_and_or_b32 v2, v3, s7, v2
3843; GFX940-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
3844; GFX940-NEXT:    buffer_wbl2 sc1
3845; GFX940-NEXT:    buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0
3846; GFX940-NEXT:    s_waitcnt vmcnt(0)
3847; GFX940-NEXT:    buffer_inv sc1
3848; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
3849; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3850; GFX940-NEXT:    v_mov_b32_e32 v3, v4
3851; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3852; GFX940-NEXT:    s_cbranch_execnz .LBB14_1
3853; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
3854; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
3855; GFX940-NEXT:    s_setpc_b64 s[30:31]
3856;
3857; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory:
3858; GFX11:       ; %bb.0:
3859; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3860; GFX11-NEXT:    s_addk_i32 s16, 0x200
3861; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
3862; GFX11-NEXT:    s_and_b32 s4, s16, -4
3863; GFX11-NEXT:    v_mov_b32_e32 v3, s4
3864; GFX11-NEXT:    s_and_b32 s4, s16, 3
3865; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
3866; GFX11-NEXT:    s_lshl_b32 s4, s4, 3
3867; GFX11-NEXT:    s_lshl_b32 s5, 0xffff, s4
3868; GFX11-NEXT:    buffer_load_b32 v2, v3, s[0:3], 0 offen
3869; GFX11-NEXT:    s_not_b32 s6, s5
3870; GFX11-NEXT:    s_mov_b32 s5, 0
3871; GFX11-NEXT:  .LBB14_1: ; %atomicrmw.start
3872; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
3873; GFX11-NEXT:    s_waitcnt vmcnt(0)
3874; GFX11-NEXT:    v_lshrrev_b32_e32 v1, s4, v2
3875; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3876; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3877; GFX11-NEXT:    v_add_f16_e32 v1, v1, v0
3878; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
3879; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3880; GFX11-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
3881; GFX11-NEXT:    v_and_or_b32 v1, v2, s6, v1
3882; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3883; GFX11-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
3884; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
3885; GFX11-NEXT:    s_waitcnt vmcnt(0)
3886; GFX11-NEXT:    buffer_gl1_inv
3887; GFX11-NEXT:    buffer_gl0_inv
3888; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v2
3889; GFX11-NEXT:    v_mov_b32_e32 v2, v4
3890; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
3891; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
3892; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
3893; GFX11-NEXT:    s_cbranch_execnz .LBB14_1
3894; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
3895; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s5
3896; GFX11-NEXT:    s_setpc_b64 s[30:31]
3897;
3898; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory:
3899; GFX10:       ; %bb.0:
3900; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3901; GFX10-NEXT:    s_addk_i32 s20, 0x200
3902; GFX10-NEXT:    s_and_b32 s4, s20, -4
3903; GFX10-NEXT:    v_mov_b32_e32 v3, s4
3904; GFX10-NEXT:    s_and_b32 s4, s20, 3
3905; GFX10-NEXT:    s_lshl_b32 s4, s4, 3
3906; GFX10-NEXT:    s_lshl_b32 s5, 0xffff, s4
3907; GFX10-NEXT:    buffer_load_dword v2, v3, s[16:19], 0 offen
3908; GFX10-NEXT:    s_not_b32 s6, s5
3909; GFX10-NEXT:    s_mov_b32 s5, 0
3910; GFX10-NEXT:  .LBB14_1: ; %atomicrmw.start
3911; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
3912; GFX10-NEXT:    s_waitcnt vmcnt(0)
3913; GFX10-NEXT:    v_lshrrev_b32_e32 v1, s4, v2
3914; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3915; GFX10-NEXT:    v_add_f16_e32 v1, v1, v0
3916; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
3917; GFX10-NEXT:    v_and_or_b32 v1, v2, s6, v1
3918; GFX10-NEXT:    v_mov_b32_e32 v5, v2
3919; GFX10-NEXT:    v_mov_b32_e32 v4, v1
3920; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
3921; GFX10-NEXT:    s_waitcnt vmcnt(0)
3922; GFX10-NEXT:    buffer_gl1_inv
3923; GFX10-NEXT:    buffer_gl0_inv
3924; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v2
3925; GFX10-NEXT:    v_mov_b32_e32 v2, v4
3926; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
3927; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
3928; GFX10-NEXT:    s_cbranch_execnz .LBB14_1
3929; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
3930; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
3931; GFX10-NEXT:    s_setpc_b64 s[30:31]
3932;
3933; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory:
3934; GFX90A:       ; %bb.0:
3935; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3936; GFX90A-NEXT:    s_addk_i32 s20, 0x200
3937; GFX90A-NEXT:    s_and_b32 s4, s20, -4
3938; GFX90A-NEXT:    v_mov_b32_e32 v1, s4
3939; GFX90A-NEXT:    buffer_load_dword v3, v1, s[16:19], 0 offen
3940; GFX90A-NEXT:    s_and_b32 s4, s20, 3
3941; GFX90A-NEXT:    s_lshl_b32 s6, s4, 3
3942; GFX90A-NEXT:    s_lshl_b32 s4, 0xffff, s6
3943; GFX90A-NEXT:    s_not_b32 s7, s4
3944; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
3945; GFX90A-NEXT:  .LBB14_1: ; %atomicrmw.start
3946; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
3947; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3948; GFX90A-NEXT:    v_lshrrev_b32_e32 v2, s6, v3
3949; GFX90A-NEXT:    v_add_f16_e32 v2, v2, v0
3950; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, s6, v2
3951; GFX90A-NEXT:    v_and_or_b32 v2, v3, s7, v2
3952; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
3953; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc
3954; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3955; GFX90A-NEXT:    buffer_wbinvl1
3956; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
3957; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3958; GFX90A-NEXT:    v_mov_b32_e32 v3, v4
3959; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3960; GFX90A-NEXT:    s_cbranch_execnz .LBB14_1
3961; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
3962; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
3963; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3964;
3965; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory:
3966; GFX908:       ; %bb.0:
3967; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3968; GFX908-NEXT:    s_addk_i32 s20, 0x200
3969; GFX908-NEXT:    s_and_b32 s4, s20, -4
3970; GFX908-NEXT:    v_mov_b32_e32 v3, s4
3971; GFX908-NEXT:    buffer_load_dword v2, v3, s[16:19], 0 offen
3972; GFX908-NEXT:    s_and_b32 s4, s20, 3
3973; GFX908-NEXT:    s_lshl_b32 s6, s4, 3
3974; GFX908-NEXT:    s_lshl_b32 s4, 0xffff, s6
3975; GFX908-NEXT:    s_not_b32 s7, s4
3976; GFX908-NEXT:    s_mov_b64 s[4:5], 0
3977; GFX908-NEXT:  .LBB14_1: ; %atomicrmw.start
3978; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
3979; GFX908-NEXT:    s_waitcnt vmcnt(0)
3980; GFX908-NEXT:    v_lshrrev_b32_e32 v1, s6, v2
3981; GFX908-NEXT:    v_add_f16_e32 v1, v1, v0
3982; GFX908-NEXT:    v_lshlrev_b32_e32 v1, s6, v1
3983; GFX908-NEXT:    v_and_or_b32 v1, v2, s7, v1
3984; GFX908-NEXT:    v_mov_b32_e32 v5, v2
3985; GFX908-NEXT:    v_mov_b32_e32 v4, v1
3986; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
3987; GFX908-NEXT:    s_waitcnt vmcnt(0)
3988; GFX908-NEXT:    buffer_wbinvl1
3989; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v2
3990; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3991; GFX908-NEXT:    v_mov_b32_e32 v2, v4
3992; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3993; GFX908-NEXT:    s_cbranch_execnz .LBB14_1
3994; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
3995; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
3996; GFX908-NEXT:    s_setpc_b64 s[30:31]
3997;
3998; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory:
3999; GFX8:       ; %bb.0:
4000; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4001; GFX8-NEXT:    s_addk_i32 s20, 0x200
4002; GFX8-NEXT:    s_and_b32 s4, s20, -4
4003; GFX8-NEXT:    v_mov_b32_e32 v3, s4
4004; GFX8-NEXT:    buffer_load_dword v2, v3, s[16:19], 0 offen
4005; GFX8-NEXT:    s_and_b32 s4, s20, 3
4006; GFX8-NEXT:    s_lshl_b32 s6, s4, 3
4007; GFX8-NEXT:    s_lshl_b32 s4, 0xffff, s6
4008; GFX8-NEXT:    s_not_b32 s7, s4
4009; GFX8-NEXT:    s_mov_b64 s[4:5], 0
4010; GFX8-NEXT:  .LBB14_1: ; %atomicrmw.start
4011; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
4012; GFX8-NEXT:    s_waitcnt vmcnt(0)
4013; GFX8-NEXT:    v_lshrrev_b32_e32 v1, s6, v2
4014; GFX8-NEXT:    v_add_f16_e32 v1, v1, v0
4015; GFX8-NEXT:    v_and_b32_e32 v4, s7, v2
4016; GFX8-NEXT:    v_lshlrev_b32_e32 v1, s6, v1
4017; GFX8-NEXT:    v_or_b32_e32 v1, v4, v1
4018; GFX8-NEXT:    v_mov_b32_e32 v5, v2
4019; GFX8-NEXT:    v_mov_b32_e32 v4, v1
4020; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
4021; GFX8-NEXT:    s_waitcnt vmcnt(0)
4022; GFX8-NEXT:    buffer_wbinvl1
4023; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v2
4024; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4025; GFX8-NEXT:    v_mov_b32_e32 v2, v4
4026; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4027; GFX8-NEXT:    s_cbranch_execnz .LBB14_1
4028; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
4029; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
4030; GFX8-NEXT:    s_setpc_b64 s[30:31]
4031;
4032; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory:
4033; GFX7:       ; %bb.0:
4034; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4035; GFX7-NEXT:    s_addk_i32 s20, 0x200
4036; GFX7-NEXT:    s_and_b32 s4, s20, -4
4037; GFX7-NEXT:    v_mov_b32_e32 v2, s4
4038; GFX7-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
4039; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
4040; GFX7-NEXT:    s_and_b32 s4, s20, 3
4041; GFX7-NEXT:    s_lshl_b32 s6, s4, 3
4042; GFX7-NEXT:    s_lshl_b32 s4, 0xffff, s6
4043; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v0
4044; GFX7-NEXT:    s_not_b32 s7, s4
4045; GFX7-NEXT:    s_mov_b64 s[4:5], 0
4046; GFX7-NEXT:  .LBB14_1: ; %atomicrmw.start
4047; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
4048; GFX7-NEXT:    s_waitcnt vmcnt(0)
4049; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
4050; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
4051; GFX7-NEXT:    v_and_b32_e32 v4, s7, v1
4052; GFX7-NEXT:    v_add_f32_e32 v0, v0, v3
4053; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
4054; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
4055; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
4056; GFX7-NEXT:    v_mov_b32_e32 v5, v1
4057; GFX7-NEXT:    v_mov_b32_e32 v4, v0
4058; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
4059; GFX7-NEXT:    s_waitcnt vmcnt(0)
4060; GFX7-NEXT:    buffer_wbinvl1
4061; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
4062; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4063; GFX7-NEXT:    v_mov_b32_e32 v1, v4
4064; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4065; GFX7-NEXT:    s_cbranch_execnz .LBB14_1
4066; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
4067; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
4068; GFX7-NEXT:    s_setpc_b64 s[30:31]
4069;
4070; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory:
4071; GFX6:       ; %bb.0:
4072; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4073; GFX6-NEXT:    s_addk_i32 s20, 0x200
4074; GFX6-NEXT:    s_and_b32 s4, s20, -4
4075; GFX6-NEXT:    v_mov_b32_e32 v2, s4
4076; GFX6-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
4077; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
4078; GFX6-NEXT:    s_and_b32 s4, s20, 3
4079; GFX6-NEXT:    s_lshl_b32 s6, s4, 3
4080; GFX6-NEXT:    s_lshl_b32 s4, 0xffff, s6
4081; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v0
4082; GFX6-NEXT:    s_not_b32 s7, s4
4083; GFX6-NEXT:    s_mov_b64 s[4:5], 0
4084; GFX6-NEXT:  .LBB14_1: ; %atomicrmw.start
4085; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
4086; GFX6-NEXT:    s_waitcnt vmcnt(0)
4087; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
4088; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
4089; GFX6-NEXT:    s_waitcnt expcnt(0)
4090; GFX6-NEXT:    v_and_b32_e32 v4, s7, v1
4091; GFX6-NEXT:    v_add_f32_e32 v0, v0, v3
4092; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
4093; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
4094; GFX6-NEXT:    v_or_b32_e32 v0, v4, v0
4095; GFX6-NEXT:    v_mov_b32_e32 v5, v1
4096; GFX6-NEXT:    v_mov_b32_e32 v4, v0
4097; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
4098; GFX6-NEXT:    s_waitcnt vmcnt(0)
4099; GFX6-NEXT:    buffer_wbinvl1
4100; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
4101; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4102; GFX6-NEXT:    v_mov_b32_e32 v1, v4
4103; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4104; GFX6-NEXT:    s_cbranch_execnz .LBB14_1
4105; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
4106; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
4107; GFX6-NEXT:    s_waitcnt expcnt(0)
4108; GFX6-NEXT:    s_setpc_b64 s[30:31]
4109  %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256
4110  %unused = atomicrmw fadd ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
4111  ret void
4112}
4113
4114define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, half %val) #0 {
4115; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
4116; GFX12:       ; %bb.0:
4117; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
4118; GFX12-NEXT:    s_wait_expcnt 0x0
4119; GFX12-NEXT:    s_wait_samplecnt 0x0
4120; GFX12-NEXT:    s_wait_bvhcnt 0x0
4121; GFX12-NEXT:    s_wait_kmcnt 0x0
4122; GFX12-NEXT:    v_add_nc_u32_e32 v6, 0x200, v4
4123; GFX12-NEXT:    s_mov_b32 s1, exec_lo
4124; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
4125; GFX12-NEXT:    v_and_b32_e32 v4, 3, v6
4126; GFX12-NEXT:    v_and_b32_e32 v10, -4, v6
4127; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
4128; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4129; GFX12-NEXT:    v_lshlrev_b32_e64 v7, v4, 0xffff
4130; GFX12-NEXT:    v_not_b32_e32 v11, v7
4131; GFX12-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
4132; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
4133; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
4134; GFX12-NEXT:    v_readfirstlane_b32 s6, v2
4135; GFX12-NEXT:    v_readfirstlane_b32 s7, v3
4136; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
4137; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
4138; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
4139; GFX12-NEXT:    s_wait_alu 0xfffe
4140; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4141; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
4142; GFX12-NEXT:    s_wait_alu 0xfffe
4143; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
4144; GFX12-NEXT:    s_wait_loadcnt 0x0
4145; GFX12-NEXT:    buffer_load_b32 v7, v10, s[4:7], null offen
4146; GFX12-NEXT:    s_wait_alu 0xfffe
4147; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
4148; GFX12-NEXT:    s_cbranch_execnz .LBB15_1
4149; GFX12-NEXT:  ; %bb.2:
4150; GFX12-NEXT:    s_mov_b32 exec_lo, s1
4151; GFX12-NEXT:    s_mov_b32 s1, 0
4152; GFX12-NEXT:  .LBB15_3: ; %atomicrmw.start
4153; GFX12-NEXT:    ; =>This Loop Header: Depth=1
4154; GFX12-NEXT:    ; Child Loop BB15_4 Depth 2
4155; GFX12-NEXT:    s_wait_loadcnt 0x0
4156; GFX12-NEXT:    v_lshrrev_b32_e32 v6, v4, v7
4157; GFX12-NEXT:    s_mov_b32 s2, exec_lo
4158; GFX12-NEXT:    s_wait_storecnt 0x0
4159; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4160; GFX12-NEXT:    v_add_f16_e32 v6, v6, v5
4161; GFX12-NEXT:    v_and_b32_e32 v6, 0xffff, v6
4162; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4163; GFX12-NEXT:    v_lshlrev_b32_e32 v6, v4, v6
4164; GFX12-NEXT:    v_and_or_b32 v6, v7, v11, v6
4165; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4166; GFX12-NEXT:    v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6
4167; GFX12-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
4168; GFX12-NEXT:    ; => This Inner Loop Header: Depth=2
4169; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
4170; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
4171; GFX12-NEXT:    v_readfirstlane_b32 s6, v2
4172; GFX12-NEXT:    v_readfirstlane_b32 s7, v3
4173; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
4174; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
4175; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
4176; GFX12-NEXT:    s_wait_alu 0xfffe
4177; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4178; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
4179; GFX12-NEXT:    s_wait_alu 0xfffe
4180; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
4181; GFX12-NEXT:    s_wait_loadcnt 0x0
4182; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN
4183; GFX12-NEXT:    s_wait_alu 0xfffe
4184; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
4185; GFX12-NEXT:    s_cbranch_execnz .LBB15_4
4186; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
4187; GFX12-NEXT:    s_mov_b32 exec_lo, s2
4188; GFX12-NEXT:    s_wait_loadcnt 0x0
4189; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v8, v7
4190; GFX12-NEXT:    v_mov_b32_e32 v7, v8
4191; GFX12-NEXT:    global_inv scope:SCOPE_DEV
4192; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
4193; GFX12-NEXT:    s_wait_alu 0xfffe
4194; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
4195; GFX12-NEXT:    s_cbranch_execnz .LBB15_3
4196; GFX12-NEXT:  ; %bb.6: ; %atomicrmw.end
4197; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
4198; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v4, v8
4199; GFX12-NEXT:    s_wait_alu 0xfffe
4200; GFX12-NEXT:    s_setpc_b64 s[30:31]
4201;
4202; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
4203; GFX940:       ; %bb.0:
4204; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4205; GFX940-NEXT:    v_add_u32_e32 v4, 0x200, v4
4206; GFX940-NEXT:    v_and_b32_e32 v10, -4, v4
4207; GFX940-NEXT:    v_and_b32_e32 v4, 3, v4
4208; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
4209; GFX940-NEXT:    s_mov_b32 s0, 0xffff
4210; GFX940-NEXT:    v_lshlrev_b32_e64 v6, v4, s0
4211; GFX940-NEXT:    v_not_b32_e32 v11, v6
4212; GFX940-NEXT:    s_mov_b64 s[2:3], exec
4213; GFX940-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
4214; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
4215; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
4216; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
4217; GFX940-NEXT:    v_readfirstlane_b32 s7, v3
4218; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
4219; GFX940-NEXT:    s_nop 0
4220; GFX940-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
4221; GFX940-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
4222; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
4223; GFX940-NEXT:    buffer_load_dword v7, v10, s[4:7], 0 offen
4224; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
4225; GFX940-NEXT:    s_cbranch_execnz .LBB15_1
4226; GFX940-NEXT:  ; %bb.2:
4227; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
4228; GFX940-NEXT:    s_mov_b64 s[2:3], 0
4229; GFX940-NEXT:  .LBB15_3: ; %atomicrmw.start
4230; GFX940-NEXT:    ; =>This Loop Header: Depth=1
4231; GFX940-NEXT:    ; Child Loop BB15_4 Depth 2
4232; GFX940-NEXT:    s_waitcnt vmcnt(0)
4233; GFX940-NEXT:    v_lshrrev_b32_e32 v6, v4, v7
4234; GFX940-NEXT:    v_add_f16_e32 v6, v6, v5
4235; GFX940-NEXT:    v_lshlrev_b32_e32 v6, v4, v6
4236; GFX940-NEXT:    v_and_or_b32 v6, v7, v11, v6
4237; GFX940-NEXT:    s_mov_b64 s[8:9], exec
4238; GFX940-NEXT:    v_mov_b64_e32 v[8:9], v[6:7]
4239; GFX940-NEXT:    buffer_wbl2 sc1
4240; GFX940-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
4241; GFX940-NEXT:    ; => This Inner Loop Header: Depth=2
4242; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
4243; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
4244; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
4245; GFX940-NEXT:    v_readfirstlane_b32 s7, v3
4246; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
4247; GFX940-NEXT:    s_nop 0
4248; GFX940-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
4249; GFX940-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
4250; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
4251; GFX940-NEXT:    s_waitcnt vmcnt(0)
4252; GFX940-NEXT:    buffer_atomic_cmpswap v[8:9], v10, s[4:7], 0 offen sc0
4253; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
4254; GFX940-NEXT:    s_cbranch_execnz .LBB15_4
4255; GFX940-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
4256; GFX940-NEXT:    s_mov_b64 exec, s[8:9]
4257; GFX940-NEXT:    s_waitcnt vmcnt(0)
4258; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v7
4259; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
4260; GFX940-NEXT:    v_mov_b32_e32 v7, v8
4261; GFX940-NEXT:    buffer_inv sc1
4262; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
4263; GFX940-NEXT:    s_cbranch_execnz .LBB15_3
4264; GFX940-NEXT:  ; %bb.6: ; %atomicrmw.end
4265; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
4266; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v4, v8
4267; GFX940-NEXT:    s_setpc_b64 s[30:31]
4268;
4269; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
4270; GFX11:       ; %bb.0:
4271; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4272; GFX11-NEXT:    v_add_nc_u32_e32 v6, 0x200, v4
4273; GFX11-NEXT:    s_mov_b32 s1, 0
4274; GFX11-NEXT:    s_mov_b32 s2, exec_lo
4275; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
4276; GFX11-NEXT:    v_and_b32_e32 v4, 3, v6
4277; GFX11-NEXT:    v_and_b32_e32 v10, -4, v6
4278; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
4279; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4280; GFX11-NEXT:    v_lshlrev_b32_e64 v7, v4, 0xffff
4281; GFX11-NEXT:    v_not_b32_e32 v11, v7
4282; GFX11-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
4283; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
4284; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
4285; GFX11-NEXT:    v_readfirstlane_b32 s6, v2
4286; GFX11-NEXT:    v_readfirstlane_b32 s7, v3
4287; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
4288; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
4289; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
4290; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
4291; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
4292; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
4293; GFX11-NEXT:    buffer_load_b32 v7, v10, s[4:7], 0 offen
4294; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
4295; GFX11-NEXT:    s_cbranch_execnz .LBB15_1
4296; GFX11-NEXT:  ; %bb.2:
4297; GFX11-NEXT:    s_mov_b32 exec_lo, s2
4298; GFX11-NEXT:    .p2align 6
4299; GFX11-NEXT:  .LBB15_3: ; %atomicrmw.start
4300; GFX11-NEXT:    ; =>This Loop Header: Depth=1
4301; GFX11-NEXT:    ; Child Loop BB15_4 Depth 2
4302; GFX11-NEXT:    s_waitcnt vmcnt(0)
4303; GFX11-NEXT:    v_lshrrev_b32_e32 v6, v4, v7
4304; GFX11-NEXT:    s_mov_b32 s2, exec_lo
4305; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
4306; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4307; GFX11-NEXT:    v_add_f16_e32 v6, v6, v5
4308; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
4309; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4310; GFX11-NEXT:    v_lshlrev_b32_e32 v6, v4, v6
4311; GFX11-NEXT:    v_and_or_b32 v6, v7, v11, v6
4312; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4313; GFX11-NEXT:    v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6
4314; GFX11-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
4315; GFX11-NEXT:    ; => This Inner Loop Header: Depth=2
4316; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
4317; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
4318; GFX11-NEXT:    v_readfirstlane_b32 s6, v2
4319; GFX11-NEXT:    v_readfirstlane_b32 s7, v3
4320; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
4321; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
4322; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
4323; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
4324; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
4325; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
4326; GFX11-NEXT:    s_waitcnt vmcnt(0)
4327; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc
4328; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
4329; GFX11-NEXT:    s_cbranch_execnz .LBB15_4
4330; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
4331; GFX11-NEXT:    s_mov_b32 exec_lo, s2
4332; GFX11-NEXT:    s_waitcnt vmcnt(0)
4333; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v8, v7
4334; GFX11-NEXT:    v_mov_b32_e32 v7, v8
4335; GFX11-NEXT:    buffer_gl1_inv
4336; GFX11-NEXT:    buffer_gl0_inv
4337; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
4338; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
4339; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
4340; GFX11-NEXT:    s_cbranch_execnz .LBB15_3
4341; GFX11-NEXT:  ; %bb.6: ; %atomicrmw.end
4342; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
4343; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v4, v8
4344; GFX11-NEXT:    s_setpc_b64 s[30:31]
4345;
4346; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
4347; GFX10:       ; %bb.0:
4348; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4349; GFX10-NEXT:    v_add_nc_u32_e32 v6, 0x200, v4
4350; GFX10-NEXT:    s_mov_b32 s5, 0
4351; GFX10-NEXT:    s_mov_b32 s6, exec_lo
4352; GFX10-NEXT:    v_and_b32_e32 v4, 3, v6
4353; GFX10-NEXT:    v_and_b32_e32 v10, -4, v6
4354; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
4355; GFX10-NEXT:    v_lshlrev_b32_e64 v7, v4, 0xffff
4356; GFX10-NEXT:    v_not_b32_e32 v11, v7
4357; GFX10-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
4358; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
4359; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
4360; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
4361; GFX10-NEXT:    v_readfirstlane_b32 s11, v3
4362; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
4363; GFX10-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
4364; GFX10-NEXT:    s_and_b32 s4, vcc_lo, s4
4365; GFX10-NEXT:    s_and_saveexec_b32 s4, s4
4366; GFX10-NEXT:    buffer_load_dword v7, v10, s[8:11], 0 offen
4367; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
4368; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
4369; GFX10-NEXT:    s_cbranch_execnz .LBB15_1
4370; GFX10-NEXT:  ; %bb.2:
4371; GFX10-NEXT:    s_mov_b32 exec_lo, s6
4372; GFX10-NEXT:  .LBB15_3: ; %atomicrmw.start
4373; GFX10-NEXT:    ; =>This Loop Header: Depth=1
4374; GFX10-NEXT:    ; Child Loop BB15_4 Depth 2
4375; GFX10-NEXT:    s_waitcnt vmcnt(0)
4376; GFX10-NEXT:    v_lshrrev_b32_e32 v6, v4, v7
4377; GFX10-NEXT:    s_mov_b32 s6, exec_lo
4378; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4379; GFX10-NEXT:    v_add_f16_e32 v6, v6, v5
4380; GFX10-NEXT:    v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
4381; GFX10-NEXT:    v_and_or_b32 v6, v7, v11, v6
4382; GFX10-NEXT:    v_mov_b32_e32 v9, v7
4383; GFX10-NEXT:    v_mov_b32_e32 v8, v6
4384; GFX10-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
4385; GFX10-NEXT:    ; => This Inner Loop Header: Depth=2
4386; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
4387; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
4388; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
4389; GFX10-NEXT:    v_readfirstlane_b32 s11, v3
4390; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
4391; GFX10-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
4392; GFX10-NEXT:    s_and_b32 s4, vcc_lo, s4
4393; GFX10-NEXT:    s_and_saveexec_b32 s4, s4
4394; GFX10-NEXT:    s_waitcnt vmcnt(0)
4395; GFX10-NEXT:    buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc
4396; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
4397; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
4398; GFX10-NEXT:    s_cbranch_execnz .LBB15_4
4399; GFX10-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
4400; GFX10-NEXT:    s_mov_b32 exec_lo, s6
4401; GFX10-NEXT:    s_waitcnt vmcnt(0)
4402; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v8, v7
4403; GFX10-NEXT:    v_mov_b32_e32 v7, v8
4404; GFX10-NEXT:    buffer_gl1_inv
4405; GFX10-NEXT:    buffer_gl0_inv
4406; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
4407; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
4408; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
4409; GFX10-NEXT:    s_cbranch_execnz .LBB15_3
4410; GFX10-NEXT:  ; %bb.6: ; %atomicrmw.end
4411; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
4412; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v4, v8
4413; GFX10-NEXT:    s_setpc_b64 s[30:31]
4414;
4415; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
4416; GFX90A:       ; %bb.0:
4417; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4418; GFX90A-NEXT:    v_add_u32_e32 v4, 0x200, v4
4419; GFX90A-NEXT:    v_and_b32_e32 v10, -4, v4
4420; GFX90A-NEXT:    v_and_b32_e32 v4, 3, v4
4421; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
4422; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
4423; GFX90A-NEXT:    v_lshlrev_b32_e64 v6, v4, s4
4424; GFX90A-NEXT:    v_not_b32_e32 v11, v6
4425; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
4426; GFX90A-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
4427; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
4428; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
4429; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
4430; GFX90A-NEXT:    v_readfirstlane_b32 s11, v3
4431; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
4432; GFX90A-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
4433; GFX90A-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
4434; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
4435; GFX90A-NEXT:    s_nop 0
4436; GFX90A-NEXT:    buffer_load_dword v7, v10, s[8:11], 0 offen
4437; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
4438; GFX90A-NEXT:    s_cbranch_execnz .LBB15_1
4439; GFX90A-NEXT:  ; %bb.2:
4440; GFX90A-NEXT:    s_mov_b64 exec, s[6:7]
4441; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
4442; GFX90A-NEXT:  .LBB15_3: ; %atomicrmw.start
4443; GFX90A-NEXT:    ; =>This Loop Header: Depth=1
4444; GFX90A-NEXT:    ; Child Loop BB15_4 Depth 2
4445; GFX90A-NEXT:    s_waitcnt vmcnt(0)
4446; GFX90A-NEXT:    v_lshrrev_b32_e32 v6, v4, v7
4447; GFX90A-NEXT:    v_add_f16_e32 v6, v6, v5
4448; GFX90A-NEXT:    v_lshlrev_b32_e32 v6, v4, v6
4449; GFX90A-NEXT:    v_and_or_b32 v6, v7, v11, v6
4450; GFX90A-NEXT:    s_mov_b64 s[12:13], exec
4451; GFX90A-NEXT:    v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
4452; GFX90A-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
4453; GFX90A-NEXT:    ; => This Inner Loop Header: Depth=2
4454; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
4455; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
4456; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
4457; GFX90A-NEXT:    v_readfirstlane_b32 s11, v3
4458; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
4459; GFX90A-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
4460; GFX90A-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
4461; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
4462; GFX90A-NEXT:    s_waitcnt vmcnt(0)
4463; GFX90A-NEXT:    buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc
4464; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
4465; GFX90A-NEXT:    s_cbranch_execnz .LBB15_4
4466; GFX90A-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
4467; GFX90A-NEXT:    s_mov_b64 exec, s[12:13]
4468; GFX90A-NEXT:    s_waitcnt vmcnt(0)
4469; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v7
4470; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
4471; GFX90A-NEXT:    v_mov_b32_e32 v7, v8
4472; GFX90A-NEXT:    buffer_wbinvl1
4473; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
4474; GFX90A-NEXT:    s_cbranch_execnz .LBB15_3
4475; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.end
4476; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
4477; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v4, v8
4478; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4479;
4480; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
4481; GFX908:       ; %bb.0:
4482; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4483; GFX908-NEXT:    v_add_u32_e32 v4, 0x200, v4
4484; GFX908-NEXT:    v_and_b32_e32 v10, -4, v4
4485; GFX908-NEXT:    v_and_b32_e32 v4, 3, v4
4486; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
4487; GFX908-NEXT:    s_mov_b32 s4, 0xffff
4488; GFX908-NEXT:    v_lshlrev_b32_e64 v6, v4, s4
4489; GFX908-NEXT:    v_not_b32_e32 v11, v6
4490; GFX908-NEXT:    s_mov_b64 s[6:7], exec
4491; GFX908-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
4492; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
4493; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
4494; GFX908-NEXT:    v_readfirstlane_b32 s10, v2
4495; GFX908-NEXT:    v_readfirstlane_b32 s11, v3
4496; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
4497; GFX908-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
4498; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
4499; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
4500; GFX908-NEXT:    s_nop 0
4501; GFX908-NEXT:    buffer_load_dword v7, v10, s[8:11], 0 offen
4502; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
4503; GFX908-NEXT:    s_cbranch_execnz .LBB15_1
4504; GFX908-NEXT:  ; %bb.2:
4505; GFX908-NEXT:    s_mov_b64 exec, s[6:7]
4506; GFX908-NEXT:    s_mov_b64 s[6:7], 0
4507; GFX908-NEXT:  .LBB15_3: ; %atomicrmw.start
4508; GFX908-NEXT:    ; =>This Loop Header: Depth=1
4509; GFX908-NEXT:    ; Child Loop BB15_4 Depth 2
4510; GFX908-NEXT:    s_waitcnt vmcnt(0)
4511; GFX908-NEXT:    v_lshrrev_b32_e32 v6, v4, v7
4512; GFX908-NEXT:    v_add_f16_e32 v6, v6, v5
4513; GFX908-NEXT:    v_lshlrev_b32_e32 v6, v4, v6
4514; GFX908-NEXT:    v_and_or_b32 v6, v7, v11, v6
4515; GFX908-NEXT:    v_mov_b32_e32 v9, v7
4516; GFX908-NEXT:    s_mov_b64 s[12:13], exec
4517; GFX908-NEXT:    v_mov_b32_e32 v8, v6
4518; GFX908-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
4519; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
4520; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
4521; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
4522; GFX908-NEXT:    v_readfirstlane_b32 s10, v2
4523; GFX908-NEXT:    v_readfirstlane_b32 s11, v3
4524; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
4525; GFX908-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
4526; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
4527; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
4528; GFX908-NEXT:    s_waitcnt vmcnt(0)
4529; GFX908-NEXT:    buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc
4530; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
4531; GFX908-NEXT:    s_cbranch_execnz .LBB15_4
4532; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
4533; GFX908-NEXT:    s_mov_b64 exec, s[12:13]
4534; GFX908-NEXT:    s_waitcnt vmcnt(0)
4535; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v7
4536; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
4537; GFX908-NEXT:    v_mov_b32_e32 v7, v8
4538; GFX908-NEXT:    buffer_wbinvl1
4539; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
4540; GFX908-NEXT:    s_cbranch_execnz .LBB15_3
4541; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.end
4542; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
4543; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v4, v8
4544; GFX908-NEXT:    s_setpc_b64 s[30:31]
4545;
4546; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
4547; GFX8:       ; %bb.0:
4548; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4549; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x200, v4
4550; GFX8-NEXT:    v_and_b32_e32 v10, -4, v4
4551; GFX8-NEXT:    v_and_b32_e32 v4, 3, v4
4552; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
4553; GFX8-NEXT:    s_mov_b32 s4, 0xffff
4554; GFX8-NEXT:    v_lshlrev_b32_e64 v6, v4, s4
4555; GFX8-NEXT:    v_not_b32_e32 v11, v6
4556; GFX8-NEXT:    s_mov_b64 s[6:7], exec
4557; GFX8-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
4558; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
4559; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
4560; GFX8-NEXT:    v_readfirstlane_b32 s10, v2
4561; GFX8-NEXT:    v_readfirstlane_b32 s11, v3
4562; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
4563; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
4564; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
4565; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
4566; GFX8-NEXT:    s_nop 0
4567; GFX8-NEXT:    buffer_load_dword v7, v10, s[8:11], 0 offen
4568; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
4569; GFX8-NEXT:    s_cbranch_execnz .LBB15_1
4570; GFX8-NEXT:  ; %bb.2:
4571; GFX8-NEXT:    s_mov_b64 exec, s[6:7]
4572; GFX8-NEXT:    s_mov_b64 s[6:7], 0
4573; GFX8-NEXT:  .LBB15_3: ; %atomicrmw.start
4574; GFX8-NEXT:    ; =>This Loop Header: Depth=1
4575; GFX8-NEXT:    ; Child Loop BB15_4 Depth 2
4576; GFX8-NEXT:    s_waitcnt vmcnt(0)
4577; GFX8-NEXT:    v_lshrrev_b32_e32 v6, v4, v7
4578; GFX8-NEXT:    v_add_f16_e32 v6, v6, v5
4579; GFX8-NEXT:    v_lshlrev_b32_e32 v6, v4, v6
4580; GFX8-NEXT:    v_and_b32_e32 v8, v7, v11
4581; GFX8-NEXT:    v_or_b32_e32 v6, v8, v6
4582; GFX8-NEXT:    v_mov_b32_e32 v9, v7
4583; GFX8-NEXT:    s_mov_b64 s[12:13], exec
4584; GFX8-NEXT:    v_mov_b32_e32 v8, v6
4585; GFX8-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
4586; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
4587; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
4588; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
4589; GFX8-NEXT:    v_readfirstlane_b32 s10, v2
4590; GFX8-NEXT:    v_readfirstlane_b32 s11, v3
4591; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
4592; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
4593; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
4594; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
4595; GFX8-NEXT:    s_waitcnt vmcnt(0)
4596; GFX8-NEXT:    buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc
4597; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
4598; GFX8-NEXT:    s_cbranch_execnz .LBB15_4
4599; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
4600; GFX8-NEXT:    s_mov_b64 exec, s[12:13]
4601; GFX8-NEXT:    s_waitcnt vmcnt(0)
4602; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v7
4603; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
4604; GFX8-NEXT:    v_mov_b32_e32 v7, v8
4605; GFX8-NEXT:    buffer_wbinvl1
4606; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
4607; GFX8-NEXT:    s_cbranch_execnz .LBB15_3
4608; GFX8-NEXT:  ; %bb.6: ; %atomicrmw.end
4609; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
4610; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v4, v8
4611; GFX8-NEXT:    s_setpc_b64 s[30:31]
4612;
4613; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
4614; GFX7:       ; %bb.0:
4615; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4616; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x200, v4
4617; GFX7-NEXT:    v_and_b32_e32 v8, -4, v4
4618; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
4619; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 3, v4
4620; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v7
4621; GFX7-NEXT:    v_not_b32_e32 v9, v4
4622; GFX7-NEXT:    s_mov_b64 s[6:7], exec
4623; GFX7-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
4624; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
4625; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
4626; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
4627; GFX7-NEXT:    v_readfirstlane_b32 s11, v3
4628; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
4629; GFX7-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
4630; GFX7-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
4631; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
4632; GFX7-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
4633; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
4634; GFX7-NEXT:    s_cbranch_execnz .LBB15_1
4635; GFX7-NEXT:  ; %bb.2:
4636; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
4637; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v5
4638; GFX7-NEXT:    s_mov_b64 s[6:7], 0
4639; GFX7-NEXT:    v_cvt_f32_f16_e32 v10, v4
4640; GFX7-NEXT:  .LBB15_3: ; %atomicrmw.start
4641; GFX7-NEXT:    ; =>This Loop Header: Depth=1
4642; GFX7-NEXT:    ; Child Loop BB15_4 Depth 2
4643; GFX7-NEXT:    s_waitcnt vmcnt(0)
4644; GFX7-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
4645; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
4646; GFX7-NEXT:    v_and_b32_e32 v5, v6, v9
4647; GFX7-NEXT:    s_mov_b64 s[12:13], exec
4648; GFX7-NEXT:    v_add_f32_e32 v4, v4, v10
4649; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
4650; GFX7-NEXT:    v_lshlrev_b32_e32 v4, v7, v4
4651; GFX7-NEXT:    v_or_b32_e32 v5, v5, v4
4652; GFX7-NEXT:    v_mov_b32_e32 v4, v5
4653; GFX7-NEXT:    v_mov_b32_e32 v5, v6
4654; GFX7-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
4655; GFX7-NEXT:    ; => This Inner Loop Header: Depth=2
4656; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
4657; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
4658; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
4659; GFX7-NEXT:    v_readfirstlane_b32 s11, v3
4660; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
4661; GFX7-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
4662; GFX7-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
4663; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
4664; GFX7-NEXT:    s_waitcnt vmcnt(0)
4665; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
4666; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
4667; GFX7-NEXT:    s_cbranch_execnz .LBB15_4
4668; GFX7-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
4669; GFX7-NEXT:    s_mov_b64 exec, s[12:13]
4670; GFX7-NEXT:    s_waitcnt vmcnt(0)
4671; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
4672; GFX7-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
4673; GFX7-NEXT:    v_mov_b32_e32 v6, v4
4674; GFX7-NEXT:    buffer_wbinvl1
4675; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
4676; GFX7-NEXT:    s_cbranch_execnz .LBB15_3
4677; GFX7-NEXT:  ; %bb.6: ; %atomicrmw.end
4678; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
4679; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
4680; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
4681; GFX7-NEXT:    s_setpc_b64 s[30:31]
4682;
4683; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
4684; GFX6:       ; %bb.0:
4685; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4686; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x200, v4
4687; GFX6-NEXT:    v_and_b32_e32 v8, -4, v4
4688; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
4689; GFX6-NEXT:    v_lshlrev_b32_e32 v7, 3, v4
4690; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v7
4691; GFX6-NEXT:    v_not_b32_e32 v9, v4
4692; GFX6-NEXT:    s_mov_b64 s[6:7], exec
4693; GFX6-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
4694; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
4695; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
4696; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
4697; GFX6-NEXT:    v_readfirstlane_b32 s11, v3
4698; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
4699; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
4700; GFX6-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
4701; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
4702; GFX6-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
4703; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
4704; GFX6-NEXT:    s_cbranch_execnz .LBB15_1
4705; GFX6-NEXT:  ; %bb.2:
4706; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
4707; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v5
4708; GFX6-NEXT:    s_mov_b64 s[6:7], 0
4709; GFX6-NEXT:    v_cvt_f32_f16_e32 v10, v4
4710; GFX6-NEXT:  .LBB15_3: ; %atomicrmw.start
4711; GFX6-NEXT:    ; =>This Loop Header: Depth=1
4712; GFX6-NEXT:    ; Child Loop BB15_4 Depth 2
4713; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
4714; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
4715; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
4716; GFX6-NEXT:    v_and_b32_e32 v5, v6, v9
4717; GFX6-NEXT:    s_mov_b64 s[12:13], exec
4718; GFX6-NEXT:    v_add_f32_e32 v4, v4, v10
4719; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
4720; GFX6-NEXT:    v_lshlrev_b32_e32 v4, v7, v4
4721; GFX6-NEXT:    v_or_b32_e32 v5, v5, v4
4722; GFX6-NEXT:    v_mov_b32_e32 v4, v5
4723; GFX6-NEXT:    v_mov_b32_e32 v5, v6
4724; GFX6-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
4725; GFX6-NEXT:    ; => This Inner Loop Header: Depth=2
4726; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
4727; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
4728; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
4729; GFX6-NEXT:    v_readfirstlane_b32 s11, v3
4730; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
4731; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
4732; GFX6-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
4733; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
4734; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
4735; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
4736; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
4737; GFX6-NEXT:    s_cbranch_execnz .LBB15_4
4738; GFX6-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
4739; GFX6-NEXT:    s_mov_b64 exec, s[12:13]
4740; GFX6-NEXT:    s_waitcnt vmcnt(0)
4741; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
4742; GFX6-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
4743; GFX6-NEXT:    v_mov_b32_e32 v6, v4
4744; GFX6-NEXT:    buffer_wbinvl1
4745; GFX6-NEXT:    s_andn2_b64 exec, exec, s[6:7]
4746; GFX6-NEXT:    s_cbranch_execnz .LBB15_3
4747; GFX6-NEXT:  ; %bb.6: ; %atomicrmw.end
4748; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
4749; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
4750; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
4751; GFX6-NEXT:    s_waitcnt expcnt(0)
4752; GFX6-NEXT:    s_setpc_b64 s[30:31]
4753  %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256
4754  %result = atomicrmw fadd ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
4755  ret half %result
4756}
4757
4758; --------------------------------------------------------------------
4759; bfloat
4760; --------------------------------------------------------------------
4761
; Returning bfloat atomicrmw fadd through a buffer fat pointer (ptr addrspace(7)),
; agent scope, seq_cst, tagged !amdgpu.no.fine.grained.memory.
; The address is %ptr plus 256 bfloat elements, i.e. 512 bytes; this is the
; 0x200 added to the offset register in the checks below.
; Every target checked here expands the operation into a compare-and-swap loop
; (atomicrmw.start / atomicrmw.end) on the dword-aligned word that contains the
; 16-bit value: the offset is masked with -4, the low two offset bits times 8
; give the shift amount, and 0xffff shifted by that amount (inverted) masks out
; the halfword being replaced. The add itself is an f32 add on the value moved
; into the upper 16 bits, and the function returns the old word shifted back down.
; In the gfx9 and newer checks plus tonga, the f32 sum is narrowed with a
; bfe / add 0x7fff / cndmask sequence; the hawaii and tahiti checks only shift
; the sum right by 16.
4762define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 {
4763; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
4764; GFX12:       ; %bb.0:
4765; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
4766; GFX12-NEXT:    s_wait_expcnt 0x0
4767; GFX12-NEXT:    s_wait_samplecnt 0x0
4768; GFX12-NEXT:    s_wait_bvhcnt 0x0
4769; GFX12-NEXT:    s_wait_kmcnt 0x0
4770; GFX12-NEXT:    s_addk_co_i32 s16, 0x200
4771; GFX12-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
4772; GFX12-NEXT:    s_wait_alu 0xfffe
4773; GFX12-NEXT:    s_and_b32 s4, s16, -4
4774; GFX12-NEXT:    s_wait_alu 0xfffe
4775; GFX12-NEXT:    v_mov_b32_e32 v4, s4
4776; GFX12-NEXT:    s_and_b32 s4, s16, 3
4777; GFX12-NEXT:    s_wait_alu 0xfffe
4778; GFX12-NEXT:    s_lshl_b32 s4, s4, 3
4779; GFX12-NEXT:    s_wait_alu 0xfffe
4780; GFX12-NEXT:    s_lshl_b32 s5, 0xffff, s4
4781; GFX12-NEXT:    buffer_load_b32 v1, v4, s[0:3], null offen
4782; GFX12-NEXT:    s_wait_alu 0xfffe
4783; GFX12-NEXT:    s_not_b32 s6, s5
4784; GFX12-NEXT:    s_mov_b32 s5, 0
4785; GFX12-NEXT:  .LBB16_1: ; %atomicrmw.start
4786; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
4787; GFX12-NEXT:    s_wait_loadcnt 0x0
4788; GFX12-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
4789; GFX12-NEXT:    s_wait_storecnt 0x0
4790; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4791; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
4792; GFX12-NEXT:    v_add_f32_e32 v0, v0, v5
4793; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
4794; GFX12-NEXT:    v_bfe_u32 v2, v0, 16, 1
4795; GFX12-NEXT:    v_or_b32_e32 v3, 0x400000, v0
4796; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
4797; GFX12-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
4798; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4799; GFX12-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
4800; GFX12-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
4801; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
4802; GFX12-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
4803; GFX12-NEXT:    s_wait_alu 0xfffe
4804; GFX12-NEXT:    v_and_or_b32 v0, v1, s6, v0
4805; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4806; GFX12-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
4807; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
4808; GFX12-NEXT:    s_wait_loadcnt 0x0
4809; GFX12-NEXT:    global_inv scope:SCOPE_DEV
4810; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
4811; GFX12-NEXT:    v_mov_b32_e32 v1, v2
4812; GFX12-NEXT:    s_or_b32 s5, vcc_lo, s5
4813; GFX12-NEXT:    s_wait_alu 0xfffe
4814; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
4815; GFX12-NEXT:    s_cbranch_execnz .LBB16_1
4816; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
4817; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s5
4818; GFX12-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
4819; GFX12-NEXT:    s_wait_alu 0xfffe
4820; GFX12-NEXT:    s_setpc_b64 s[30:31]
4821;
4822; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
4823; GFX940:       ; %bb.0:
4824; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4825; GFX940-NEXT:    s_addk_i32 s16, 0x200
4826; GFX940-NEXT:    s_and_b32 s4, s16, -4
4827; GFX940-NEXT:    v_mov_b32_e32 v4, s4
4828; GFX940-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen
4829; GFX940-NEXT:    s_and_b32 s4, s16, 3
4830; GFX940-NEXT:    s_lshl_b32 s6, s4, 3
4831; GFX940-NEXT:    s_lshl_b32 s4, 0xffff, s6
4832; GFX940-NEXT:    s_not_b32 s7, s4
4833; GFX940-NEXT:    s_mov_b64 s[4:5], 0
4834; GFX940-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
4835; GFX940-NEXT:    s_movk_i32 s8, 0x7fff
4836; GFX940-NEXT:  .LBB16_1: ; %atomicrmw.start
4837; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
4838; GFX940-NEXT:    s_waitcnt vmcnt(0)
4839; GFX940-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
4840; GFX940-NEXT:    buffer_wbl2 sc1
4841; GFX940-NEXT:    v_add_f32_e32 v0, v0, v5
4842; GFX940-NEXT:    v_bfe_u32 v2, v0, 16, 1
4843; GFX940-NEXT:    v_or_b32_e32 v3, 0x400000, v0
4844; GFX940-NEXT:    v_add3_u32 v2, v2, v0, s8
4845; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
4846; GFX940-NEXT:    s_nop 1
4847; GFX940-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
4848; GFX940-NEXT:    v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
4849; GFX940-NEXT:    v_and_or_b32 v0, v1, s7, v0
4850; GFX940-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
4851; GFX940-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0
4852; GFX940-NEXT:    s_waitcnt vmcnt(0)
4853; GFX940-NEXT:    buffer_inv sc1
4854; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
4855; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4856; GFX940-NEXT:    v_mov_b32_e32 v1, v2
4857; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4858; GFX940-NEXT:    s_cbranch_execnz .LBB16_1
4859; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
4860; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
4861; GFX940-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
4862; GFX940-NEXT:    s_setpc_b64 s[30:31]
4863;
4864; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
4865; GFX11:       ; %bb.0:
4866; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4867; GFX11-NEXT:    s_addk_i32 s16, 0x200
4868; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
4869; GFX11-NEXT:    s_and_b32 s4, s16, -4
4870; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
4871; GFX11-NEXT:    v_mov_b32_e32 v4, s4
4872; GFX11-NEXT:    s_and_b32 s4, s16, 3
4873; GFX11-NEXT:    s_lshl_b32 s4, s4, 3
4874; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
4875; GFX11-NEXT:    s_lshl_b32 s5, 0xffff, s4
4876; GFX11-NEXT:    buffer_load_b32 v1, v4, s[0:3], 0 offen
4877; GFX11-NEXT:    s_not_b32 s6, s5
4878; GFX11-NEXT:    s_mov_b32 s5, 0
4879; GFX11-NEXT:    .p2align 6
4880; GFX11-NEXT:  .LBB16_1: ; %atomicrmw.start
4881; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
4882; GFX11-NEXT:    s_waitcnt vmcnt(0)
4883; GFX11-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
4884; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
4885; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4886; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
4887; GFX11-NEXT:    v_add_f32_e32 v0, v0, v5
4888; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
4889; GFX11-NEXT:    v_bfe_u32 v2, v0, 16, 1
4890; GFX11-NEXT:    v_or_b32_e32 v3, 0x400000, v0
4891; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
4892; GFX11-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
4893; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4894; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
4895; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
4896; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4897; GFX11-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
4898; GFX11-NEXT:    v_and_or_b32 v0, v1, s6, v0
4899; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4900; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
4901; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
4902; GFX11-NEXT:    s_waitcnt vmcnt(0)
4903; GFX11-NEXT:    buffer_gl1_inv
4904; GFX11-NEXT:    buffer_gl0_inv
4905; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
4906; GFX11-NEXT:    v_mov_b32_e32 v1, v2
4907; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
4908; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
4909; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
4910; GFX11-NEXT:    s_cbranch_execnz .LBB16_1
4911; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
4912; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s5
4913; GFX11-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
4914; GFX11-NEXT:    s_setpc_b64 s[30:31]
4915;
4916; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
4917; GFX10:       ; %bb.0:
4918; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4919; GFX10-NEXT:    s_addk_i32 s20, 0x200
4920; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
4921; GFX10-NEXT:    s_and_b32 s4, s20, -4
4922; GFX10-NEXT:    v_mov_b32_e32 v4, s4
4923; GFX10-NEXT:    s_and_b32 s4, s20, 3
4924; GFX10-NEXT:    s_lshl_b32 s4, s4, 3
4925; GFX10-NEXT:    s_lshl_b32 s5, 0xffff, s4
4926; GFX10-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
4927; GFX10-NEXT:    s_not_b32 s6, s5
4928; GFX10-NEXT:    s_mov_b32 s5, 0
4929; GFX10-NEXT:  .LBB16_1: ; %atomicrmw.start
4930; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
4931; GFX10-NEXT:    s_waitcnt vmcnt(0)
4932; GFX10-NEXT:    v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
4933; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4934; GFX10-NEXT:    v_add_f32_e32 v0, v0, v5
4935; GFX10-NEXT:    v_bfe_u32 v2, v0, 16, 1
4936; GFX10-NEXT:    v_or_b32_e32 v3, 0x400000, v0
4937; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
4938; GFX10-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
4939; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
4940; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
4941; GFX10-NEXT:    v_and_or_b32 v0, v1, s6, v0
4942; GFX10-NEXT:    v_mov_b32_e32 v3, v1
4943; GFX10-NEXT:    v_mov_b32_e32 v2, v0
4944; GFX10-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
4945; GFX10-NEXT:    s_waitcnt vmcnt(0)
4946; GFX10-NEXT:    buffer_gl1_inv
4947; GFX10-NEXT:    buffer_gl0_inv
4948; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
4949; GFX10-NEXT:    v_mov_b32_e32 v1, v2
4950; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
4951; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
4952; GFX10-NEXT:    s_cbranch_execnz .LBB16_1
4953; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
4954; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
4955; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
4956; GFX10-NEXT:    s_setpc_b64 s[30:31]
4957;
4958; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
4959; GFX90A:       ; %bb.0:
4960; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4961; GFX90A-NEXT:    s_addk_i32 s20, 0x200
4962; GFX90A-NEXT:    s_and_b32 s4, s20, -4
4963; GFX90A-NEXT:    v_mov_b32_e32 v4, s4
4964; GFX90A-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
4965; GFX90A-NEXT:    s_and_b32 s4, s20, 3
4966; GFX90A-NEXT:    s_lshl_b32 s6, s4, 3
4967; GFX90A-NEXT:    s_lshl_b32 s4, 0xffff, s6
4968; GFX90A-NEXT:    s_not_b32 s7, s4
4969; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
4970; GFX90A-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
4971; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
4972; GFX90A-NEXT:  .LBB16_1: ; %atomicrmw.start
4973; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
4974; GFX90A-NEXT:    s_waitcnt vmcnt(0)
4975; GFX90A-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
4976; GFX90A-NEXT:    v_add_f32_e32 v0, v0, v5
4977; GFX90A-NEXT:    v_bfe_u32 v2, v0, 16, 1
4978; GFX90A-NEXT:    v_or_b32_e32 v3, 0x400000, v0
4979; GFX90A-NEXT:    v_add3_u32 v2, v2, v0, s8
4980; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
4981; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
4982; GFX90A-NEXT:    v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
4983; GFX90A-NEXT:    v_and_or_b32 v0, v1, s7, v0
4984; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
4985; GFX90A-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
4986; GFX90A-NEXT:    s_waitcnt vmcnt(0)
4987; GFX90A-NEXT:    buffer_wbinvl1
4988; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
4989; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4990; GFX90A-NEXT:    v_mov_b32_e32 v1, v2
4991; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4992; GFX90A-NEXT:    s_cbranch_execnz .LBB16_1
4993; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
4994; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
4995; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
4996; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4997;
4998; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
4999; GFX908:       ; %bb.0:
5000; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5001; GFX908-NEXT:    s_addk_i32 s20, 0x200
5002; GFX908-NEXT:    s_and_b32 s4, s20, -4
5003; GFX908-NEXT:    v_mov_b32_e32 v4, s4
5004; GFX908-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
5005; GFX908-NEXT:    s_and_b32 s4, s20, 3
5006; GFX908-NEXT:    s_lshl_b32 s6, s4, 3
5007; GFX908-NEXT:    s_lshl_b32 s4, 0xffff, s6
5008; GFX908-NEXT:    s_not_b32 s7, s4
5009; GFX908-NEXT:    s_mov_b64 s[4:5], 0
5010; GFX908-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
5011; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
5012; GFX908-NEXT:  .LBB16_1: ; %atomicrmw.start
5013; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
5014; GFX908-NEXT:    s_waitcnt vmcnt(0)
5015; GFX908-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
5016; GFX908-NEXT:    v_add_f32_e32 v0, v0, v5
5017; GFX908-NEXT:    v_bfe_u32 v2, v0, 16, 1
5018; GFX908-NEXT:    v_or_b32_e32 v3, 0x400000, v0
5019; GFX908-NEXT:    v_add3_u32 v2, v2, v0, s8
5020; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
5021; GFX908-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
5022; GFX908-NEXT:    v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
5023; GFX908-NEXT:    v_and_or_b32 v0, v1, s7, v0
5024; GFX908-NEXT:    v_mov_b32_e32 v3, v1
5025; GFX908-NEXT:    v_mov_b32_e32 v2, v0
5026; GFX908-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
5027; GFX908-NEXT:    s_waitcnt vmcnt(0)
5028; GFX908-NEXT:    buffer_wbinvl1
5029; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
5030; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5031; GFX908-NEXT:    v_mov_b32_e32 v1, v2
5032; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5033; GFX908-NEXT:    s_cbranch_execnz .LBB16_1
5034; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
5035; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
5036; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
5037; GFX908-NEXT:    s_setpc_b64 s[30:31]
5038;
5039; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
5040; GFX8:       ; %bb.0:
5041; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5042; GFX8-NEXT:    s_addk_i32 s20, 0x200
5043; GFX8-NEXT:    s_and_b32 s4, s20, -4
5044; GFX8-NEXT:    v_mov_b32_e32 v4, s4
5045; GFX8-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
5046; GFX8-NEXT:    s_and_b32 s4, s20, 3
5047; GFX8-NEXT:    s_lshl_b32 s6, s4, 3
5048; GFX8-NEXT:    s_lshl_b32 s4, 0xffff, s6
5049; GFX8-NEXT:    s_not_b32 s7, s4
5050; GFX8-NEXT:    s_mov_b64 s[4:5], 0
5051; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
5052; GFX8-NEXT:  .LBB16_1: ; %atomicrmw.start
5053; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
5054; GFX8-NEXT:    v_mov_b32_e32 v0, s6
5055; GFX8-NEXT:    s_waitcnt vmcnt(0)
5056; GFX8-NEXT:    v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
5057; GFX8-NEXT:    v_add_f32_e32 v3, v3, v5
5058; GFX8-NEXT:    v_bfe_u32 v6, v3, 16, 1
5059; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v3
5060; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x7fff, v6
5061; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v3
5062; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
5063; GFX8-NEXT:    v_cndmask_b32_e32 v3, v6, v7, vcc
5064; GFX8-NEXT:    v_and_b32_e32 v2, s7, v1
5065; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
5066; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
5067; GFX8-NEXT:    v_mov_b32_e32 v3, v1
5068; GFX8-NEXT:    v_mov_b32_e32 v2, v0
5069; GFX8-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
5070; GFX8-NEXT:    s_waitcnt vmcnt(0)
5071; GFX8-NEXT:    buffer_wbinvl1
5072; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
5073; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5074; GFX8-NEXT:    v_mov_b32_e32 v1, v2
5075; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5076; GFX8-NEXT:    s_cbranch_execnz .LBB16_1
5077; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
5078; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
5079; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
5080; GFX8-NEXT:    s_setpc_b64 s[30:31]
5081;
5082; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
5083; GFX7:       ; %bb.0:
5084; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5085; GFX7-NEXT:    s_addk_i32 s20, 0x200
5086; GFX7-NEXT:    s_and_b32 s4, s20, -4
5087; GFX7-NEXT:    v_mov_b32_e32 v4, s4
5088; GFX7-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
5089; GFX7-NEXT:    s_and_b32 s4, s20, 3
5090; GFX7-NEXT:    s_lshl_b32 s6, s4, 3
5091; GFX7-NEXT:    s_lshl_b32 s4, 0xffff, s6
5092; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
5093; GFX7-NEXT:    s_not_b32 s7, s4
5094; GFX7-NEXT:    s_mov_b64 s[4:5], 0
5095; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
5096; GFX7-NEXT:  .LBB16_1: ; %atomicrmw.start
5097; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
5098; GFX7-NEXT:    s_waitcnt vmcnt(0)
5099; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
5100; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
5101; GFX7-NEXT:    v_add_f32_e32 v0, v0, v5
5102; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
5103; GFX7-NEXT:    v_and_b32_e32 v2, s7, v1
5104; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
5105; GFX7-NEXT:    v_or_b32_e32 v0, v2, v0
5106; GFX7-NEXT:    v_mov_b32_e32 v3, v1
5107; GFX7-NEXT:    v_mov_b32_e32 v2, v0
5108; GFX7-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
5109; GFX7-NEXT:    s_waitcnt vmcnt(0)
5110; GFX7-NEXT:    buffer_wbinvl1
5111; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
5112; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5113; GFX7-NEXT:    v_mov_b32_e32 v1, v2
5114; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5115; GFX7-NEXT:    s_cbranch_execnz .LBB16_1
5116; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
5117; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
5118; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
5119; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
5120; GFX7-NEXT:    s_setpc_b64 s[30:31]
5121;
5122; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
5123; GFX6:       ; %bb.0:
5124; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5125; GFX6-NEXT:    s_addk_i32 s20, 0x200
5126; GFX6-NEXT:    s_and_b32 s4, s20, -4
5127; GFX6-NEXT:    v_mov_b32_e32 v4, s4
5128; GFX6-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
5129; GFX6-NEXT:    s_and_b32 s4, s20, 3
5130; GFX6-NEXT:    s_lshl_b32 s6, s4, 3
5131; GFX6-NEXT:    s_lshl_b32 s4, 0xffff, s6
5132; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v0
5133; GFX6-NEXT:    s_not_b32 s7, s4
5134; GFX6-NEXT:    s_mov_b64 s[4:5], 0
5135; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
5136; GFX6-NEXT:  .LBB16_1: ; %atomicrmw.start
5137; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
5138; GFX6-NEXT:    s_waitcnt vmcnt(0)
5139; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
5140; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
5141; GFX6-NEXT:    v_add_f32_e32 v0, v0, v5
5142; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
5143; GFX6-NEXT:    s_waitcnt expcnt(0)
5144; GFX6-NEXT:    v_and_b32_e32 v2, s7, v1
5145; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
5146; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
5147; GFX6-NEXT:    v_mov_b32_e32 v3, v1
5148; GFX6-NEXT:    v_mov_b32_e32 v2, v0
5149; GFX6-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
5150; GFX6-NEXT:    s_waitcnt vmcnt(0)
5151; GFX6-NEXT:    buffer_wbinvl1
5152; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
5153; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5154; GFX6-NEXT:    v_mov_b32_e32 v1, v2
5155; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5156; GFX6-NEXT:    s_cbranch_execnz .LBB16_1
5157; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
5158; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
5159; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
5160; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
5161; GFX6-NEXT:    s_waitcnt expcnt(0)
5162; GFX6-NEXT:    s_setpc_b64 s[30:31]
  ; 256 x bfloat (2 bytes each) puts the access at byte offset 512 from %ptr.
5163  %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256
  ; %result is the value that was in memory before the add; it is what the function returns.
5164  %result = atomicrmw fadd ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
5165  ret bfloat %result
5166}
5167
; Codegen test for an agent-scope, seq_cst `atomicrmw fadd` of a bfloat through
; a buffer fat pointer (addrspace 7) that is passed `inreg`. The access is at a
; constant offset of 256 bfloat elements, the old value is not used (the
; function returns void), and !amdgpu.no.fine.grained.memory is attached.
;
; In every check group below the operation shows up as a compare-and-swap loop
; (buffer_atomic_cmpswap) on the dword that contains the 16-bit value. The
; offset register has 0x200 added, is masked with -4 to form the dword address,
; and (offset & 3) << 3 is used as the shift that positions the 16-bit lane and
; its 0xffff mask.
;
; The gfx6 and gfx7 groups narrow the f32 sum with a plain 16-bit right shift.
; The other groups first run the v_bfe_u32 / 0x7fff / 0x400000 / v_cmp_u_f32
; sequence (presumably round-to-nearest-even with NaN quieting -- confirm).
;
; The assertions are autogenerated (see the NOTE on the first line of the file),
; so regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
5168define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 {
5169; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory:
5170; GFX12:       ; %bb.0:
5171; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
5172; GFX12-NEXT:    s_wait_expcnt 0x0
5173; GFX12-NEXT:    s_wait_samplecnt 0x0
5174; GFX12-NEXT:    s_wait_bvhcnt 0x0
5175; GFX12-NEXT:    s_wait_kmcnt 0x0
5176; GFX12-NEXT:    s_addk_co_i32 s16, 0x200
5177; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
5178; GFX12-NEXT:    s_wait_alu 0xfffe
5179; GFX12-NEXT:    s_and_b32 s4, s16, -4
5180; GFX12-NEXT:    s_wait_alu 0xfffe
5181; GFX12-NEXT:    v_mov_b32_e32 v2, s4
5182; GFX12-NEXT:    s_and_b32 s4, s16, 3
5183; GFX12-NEXT:    s_wait_alu 0xfffe
5184; GFX12-NEXT:    s_lshl_b32 s4, s4, 3
5185; GFX12-NEXT:    s_wait_alu 0xfffe
5186; GFX12-NEXT:    s_lshl_b32 s5, 0xffff, s4
5187; GFX12-NEXT:    buffer_load_b32 v1, v2, s[0:3], null offen
5188; GFX12-NEXT:    s_wait_alu 0xfffe
5189; GFX12-NEXT:    s_not_b32 s6, s5
5190; GFX12-NEXT:    s_mov_b32 s5, 0
5191; GFX12-NEXT:  .LBB17_1: ; %atomicrmw.start
5192; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
5193; GFX12-NEXT:    s_wait_loadcnt 0x0
5194; GFX12-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
5195; GFX12-NEXT:    s_wait_storecnt 0x0
5196; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5197; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
5198; GFX12-NEXT:    v_add_f32_e32 v0, v0, v3
5199; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
5200; GFX12-NEXT:    v_bfe_u32 v4, v0, 16, 1
5201; GFX12-NEXT:    v_or_b32_e32 v5, 0x400000, v0
5202; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
5203; GFX12-NEXT:    v_add3_u32 v4, v4, v0, 0x7fff
5204; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5205; GFX12-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc_lo
5206; GFX12-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
5207; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
5208; GFX12-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
5209; GFX12-NEXT:    s_wait_alu 0xfffe
5210; GFX12-NEXT:    v_and_or_b32 v0, v1, s6, v0
5211; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5212; GFX12-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
5213; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
5214; GFX12-NEXT:    s_wait_loadcnt 0x0
5215; GFX12-NEXT:    global_inv scope:SCOPE_DEV
5216; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
5217; GFX12-NEXT:    v_mov_b32_e32 v1, v4
5218; GFX12-NEXT:    s_or_b32 s5, vcc_lo, s5
5219; GFX12-NEXT:    s_wait_alu 0xfffe
5220; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
5221; GFX12-NEXT:    s_cbranch_execnz .LBB17_1
5222; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
5223; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s5
5224; GFX12-NEXT:    s_wait_alu 0xfffe
5225; GFX12-NEXT:    s_setpc_b64 s[30:31]
5226;
5227; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory:
5228; GFX940:       ; %bb.0:
5229; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5230; GFX940-NEXT:    s_addk_i32 s16, 0x200
5231; GFX940-NEXT:    s_and_b32 s4, s16, -4
5232; GFX940-NEXT:    v_mov_b32_e32 v2, s4
5233; GFX940-NEXT:    buffer_load_dword v1, v2, s[0:3], 0 offen
5234; GFX940-NEXT:    s_and_b32 s4, s16, 3
5235; GFX940-NEXT:    s_lshl_b32 s6, s4, 3
5236; GFX940-NEXT:    s_lshl_b32 s4, 0xffff, s6
5237; GFX940-NEXT:    s_not_b32 s7, s4
5238; GFX940-NEXT:    s_mov_b64 s[4:5], 0
5239; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
5240; GFX940-NEXT:    s_movk_i32 s8, 0x7fff
5241; GFX940-NEXT:  .LBB17_1: ; %atomicrmw.start
5242; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
5243; GFX940-NEXT:    s_waitcnt vmcnt(0)
5244; GFX940-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
5245; GFX940-NEXT:    buffer_wbl2 sc1
5246; GFX940-NEXT:    v_add_f32_e32 v0, v0, v3
5247; GFX940-NEXT:    v_bfe_u32 v4, v0, 16, 1
5248; GFX940-NEXT:    v_or_b32_e32 v5, 0x400000, v0
5249; GFX940-NEXT:    v_add3_u32 v4, v4, v0, s8
5250; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
5251; GFX940-NEXT:    s_nop 1
5252; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
5253; GFX940-NEXT:    v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
5254; GFX940-NEXT:    v_and_or_b32 v0, v1, s7, v0
5255; GFX940-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
5256; GFX940-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0
5257; GFX940-NEXT:    s_waitcnt vmcnt(0)
5258; GFX940-NEXT:    buffer_inv sc1
5259; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
5260; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5261; GFX940-NEXT:    v_mov_b32_e32 v1, v4
5262; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5263; GFX940-NEXT:    s_cbranch_execnz .LBB17_1
5264; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
5265; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
5266; GFX940-NEXT:    s_setpc_b64 s[30:31]
5267;
5268; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory:
5269; GFX11:       ; %bb.0:
5270; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5271; GFX11-NEXT:    s_addk_i32 s16, 0x200
5272; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
5273; GFX11-NEXT:    s_and_b32 s4, s16, -4
5274; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
5275; GFX11-NEXT:    v_mov_b32_e32 v2, s4
5276; GFX11-NEXT:    s_and_b32 s4, s16, 3
5277; GFX11-NEXT:    s_lshl_b32 s4, s4, 3
5278; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
5279; GFX11-NEXT:    s_lshl_b32 s5, 0xffff, s4
5280; GFX11-NEXT:    buffer_load_b32 v1, v2, s[0:3], 0 offen
5281; GFX11-NEXT:    s_not_b32 s6, s5
5282; GFX11-NEXT:    s_mov_b32 s5, 0
5283; GFX11-NEXT:    .p2align 6
5284; GFX11-NEXT:  .LBB17_1: ; %atomicrmw.start
5285; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
5286; GFX11-NEXT:    s_waitcnt vmcnt(0)
5287; GFX11-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
5288; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
5289; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5290; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
5291; GFX11-NEXT:    v_add_f32_e32 v0, v0, v3
5292; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
5293; GFX11-NEXT:    v_bfe_u32 v4, v0, 16, 1
5294; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v0
5295; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
5296; GFX11-NEXT:    v_add3_u32 v4, v4, v0, 0x7fff
5297; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5298; GFX11-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc_lo
5299; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
5300; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5301; GFX11-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
5302; GFX11-NEXT:    v_and_or_b32 v0, v1, s6, v0
5303; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5304; GFX11-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
5305; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
5306; GFX11-NEXT:    s_waitcnt vmcnt(0)
5307; GFX11-NEXT:    buffer_gl1_inv
5308; GFX11-NEXT:    buffer_gl0_inv
5309; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
5310; GFX11-NEXT:    v_mov_b32_e32 v1, v4
5311; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
5312; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
5313; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
5314; GFX11-NEXT:    s_cbranch_execnz .LBB17_1
5315; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
5316; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s5
5317; GFX11-NEXT:    s_setpc_b64 s[30:31]
5318;
5319; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory:
5320; GFX10:       ; %bb.0:
5321; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5322; GFX10-NEXT:    s_addk_i32 s20, 0x200
5323; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
5324; GFX10-NEXT:    s_and_b32 s4, s20, -4
5325; GFX10-NEXT:    v_mov_b32_e32 v2, s4
5326; GFX10-NEXT:    s_and_b32 s4, s20, 3
5327; GFX10-NEXT:    s_lshl_b32 s4, s4, 3
5328; GFX10-NEXT:    s_lshl_b32 s5, 0xffff, s4
5329; GFX10-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
5330; GFX10-NEXT:    s_not_b32 s6, s5
5331; GFX10-NEXT:    s_mov_b32 s5, 0
5332; GFX10-NEXT:  .LBB17_1: ; %atomicrmw.start
5333; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
5334; GFX10-NEXT:    s_waitcnt vmcnt(0)
5335; GFX10-NEXT:    v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
5336; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
5337; GFX10-NEXT:    v_add_f32_e32 v0, v0, v3
5338; GFX10-NEXT:    v_bfe_u32 v4, v0, 16, 1
5339; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v0
5340; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
5341; GFX10-NEXT:    v_add3_u32 v4, v4, v0, 0x7fff
5342; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc_lo
5343; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
5344; GFX10-NEXT:    v_and_or_b32 v0, v1, s6, v0
5345; GFX10-NEXT:    v_mov_b32_e32 v5, v1
5346; GFX10-NEXT:    v_mov_b32_e32 v4, v0
5347; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
5348; GFX10-NEXT:    s_waitcnt vmcnt(0)
5349; GFX10-NEXT:    buffer_gl1_inv
5350; GFX10-NEXT:    buffer_gl0_inv
5351; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
5352; GFX10-NEXT:    v_mov_b32_e32 v1, v4
5353; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
5354; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
5355; GFX10-NEXT:    s_cbranch_execnz .LBB17_1
5356; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
5357; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
5358; GFX10-NEXT:    s_setpc_b64 s[30:31]
5359;
5360; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory:
5361; GFX90A:       ; %bb.0:
5362; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5363; GFX90A-NEXT:    s_addk_i32 s20, 0x200
5364; GFX90A-NEXT:    s_and_b32 s4, s20, -4
5365; GFX90A-NEXT:    v_mov_b32_e32 v2, s4
5366; GFX90A-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
5367; GFX90A-NEXT:    s_and_b32 s4, s20, 3
5368; GFX90A-NEXT:    s_lshl_b32 s6, s4, 3
5369; GFX90A-NEXT:    s_lshl_b32 s4, 0xffff, s6
5370; GFX90A-NEXT:    s_not_b32 s7, s4
5371; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
5372; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
5373; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
5374; GFX90A-NEXT:  .LBB17_1: ; %atomicrmw.start
5375; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
5376; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5377; GFX90A-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
5378; GFX90A-NEXT:    v_add_f32_e32 v0, v0, v3
5379; GFX90A-NEXT:    v_bfe_u32 v4, v0, 16, 1
5380; GFX90A-NEXT:    v_or_b32_e32 v5, 0x400000, v0
5381; GFX90A-NEXT:    v_add3_u32 v4, v4, v0, s8
5382; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
5383; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
5384; GFX90A-NEXT:    v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
5385; GFX90A-NEXT:    v_and_or_b32 v0, v1, s7, v0
5386; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
5387; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
5388; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5389; GFX90A-NEXT:    buffer_wbinvl1
5390; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
5391; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5392; GFX90A-NEXT:    v_mov_b32_e32 v1, v4
5393; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5394; GFX90A-NEXT:    s_cbranch_execnz .LBB17_1
5395; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
5396; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
5397; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5398;
5399; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory:
5400; GFX908:       ; %bb.0:
5401; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5402; GFX908-NEXT:    s_addk_i32 s20, 0x200
5403; GFX908-NEXT:    s_and_b32 s4, s20, -4
5404; GFX908-NEXT:    v_mov_b32_e32 v2, s4
5405; GFX908-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
5406; GFX908-NEXT:    s_and_b32 s4, s20, 3
5407; GFX908-NEXT:    s_lshl_b32 s6, s4, 3
5408; GFX908-NEXT:    s_lshl_b32 s4, 0xffff, s6
5409; GFX908-NEXT:    s_not_b32 s7, s4
5410; GFX908-NEXT:    s_mov_b64 s[4:5], 0
5411; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
5412; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
5413; GFX908-NEXT:  .LBB17_1: ; %atomicrmw.start
5414; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
5415; GFX908-NEXT:    s_waitcnt vmcnt(0)
5416; GFX908-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
5417; GFX908-NEXT:    v_add_f32_e32 v0, v0, v3
5418; GFX908-NEXT:    v_bfe_u32 v4, v0, 16, 1
5419; GFX908-NEXT:    v_or_b32_e32 v5, 0x400000, v0
5420; GFX908-NEXT:    v_add3_u32 v4, v4, v0, s8
5421; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
5422; GFX908-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
5423; GFX908-NEXT:    v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
5424; GFX908-NEXT:    v_and_or_b32 v0, v1, s7, v0
5425; GFX908-NEXT:    v_mov_b32_e32 v5, v1
5426; GFX908-NEXT:    v_mov_b32_e32 v4, v0
5427; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
5428; GFX908-NEXT:    s_waitcnt vmcnt(0)
5429; GFX908-NEXT:    buffer_wbinvl1
5430; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
5431; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5432; GFX908-NEXT:    v_mov_b32_e32 v1, v4
5433; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5434; GFX908-NEXT:    s_cbranch_execnz .LBB17_1
5435; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
5436; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
5437; GFX908-NEXT:    s_setpc_b64 s[30:31]
5438;
5439; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory:
5440; GFX8:       ; %bb.0:
5441; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5442; GFX8-NEXT:    s_addk_i32 s20, 0x200
5443; GFX8-NEXT:    s_and_b32 s4, s20, -4
5444; GFX8-NEXT:    v_mov_b32_e32 v2, s4
5445; GFX8-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
5446; GFX8-NEXT:    s_and_b32 s4, s20, 3
5447; GFX8-NEXT:    s_lshl_b32 s6, s4, 3
5448; GFX8-NEXT:    s_lshl_b32 s4, 0xffff, s6
5449; GFX8-NEXT:    s_not_b32 s7, s4
5450; GFX8-NEXT:    s_mov_b64 s[4:5], 0
5451; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
5452; GFX8-NEXT:  .LBB17_1: ; %atomicrmw.start
5453; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
5454; GFX8-NEXT:    v_mov_b32_e32 v0, s6
5455; GFX8-NEXT:    s_waitcnt vmcnt(0)
5456; GFX8-NEXT:    v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
5457; GFX8-NEXT:    v_add_f32_e32 v5, v5, v3
5458; GFX8-NEXT:    v_bfe_u32 v6, v5, 16, 1
5459; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v5
5460; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x7fff, v6
5461; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v5
5462; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
5463; GFX8-NEXT:    v_cndmask_b32_e32 v5, v6, v7, vcc
5464; GFX8-NEXT:    v_and_b32_e32 v4, s7, v1
5465; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
5466; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
5467; GFX8-NEXT:    v_mov_b32_e32 v5, v1
5468; GFX8-NEXT:    v_mov_b32_e32 v4, v0
5469; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
5470; GFX8-NEXT:    s_waitcnt vmcnt(0)
5471; GFX8-NEXT:    buffer_wbinvl1
5472; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
5473; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5474; GFX8-NEXT:    v_mov_b32_e32 v1, v4
5475; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5476; GFX8-NEXT:    s_cbranch_execnz .LBB17_1
5477; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
5478; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
5479; GFX8-NEXT:    s_setpc_b64 s[30:31]
5480;
5481; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory:
5482; GFX7:       ; %bb.0:
5483; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5484; GFX7-NEXT:    s_addk_i32 s20, 0x200
5485; GFX7-NEXT:    s_and_b32 s4, s20, -4
5486; GFX7-NEXT:    v_mov_b32_e32 v2, s4
5487; GFX7-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
5488; GFX7-NEXT:    s_and_b32 s4, s20, 3
5489; GFX7-NEXT:    s_lshl_b32 s6, s4, 3
5490; GFX7-NEXT:    s_lshl_b32 s4, 0xffff, s6
5491; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
5492; GFX7-NEXT:    s_not_b32 s7, s4
5493; GFX7-NEXT:    s_mov_b64 s[4:5], 0
5494; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
5495; GFX7-NEXT:  .LBB17_1: ; %atomicrmw.start
5496; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
5497; GFX7-NEXT:    s_waitcnt vmcnt(0)
5498; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
5499; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
5500; GFX7-NEXT:    v_add_f32_e32 v0, v0, v3
5501; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
5502; GFX7-NEXT:    v_and_b32_e32 v4, s7, v1
5503; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
5504; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
5505; GFX7-NEXT:    v_mov_b32_e32 v5, v1
5506; GFX7-NEXT:    v_mov_b32_e32 v4, v0
5507; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
5508; GFX7-NEXT:    s_waitcnt vmcnt(0)
5509; GFX7-NEXT:    buffer_wbinvl1
5510; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
5511; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5512; GFX7-NEXT:    v_mov_b32_e32 v1, v4
5513; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5514; GFX7-NEXT:    s_cbranch_execnz .LBB17_1
5515; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
5516; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
5517; GFX7-NEXT:    s_setpc_b64 s[30:31]
5518;
5519; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory:
5520; GFX6:       ; %bb.0:
5521; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5522; GFX6-NEXT:    s_addk_i32 s20, 0x200
5523; GFX6-NEXT:    s_and_b32 s4, s20, -4
5524; GFX6-NEXT:    v_mov_b32_e32 v2, s4
5525; GFX6-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
5526; GFX6-NEXT:    s_and_b32 s4, s20, 3
5527; GFX6-NEXT:    s_lshl_b32 s6, s4, 3
5528; GFX6-NEXT:    s_lshl_b32 s4, 0xffff, s6
5529; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v0
5530; GFX6-NEXT:    s_not_b32 s7, s4
5531; GFX6-NEXT:    s_mov_b64 s[4:5], 0
5532; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
5533; GFX6-NEXT:  .LBB17_1: ; %atomicrmw.start
5534; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
5535; GFX6-NEXT:    s_waitcnt vmcnt(0)
5536; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
5537; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
5538; GFX6-NEXT:    v_add_f32_e32 v0, v0, v3
5539; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
5540; GFX6-NEXT:    s_waitcnt expcnt(0)
5541; GFX6-NEXT:    v_and_b32_e32 v4, s7, v1
5542; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
5543; GFX6-NEXT:    v_or_b32_e32 v0, v4, v0
5544; GFX6-NEXT:    v_mov_b32_e32 v5, v1
5545; GFX6-NEXT:    v_mov_b32_e32 v4, v0
5546; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
5547; GFX6-NEXT:    s_waitcnt vmcnt(0)
5548; GFX6-NEXT:    buffer_wbinvl1
5549; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
5550; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5551; GFX6-NEXT:    v_mov_b32_e32 v1, v4
5552; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5553; GFX6-NEXT:    s_cbranch_execnz .LBB17_1
5554; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
5555; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
5556; GFX6-NEXT:    s_waitcnt expcnt(0)
5557; GFX6-NEXT:    s_setpc_b64 s[30:31]
  ; Step 256 bfloat elements past %ptr; this is the 0x200 added to the offset
  ; register in the check lines above.
5558  %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256
  ; The value produced by the atomic is intentionally left unused, since this is
  ; the no-return variant and the function returns void.
5559  %unused = atomicrmw fadd ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
5560  ret void
5561}
5562
5563define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, bfloat %val) #0 {
5564; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
5565; GFX12:       ; %bb.0:
5566; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
5567; GFX12-NEXT:    s_wait_expcnt 0x0
5568; GFX12-NEXT:    s_wait_samplecnt 0x0
5569; GFX12-NEXT:    s_wait_bvhcnt 0x0
5570; GFX12-NEXT:    s_wait_kmcnt 0x0
5571; GFX12-NEXT:    v_add_nc_u32_e32 v4, 0x200, v4
5572; GFX12-NEXT:    s_mov_b32 s1, exec_lo
5573; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
5574; GFX12-NEXT:    v_and_b32_e32 v6, 3, v4
5575; GFX12-NEXT:    v_and_b32_e32 v8, -4, v4
5576; GFX12-NEXT:    v_lshlrev_b32_e32 v7, 3, v6
5577; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5578; GFX12-NEXT:    v_lshlrev_b32_e64 v6, v7, 0xffff
5579; GFX12-NEXT:    v_not_b32_e32 v9, v6
5580; GFX12-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
5581; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
5582; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
5583; GFX12-NEXT:    v_readfirstlane_b32 s6, v2
5584; GFX12-NEXT:    v_readfirstlane_b32 s7, v3
5585; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
5586; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
5587; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
5588; GFX12-NEXT:    s_wait_alu 0xfffe
5589; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5590; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
5591; GFX12-NEXT:    s_wait_alu 0xfffe
5592; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
5593; GFX12-NEXT:    s_wait_loadcnt 0x0
5594; GFX12-NEXT:    buffer_load_b32 v6, v8, s[4:7], null offen
5595; GFX12-NEXT:    s_wait_alu 0xfffe
5596; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
5597; GFX12-NEXT:    s_cbranch_execnz .LBB18_1
5598; GFX12-NEXT:  ; %bb.2:
5599; GFX12-NEXT:    s_mov_b32 exec_lo, s1
5600; GFX12-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
5601; GFX12-NEXT:    s_mov_b32 s1, 0
5602; GFX12-NEXT:  .LBB18_3: ; %atomicrmw.start
5603; GFX12-NEXT:    ; =>This Loop Header: Depth=1
5604; GFX12-NEXT:    ; Child Loop BB18_4 Depth 2
5605; GFX12-NEXT:    s_wait_loadcnt 0x0
5606; GFX12-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
5607; GFX12-NEXT:    s_mov_b32 s2, exec_lo
5608; GFX12-NEXT:    s_wait_storecnt 0x0
5609; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5610; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
5611; GFX12-NEXT:    v_add_f32_e32 v4, v4, v10
5612; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
5613; GFX12-NEXT:    v_bfe_u32 v5, v4, 16, 1
5614; GFX12-NEXT:    v_or_b32_e32 v11, 0x400000, v4
5615; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
5616; GFX12-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
5617; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5618; GFX12-NEXT:    v_cndmask_b32_e32 v4, v5, v11, vcc_lo
5619; GFX12-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
5620; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5621; GFX12-NEXT:    v_lshlrev_b32_e32 v4, v7, v4
5622; GFX12-NEXT:    v_and_or_b32 v5, v6, v9, v4
5623; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5624; GFX12-NEXT:    v_mov_b32_e32 v4, v5
5625; GFX12-NEXT:    v_mov_b32_e32 v5, v6
5626; GFX12-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
5627; GFX12-NEXT:    ; => This Inner Loop Header: Depth=2
5628; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
5629; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
5630; GFX12-NEXT:    v_readfirstlane_b32 s6, v2
5631; GFX12-NEXT:    v_readfirstlane_b32 s7, v3
5632; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
5633; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
5634; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
5635; GFX12-NEXT:    s_wait_alu 0xfffe
5636; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5637; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
5638; GFX12-NEXT:    s_wait_alu 0xfffe
5639; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
5640; GFX12-NEXT:    s_wait_loadcnt 0x0
5641; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
5642; GFX12-NEXT:    s_wait_alu 0xfffe
5643; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
5644; GFX12-NEXT:    s_cbranch_execnz .LBB18_4
5645; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
5646; GFX12-NEXT:    s_mov_b32 exec_lo, s2
5647; GFX12-NEXT:    s_wait_loadcnt 0x0
5648; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
5649; GFX12-NEXT:    v_mov_b32_e32 v6, v4
5650; GFX12-NEXT:    global_inv scope:SCOPE_DEV
5651; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
5652; GFX12-NEXT:    s_wait_alu 0xfffe
5653; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
5654; GFX12-NEXT:    s_cbranch_execnz .LBB18_3
5655; GFX12-NEXT:  ; %bb.6: ; %atomicrmw.end
5656; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
5657; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
5658; GFX12-NEXT:    s_wait_alu 0xfffe
5659; GFX12-NEXT:    s_setpc_b64 s[30:31]
5660;
5661; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
5662; GFX940:       ; %bb.0:
5663; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5664; GFX940-NEXT:    v_add_u32_e32 v4, 0x200, v4
5665; GFX940-NEXT:    v_and_b32_e32 v9, -4, v4
5666; GFX940-NEXT:    v_and_b32_e32 v4, 3, v4
5667; GFX940-NEXT:    v_lshlrev_b32_e32 v8, 3, v4
5668; GFX940-NEXT:    s_mov_b32 s0, 0xffff
5669; GFX940-NEXT:    v_lshlrev_b32_e64 v4, v8, s0
5670; GFX940-NEXT:    v_not_b32_e32 v10, v4
5671; GFX940-NEXT:    s_mov_b64 s[2:3], exec
5672; GFX940-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
5673; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
5674; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
5675; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
5676; GFX940-NEXT:    v_readfirstlane_b32 s7, v3
5677; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
5678; GFX940-NEXT:    s_nop 0
5679; GFX940-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
5680; GFX940-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
5681; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
5682; GFX940-NEXT:    buffer_load_dword v7, v9, s[4:7], 0 offen
5683; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
5684; GFX940-NEXT:    s_cbranch_execnz .LBB18_1
5685; GFX940-NEXT:  ; %bb.2:
5686; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
5687; GFX940-NEXT:    s_mov_b64 s[2:3], 0
5688; GFX940-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
5689; GFX940-NEXT:    s_movk_i32 s10, 0x7fff
5690; GFX940-NEXT:  .LBB18_3: ; %atomicrmw.start
5691; GFX940-NEXT:    ; =>This Loop Header: Depth=1
5692; GFX940-NEXT:    ; Child Loop BB18_4 Depth 2
5693; GFX940-NEXT:    s_waitcnt vmcnt(0)
5694; GFX940-NEXT:    v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
5695; GFX940-NEXT:    s_mov_b64 s[8:9], exec
5696; GFX940-NEXT:    v_add_f32_e32 v4, v4, v11
5697; GFX940-NEXT:    v_bfe_u32 v5, v4, 16, 1
5698; GFX940-NEXT:    v_add3_u32 v5, v5, v4, s10
5699; GFX940-NEXT:    v_or_b32_e32 v6, 0x400000, v4
5700; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
5701; GFX940-NEXT:    buffer_wbl2 sc1
5702; GFX940-NEXT:    s_nop 0
5703; GFX940-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
5704; GFX940-NEXT:    v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
5705; GFX940-NEXT:    v_and_or_b32 v6, v7, v10, v4
5706; GFX940-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
5707; GFX940-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
5708; GFX940-NEXT:    ; => This Inner Loop Header: Depth=2
5709; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
5710; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
5711; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
5712; GFX940-NEXT:    v_readfirstlane_b32 s7, v3
5713; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
5714; GFX940-NEXT:    s_nop 0
5715; GFX940-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
5716; GFX940-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
5717; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
5718; GFX940-NEXT:    s_waitcnt vmcnt(0)
5719; GFX940-NEXT:    buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0
5720; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
5721; GFX940-NEXT:    s_cbranch_execnz .LBB18_4
5722; GFX940-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
5723; GFX940-NEXT:    s_mov_b64 exec, s[8:9]
5724; GFX940-NEXT:    s_waitcnt vmcnt(0)
5725; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
5726; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
5727; GFX940-NEXT:    v_mov_b32_e32 v7, v4
5728; GFX940-NEXT:    buffer_inv sc1
5729; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
5730; GFX940-NEXT:    s_cbranch_execnz .LBB18_3
5731; GFX940-NEXT:  ; %bb.6: ; %atomicrmw.end
5732; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
5733; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v8, v4
5734; GFX940-NEXT:    s_setpc_b64 s[30:31]
5735;
5736; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
5737; GFX11:       ; %bb.0:
5738; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5739; GFX11-NEXT:    v_add_nc_u32_e32 v4, 0x200, v4
5740; GFX11-NEXT:    s_mov_b32 s1, 0
5741; GFX11-NEXT:    s_mov_b32 s2, exec_lo
5742; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
5743; GFX11-NEXT:    v_and_b32_e32 v6, 3, v4
5744; GFX11-NEXT:    v_and_b32_e32 v8, -4, v4
5745; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 3, v6
5746; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5747; GFX11-NEXT:    v_lshlrev_b32_e64 v6, v7, 0xffff
5748; GFX11-NEXT:    v_not_b32_e32 v9, v6
5749; GFX11-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
5750; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
5751; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
5752; GFX11-NEXT:    v_readfirstlane_b32 s6, v2
5753; GFX11-NEXT:    v_readfirstlane_b32 s7, v3
5754; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
5755; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
5756; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
5757; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
5758; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
5759; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
5760; GFX11-NEXT:    buffer_load_b32 v6, v8, s[4:7], 0 offen
5761; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
5762; GFX11-NEXT:    s_cbranch_execnz .LBB18_1
5763; GFX11-NEXT:  ; %bb.2:
5764; GFX11-NEXT:    s_mov_b32 exec_lo, s2
5765; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
5766; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
5767; GFX11-NEXT:    .p2align 6
5768; GFX11-NEXT:  .LBB18_3: ; %atomicrmw.start
5769; GFX11-NEXT:    ; =>This Loop Header: Depth=1
5770; GFX11-NEXT:    ; Child Loop BB18_4 Depth 2
5771; GFX11-NEXT:    s_waitcnt vmcnt(0)
5772; GFX11-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
5773; GFX11-NEXT:    s_mov_b32 s2, exec_lo
5774; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
5775; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5776; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
5777; GFX11-NEXT:    v_add_f32_e32 v4, v4, v10
5778; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
5779; GFX11-NEXT:    v_bfe_u32 v5, v4, 16, 1
5780; GFX11-NEXT:    v_or_b32_e32 v11, 0x400000, v4
5781; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
5782; GFX11-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
5783; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5784; GFX11-NEXT:    v_cndmask_b32_e32 v4, v5, v11, vcc_lo
5785; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
5786; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5787; GFX11-NEXT:    v_lshlrev_b32_e32 v4, v7, v4
5788; GFX11-NEXT:    v_and_or_b32 v5, v6, v9, v4
5789; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5790; GFX11-NEXT:    v_mov_b32_e32 v4, v5
5791; GFX11-NEXT:    v_mov_b32_e32 v5, v6
5792; GFX11-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
5793; GFX11-NEXT:    ; => This Inner Loop Header: Depth=2
5794; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
5795; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
5796; GFX11-NEXT:    v_readfirstlane_b32 s6, v2
5797; GFX11-NEXT:    v_readfirstlane_b32 s7, v3
5798; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
5799; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
5800; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
5801; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
5802; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
5803; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
5804; GFX11-NEXT:    s_waitcnt vmcnt(0)
5805; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
5806; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
5807; GFX11-NEXT:    s_cbranch_execnz .LBB18_4
5808; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
5809; GFX11-NEXT:    s_mov_b32 exec_lo, s2
5810; GFX11-NEXT:    s_waitcnt vmcnt(0)
5811; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
5812; GFX11-NEXT:    v_mov_b32_e32 v6, v4
5813; GFX11-NEXT:    buffer_gl1_inv
5814; GFX11-NEXT:    buffer_gl0_inv
5815; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
5816; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
5817; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
5818; GFX11-NEXT:    s_cbranch_execnz .LBB18_3
5819; GFX11-NEXT:  ; %bb.6: ; %atomicrmw.end
5820; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
5821; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
5822; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
5823; GFX11-NEXT:    s_setpc_b64 s[30:31]
5824;
5825; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
5826; GFX10:       ; %bb.0:
5827; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5828; GFX10-NEXT:    v_add_nc_u32_e32 v4, 0x200, v4
5829; GFX10-NEXT:    s_mov_b32 s5, 0
5830; GFX10-NEXT:    s_mov_b32 s6, exec_lo
5831; GFX10-NEXT:    v_and_b32_e32 v6, 3, v4
5832; GFX10-NEXT:    v_and_b32_e32 v8, -4, v4
5833; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 3, v6
5834; GFX10-NEXT:    v_lshlrev_b32_e64 v6, v7, 0xffff
5835; GFX10-NEXT:    v_not_b32_e32 v9, v6
5836; GFX10-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
5837; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
5838; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
5839; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
5840; GFX10-NEXT:    v_readfirstlane_b32 s11, v3
5841; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
5842; GFX10-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
5843; GFX10-NEXT:    s_and_b32 s4, vcc_lo, s4
5844; GFX10-NEXT:    s_and_saveexec_b32 s4, s4
5845; GFX10-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
5846; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
5847; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
5848; GFX10-NEXT:    s_cbranch_execnz .LBB18_1
5849; GFX10-NEXT:  ; %bb.2:
5850; GFX10-NEXT:    s_mov_b32 exec_lo, s6
5851; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
5852; GFX10-NEXT:  .LBB18_3: ; %atomicrmw.start
5853; GFX10-NEXT:    ; =>This Loop Header: Depth=1
5854; GFX10-NEXT:    ; Child Loop BB18_4 Depth 2
5855; GFX10-NEXT:    s_waitcnt vmcnt(0)
5856; GFX10-NEXT:    v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
5857; GFX10-NEXT:    s_mov_b32 s6, exec_lo
5858; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
5859; GFX10-NEXT:    v_add_f32_e32 v4, v4, v10
5860; GFX10-NEXT:    v_bfe_u32 v5, v4, 16, 1
5861; GFX10-NEXT:    v_or_b32_e32 v11, 0x400000, v4
5862; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
5863; GFX10-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
5864; GFX10-NEXT:    v_cndmask_b32_e32 v4, v5, v11, vcc_lo
5865; GFX10-NEXT:    v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
5866; GFX10-NEXT:    v_and_or_b32 v5, v6, v9, v4
5867; GFX10-NEXT:    v_mov_b32_e32 v4, v5
5868; GFX10-NEXT:    v_mov_b32_e32 v5, v6
5869; GFX10-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
5870; GFX10-NEXT:    ; => This Inner Loop Header: Depth=2
5871; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
5872; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
5873; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
5874; GFX10-NEXT:    v_readfirstlane_b32 s11, v3
5875; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
5876; GFX10-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
5877; GFX10-NEXT:    s_and_b32 s4, vcc_lo, s4
5878; GFX10-NEXT:    s_and_saveexec_b32 s4, s4
5879; GFX10-NEXT:    s_waitcnt vmcnt(0)
5880; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
5881; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
5882; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
5883; GFX10-NEXT:    s_cbranch_execnz .LBB18_4
5884; GFX10-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
5885; GFX10-NEXT:    s_mov_b32 exec_lo, s6
5886; GFX10-NEXT:    s_waitcnt vmcnt(0)
5887; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
5888; GFX10-NEXT:    v_mov_b32_e32 v6, v4
5889; GFX10-NEXT:    buffer_gl1_inv
5890; GFX10-NEXT:    buffer_gl0_inv
5891; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
5892; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
5893; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
5894; GFX10-NEXT:    s_cbranch_execnz .LBB18_3
5895; GFX10-NEXT:  ; %bb.6: ; %atomicrmw.end
5896; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
5897; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
5898; GFX10-NEXT:    s_setpc_b64 s[30:31]
5899;
5900; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
5901; GFX90A:       ; %bb.0:
5902; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5903; GFX90A-NEXT:    v_add_u32_e32 v4, 0x200, v4
5904; GFX90A-NEXT:    v_and_b32_e32 v9, -4, v4
5905; GFX90A-NEXT:    v_and_b32_e32 v4, 3, v4
5906; GFX90A-NEXT:    v_lshlrev_b32_e32 v8, 3, v4
5907; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
5908; GFX90A-NEXT:    v_lshlrev_b32_e64 v4, v8, s4
5909; GFX90A-NEXT:    v_not_b32_e32 v10, v4
5910; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
5911; GFX90A-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
5912; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
5913; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
5914; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
5915; GFX90A-NEXT:    v_readfirstlane_b32 s11, v3
5916; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
5917; GFX90A-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
5918; GFX90A-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
5919; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
5920; GFX90A-NEXT:    s_nop 0
5921; GFX90A-NEXT:    buffer_load_dword v7, v9, s[8:11], 0 offen
5922; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
5923; GFX90A-NEXT:    s_cbranch_execnz .LBB18_1
5924; GFX90A-NEXT:  ; %bb.2:
5925; GFX90A-NEXT:    s_mov_b64 exec, s[6:7]
5926; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
5927; GFX90A-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
5928; GFX90A-NEXT:    s_movk_i32 s14, 0x7fff
5929; GFX90A-NEXT:  .LBB18_3: ; %atomicrmw.start
5930; GFX90A-NEXT:    ; =>This Loop Header: Depth=1
5931; GFX90A-NEXT:    ; Child Loop BB18_4 Depth 2
5932; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5933; GFX90A-NEXT:    v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
5934; GFX90A-NEXT:    v_add_f32_e32 v4, v4, v11
5935; GFX90A-NEXT:    v_bfe_u32 v5, v4, 16, 1
5936; GFX90A-NEXT:    v_add3_u32 v5, v5, v4, s14
5937; GFX90A-NEXT:    v_or_b32_e32 v6, 0x400000, v4
5938; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
5939; GFX90A-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
5940; GFX90A-NEXT:    v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
5941; GFX90A-NEXT:    v_and_or_b32 v6, v7, v10, v4
5942; GFX90A-NEXT:    s_mov_b64 s[12:13], exec
5943; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
5944; GFX90A-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
5945; GFX90A-NEXT:    ; => This Inner Loop Header: Depth=2
5946; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
5947; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
5948; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
5949; GFX90A-NEXT:    v_readfirstlane_b32 s11, v3
5950; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
5951; GFX90A-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
5952; GFX90A-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
5953; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
5954; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5955; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc
5956; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
5957; GFX90A-NEXT:    s_cbranch_execnz .LBB18_4
5958; GFX90A-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
5959; GFX90A-NEXT:    s_mov_b64 exec, s[12:13]
5960; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5961; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
5962; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
5963; GFX90A-NEXT:    v_mov_b32_e32 v7, v4
5964; GFX90A-NEXT:    buffer_wbinvl1
5965; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
5966; GFX90A-NEXT:    s_cbranch_execnz .LBB18_3
5967; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.end
5968; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
5969; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v8, v4
5970; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5971;
5972; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
5973; GFX908:       ; %bb.0:
5974; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5975; GFX908-NEXT:    v_add_u32_e32 v4, 0x200, v4
5976; GFX908-NEXT:    v_and_b32_e32 v8, -4, v4
5977; GFX908-NEXT:    v_and_b32_e32 v4, 3, v4
5978; GFX908-NEXT:    v_lshlrev_b32_e32 v7, 3, v4
5979; GFX908-NEXT:    s_mov_b32 s4, 0xffff
5980; GFX908-NEXT:    v_lshlrev_b32_e64 v4, v7, s4
5981; GFX908-NEXT:    v_not_b32_e32 v9, v4
5982; GFX908-NEXT:    s_mov_b64 s[6:7], exec
5983; GFX908-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
5984; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
5985; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
5986; GFX908-NEXT:    v_readfirstlane_b32 s10, v2
5987; GFX908-NEXT:    v_readfirstlane_b32 s11, v3
5988; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
5989; GFX908-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
5990; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
5991; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
5992; GFX908-NEXT:    s_nop 0
5993; GFX908-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
5994; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
5995; GFX908-NEXT:    s_cbranch_execnz .LBB18_1
5996; GFX908-NEXT:  ; %bb.2:
5997; GFX908-NEXT:    s_mov_b64 exec, s[6:7]
5998; GFX908-NEXT:    s_mov_b64 s[6:7], 0
5999; GFX908-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
6000; GFX908-NEXT:    s_movk_i32 s14, 0x7fff
6001; GFX908-NEXT:  .LBB18_3: ; %atomicrmw.start
6002; GFX908-NEXT:    ; =>This Loop Header: Depth=1
6003; GFX908-NEXT:    ; Child Loop BB18_4 Depth 2
6004; GFX908-NEXT:    s_waitcnt vmcnt(0)
6005; GFX908-NEXT:    v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
6006; GFX908-NEXT:    v_add_f32_e32 v4, v4, v10
6007; GFX908-NEXT:    v_bfe_u32 v5, v4, 16, 1
6008; GFX908-NEXT:    v_add3_u32 v5, v5, v4, s14
6009; GFX908-NEXT:    v_or_b32_e32 v11, 0x400000, v4
6010; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
6011; GFX908-NEXT:    v_cndmask_b32_e32 v4, v5, v11, vcc
6012; GFX908-NEXT:    v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
6013; GFX908-NEXT:    v_and_or_b32 v5, v6, v9, v4
6014; GFX908-NEXT:    v_mov_b32_e32 v4, v5
6015; GFX908-NEXT:    s_mov_b64 s[12:13], exec
6016; GFX908-NEXT:    v_mov_b32_e32 v5, v6
6017; GFX908-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
6018; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
6019; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
6020; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
6021; GFX908-NEXT:    v_readfirstlane_b32 s10, v2
6022; GFX908-NEXT:    v_readfirstlane_b32 s11, v3
6023; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
6024; GFX908-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
6025; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
6026; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
6027; GFX908-NEXT:    s_waitcnt vmcnt(0)
6028; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
6029; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
6030; GFX908-NEXT:    s_cbranch_execnz .LBB18_4
6031; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
6032; GFX908-NEXT:    s_mov_b64 exec, s[12:13]
6033; GFX908-NEXT:    s_waitcnt vmcnt(0)
6034; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
6035; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
6036; GFX908-NEXT:    v_mov_b32_e32 v6, v4
6037; GFX908-NEXT:    buffer_wbinvl1
6038; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
6039; GFX908-NEXT:    s_cbranch_execnz .LBB18_3
6040; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.end
6041; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
6042; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
6043; GFX908-NEXT:    s_setpc_b64 s[30:31]
6044;
6045; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
6046; GFX8:       ; %bb.0:
6047; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6048; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x200, v4
6049; GFX8-NEXT:    v_and_b32_e32 v8, -4, v4
6050; GFX8-NEXT:    v_and_b32_e32 v4, 3, v4
6051; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 3, v4
6052; GFX8-NEXT:    s_mov_b32 s4, 0xffff
6053; GFX8-NEXT:    v_lshlrev_b32_e64 v4, v7, s4
6054; GFX8-NEXT:    v_not_b32_e32 v9, v4
6055; GFX8-NEXT:    s_mov_b64 s[6:7], exec
6056; GFX8-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
6057; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
6058; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
6059; GFX8-NEXT:    v_readfirstlane_b32 s10, v2
6060; GFX8-NEXT:    v_readfirstlane_b32 s11, v3
6061; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
6062; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
6063; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
6064; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
6065; GFX8-NEXT:    s_nop 0
6066; GFX8-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
6067; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
6068; GFX8-NEXT:    s_cbranch_execnz .LBB18_1
6069; GFX8-NEXT:  ; %bb.2:
6070; GFX8-NEXT:    s_mov_b64 exec, s[6:7]
6071; GFX8-NEXT:    s_mov_b64 s[6:7], 0
6072; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
6073; GFX8-NEXT:  .LBB18_3: ; %atomicrmw.start
6074; GFX8-NEXT:    ; =>This Loop Header: Depth=1
6075; GFX8-NEXT:    ; Child Loop BB18_4 Depth 2
6076; GFX8-NEXT:    s_waitcnt vmcnt(0)
6077; GFX8-NEXT:    v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
6078; GFX8-NEXT:    v_add_f32_e32 v4, v4, v10
6079; GFX8-NEXT:    v_bfe_u32 v5, v4, 16, 1
6080; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v4
6081; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
6082; GFX8-NEXT:    v_or_b32_e32 v11, 0x400000, v4
6083; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
6084; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v11, vcc
6085; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
6086; GFX8-NEXT:    v_and_b32_e32 v5, v6, v9
6087; GFX8-NEXT:    v_or_b32_e32 v5, v5, v4
6088; GFX8-NEXT:    v_mov_b32_e32 v4, v5
6089; GFX8-NEXT:    s_mov_b64 s[12:13], exec
6090; GFX8-NEXT:    v_mov_b32_e32 v5, v6
6091; GFX8-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
6092; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
6093; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
6094; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
6095; GFX8-NEXT:    v_readfirstlane_b32 s10, v2
6096; GFX8-NEXT:    v_readfirstlane_b32 s11, v3
6097; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
6098; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
6099; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
6100; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
6101; GFX8-NEXT:    s_waitcnt vmcnt(0)
6102; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
6103; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
6104; GFX8-NEXT:    s_cbranch_execnz .LBB18_4
6105; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
6106; GFX8-NEXT:    s_mov_b64 exec, s[12:13]
6107; GFX8-NEXT:    s_waitcnt vmcnt(0)
6108; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
6109; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
6110; GFX8-NEXT:    v_mov_b32_e32 v6, v4
6111; GFX8-NEXT:    buffer_wbinvl1
6112; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
6113; GFX8-NEXT:    s_cbranch_execnz .LBB18_3
6114; GFX8-NEXT:  ; %bb.6: ; %atomicrmw.end
6115; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
6116; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
6117; GFX8-NEXT:    s_setpc_b64 s[30:31]
6118;
6119; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
6120; GFX7:       ; %bb.0:
6121; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6122; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x200, v4
6123; GFX7-NEXT:    v_and_b32_e32 v8, -4, v4
6124; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
6125; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 3, v4
6126; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v7
6127; GFX7-NEXT:    v_not_b32_e32 v9, v4
6128; GFX7-NEXT:    s_mov_b64 s[6:7], exec
6129; GFX7-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
6130; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
6131; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
6132; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
6133; GFX7-NEXT:    v_readfirstlane_b32 s11, v3
6134; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
6135; GFX7-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
6136; GFX7-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
6137; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
6138; GFX7-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
6139; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
6140; GFX7-NEXT:    s_cbranch_execnz .LBB18_1
6141; GFX7-NEXT:  ; %bb.2:
6142; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
6143; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v5
6144; GFX7-NEXT:    s_mov_b64 s[6:7], 0
6145; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v4
6146; GFX7-NEXT:  .LBB18_3: ; %atomicrmw.start
6147; GFX7-NEXT:    ; =>This Loop Header: Depth=1
6148; GFX7-NEXT:    ; Child Loop BB18_4 Depth 2
6149; GFX7-NEXT:    s_waitcnt vmcnt(0)
6150; GFX7-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
6151; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
6152; GFX7-NEXT:    v_add_f32_e32 v4, v4, v10
6153; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
6154; GFX7-NEXT:    v_lshlrev_b32_e32 v4, v7, v4
6155; GFX7-NEXT:    v_and_b32_e32 v5, v6, v9
6156; GFX7-NEXT:    v_or_b32_e32 v5, v5, v4
6157; GFX7-NEXT:    v_mov_b32_e32 v4, v5
6158; GFX7-NEXT:    s_mov_b64 s[12:13], exec
6159; GFX7-NEXT:    v_mov_b32_e32 v5, v6
6160; GFX7-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
6161; GFX7-NEXT:    ; => This Inner Loop Header: Depth=2
6162; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
6163; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
6164; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
6165; GFX7-NEXT:    v_readfirstlane_b32 s11, v3
6166; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
6167; GFX7-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
6168; GFX7-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
6169; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
6170; GFX7-NEXT:    s_waitcnt vmcnt(0)
6171; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
6172; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
6173; GFX7-NEXT:    s_cbranch_execnz .LBB18_4
6174; GFX7-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
6175; GFX7-NEXT:    s_mov_b64 exec, s[12:13]
6176; GFX7-NEXT:    s_waitcnt vmcnt(0)
6177; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
6178; GFX7-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
6179; GFX7-NEXT:    v_mov_b32_e32 v6, v4
6180; GFX7-NEXT:    buffer_wbinvl1
6181; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
6182; GFX7-NEXT:    s_cbranch_execnz .LBB18_3
6183; GFX7-NEXT:  ; %bb.6: ; %atomicrmw.end
6184; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
6185; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
6186; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
6187; GFX7-NEXT:    s_setpc_b64 s[30:31]
6188;
6189; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
6190; GFX6:       ; %bb.0:
6191; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6192; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x200, v4
6193; GFX6-NEXT:    v_and_b32_e32 v8, -4, v4
6194; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
6195; GFX6-NEXT:    v_lshlrev_b32_e32 v7, 3, v4
6196; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v7
6197; GFX6-NEXT:    v_not_b32_e32 v9, v4
6198; GFX6-NEXT:    s_mov_b64 s[6:7], exec
6199; GFX6-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
6200; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
6201; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
6202; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
6203; GFX6-NEXT:    v_readfirstlane_b32 s11, v3
6204; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
6205; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
6206; GFX6-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
6207; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
6208; GFX6-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
6209; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
6210; GFX6-NEXT:    s_cbranch_execnz .LBB18_1
6211; GFX6-NEXT:  ; %bb.2:
6212; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
6213; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v5
6214; GFX6-NEXT:    s_mov_b64 s[6:7], 0
6215; GFX6-NEXT:    v_and_b32_e32 v10, 0xffff0000, v4
6216; GFX6-NEXT:  .LBB18_3: ; %atomicrmw.start
6217; GFX6-NEXT:    ; =>This Loop Header: Depth=1
6218; GFX6-NEXT:    ; Child Loop BB18_4 Depth 2
6219; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
6220; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
6221; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
6222; GFX6-NEXT:    v_add_f32_e32 v4, v4, v10
6223; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
6224; GFX6-NEXT:    v_lshlrev_b32_e32 v4, v7, v4
6225; GFX6-NEXT:    v_and_b32_e32 v5, v6, v9
6226; GFX6-NEXT:    v_or_b32_e32 v5, v5, v4
6227; GFX6-NEXT:    v_mov_b32_e32 v4, v5
6228; GFX6-NEXT:    s_mov_b64 s[12:13], exec
6229; GFX6-NEXT:    v_mov_b32_e32 v5, v6
6230; GFX6-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
6231; GFX6-NEXT:    ; => This Inner Loop Header: Depth=2
6232; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
6233; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
6234; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
6235; GFX6-NEXT:    v_readfirstlane_b32 s11, v3
6236; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
6237; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
6238; GFX6-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
6239; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
6240; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
6241; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
6242; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
6243; GFX6-NEXT:    s_cbranch_execnz .LBB18_4
6244; GFX6-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
6245; GFX6-NEXT:    s_mov_b64 exec, s[12:13]
6246; GFX6-NEXT:    s_waitcnt vmcnt(0)
6247; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
6248; GFX6-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
6249; GFX6-NEXT:    v_mov_b32_e32 v6, v4
6250; GFX6-NEXT:    buffer_wbinvl1
6251; GFX6-NEXT:    s_andn2_b64 exec, exec, s[6:7]
6252; GFX6-NEXT:    s_cbranch_execnz .LBB18_3
6253; GFX6-NEXT:  ; %bb.6: ; %atomicrmw.end
6254; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
6255; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
6256; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
6257; GFX6-NEXT:    s_waitcnt expcnt(0)
6258; GFX6-NEXT:    s_setpc_b64 s[30:31]
6259  %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256
6260  %result = atomicrmw fadd ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
6261  ret bfloat %result
6262}
6263
6264; --------------------------------------------------------------------
6265; <2 x half>
6266; --------------------------------------------------------------------
6267
6268define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 {
6269; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
6270; GFX12:       ; %bb.0:
6271; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
6272; GFX12-NEXT:    s_wait_expcnt 0x0
6273; GFX12-NEXT:    s_wait_samplecnt 0x0
6274; GFX12-NEXT:    s_wait_bvhcnt 0x0
6275; GFX12-NEXT:    s_wait_kmcnt 0x0
6276; GFX12-NEXT:    v_mov_b32_e32 v1, s16
6277; GFX12-NEXT:    s_wait_storecnt 0x0
6278; GFX12-NEXT:    buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
6279; GFX12-NEXT:    s_wait_loadcnt 0x0
6280; GFX12-NEXT:    global_inv scope:SCOPE_DEV
6281; GFX12-NEXT:    s_setpc_b64 s[30:31]
6282;
6283; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
6284; GFX940:       ; %bb.0:
6285; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6286; GFX940-NEXT:    v_mov_b32_e32 v1, s16
6287; GFX940-NEXT:    buffer_wbl2 sc1
6288; GFX940-NEXT:    buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 sc0
6289; GFX940-NEXT:    s_waitcnt vmcnt(0)
6290; GFX940-NEXT:    buffer_inv sc1
6291; GFX940-NEXT:    s_setpc_b64 s[30:31]
6292;
6293; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
6294; GFX11:       ; %bb.0:
6295; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6296; GFX11-NEXT:    s_add_i32 s4, s16, 0x400
6297; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
6298; GFX11-NEXT:    v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4
6299; GFX11-NEXT:    v_mov_b32_e32 v0, s16
6300; GFX11-NEXT:    s_mov_b32 s4, 0
6301; GFX11-NEXT:    buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
6302; GFX11-NEXT:  .LBB19_1: ; %atomicrmw.start
6303; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
6304; GFX11-NEXT:    s_waitcnt vmcnt(0)
6305; GFX11-NEXT:    v_mov_b32_e32 v5, v0
6306; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
6307; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6308; GFX11-NEXT:    v_pk_add_f16 v4, v5, v2
6309; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
6310; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
6311; GFX11-NEXT:    s_waitcnt vmcnt(0)
6312; GFX11-NEXT:    buffer_gl1_inv
6313; GFX11-NEXT:    buffer_gl0_inv
6314; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
6315; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
6316; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
6317; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
6318; GFX11-NEXT:    s_cbranch_execnz .LBB19_1
6319; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
6320; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
6321; GFX11-NEXT:    s_setpc_b64 s[30:31]
6322;
6323; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
6324; GFX10:       ; %bb.0:
6325; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6326; GFX10-NEXT:    v_mov_b32_e32 v2, v0
6327; GFX10-NEXT:    v_mov_b32_e32 v0, s20
6328; GFX10-NEXT:    s_add_i32 s4, s20, 0x400
6329; GFX10-NEXT:    v_mov_b32_e32 v3, s4
6330; GFX10-NEXT:    s_mov_b32 s4, 0
6331; GFX10-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
6332; GFX10-NEXT:  .LBB19_1: ; %atomicrmw.start
6333; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
6334; GFX10-NEXT:    s_waitcnt vmcnt(0)
6335; GFX10-NEXT:    v_mov_b32_e32 v5, v0
6336; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
6337; GFX10-NEXT:    v_pk_add_f16 v4, v5, v2
6338; GFX10-NEXT:    v_mov_b32_e32 v0, v4
6339; GFX10-NEXT:    v_mov_b32_e32 v1, v5
6340; GFX10-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
6341; GFX10-NEXT:    s_waitcnt vmcnt(0)
6342; GFX10-NEXT:    buffer_gl1_inv
6343; GFX10-NEXT:    buffer_gl0_inv
6344; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
6345; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
6346; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
6347; GFX10-NEXT:    s_cbranch_execnz .LBB19_1
6348; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
6349; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
6350; GFX10-NEXT:    s_setpc_b64 s[30:31]
6351;
6352; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
6353; GFX90A:       ; %bb.0:
6354; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6355; GFX90A-NEXT:    v_mov_b32_e32 v1, s20
6356; GFX90A-NEXT:    buffer_atomic_pk_add_f16 v0, v1, s[16:19], 0 offen offset:1024 glc
6357; GFX90A-NEXT:    s_waitcnt vmcnt(0)
6358; GFX90A-NEXT:    buffer_wbinvl1
6359; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6360;
6361; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
6362; GFX908:       ; %bb.0:
6363; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6364; GFX908-NEXT:    v_mov_b32_e32 v2, v0
6365; GFX908-NEXT:    v_mov_b32_e32 v0, s20
6366; GFX908-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
6367; GFX908-NEXT:    s_add_i32 s6, s20, 0x400
6368; GFX908-NEXT:    s_mov_b64 s[4:5], 0
6369; GFX908-NEXT:    v_mov_b32_e32 v3, s6
6370; GFX908-NEXT:  .LBB19_1: ; %atomicrmw.start
6371; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
6372; GFX908-NEXT:    s_waitcnt vmcnt(0)
6373; GFX908-NEXT:    v_mov_b32_e32 v5, v0
6374; GFX908-NEXT:    v_pk_add_f16 v4, v5, v2
6375; GFX908-NEXT:    v_mov_b32_e32 v0, v4
6376; GFX908-NEXT:    v_mov_b32_e32 v1, v5
6377; GFX908-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
6378; GFX908-NEXT:    s_waitcnt vmcnt(0)
6379; GFX908-NEXT:    buffer_wbinvl1
6380; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
6381; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6382; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6383; GFX908-NEXT:    s_cbranch_execnz .LBB19_1
6384; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
6385; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
6386; GFX908-NEXT:    s_setpc_b64 s[30:31]
6387;
6388; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
6389; GFX8:       ; %bb.0:
6390; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6391; GFX8-NEXT:    v_mov_b32_e32 v2, v0
6392; GFX8-NEXT:    v_mov_b32_e32 v0, s20
6393; GFX8-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
6394; GFX8-NEXT:    s_add_i32 s6, s20, 0x400
6395; GFX8-NEXT:    s_mov_b64 s[4:5], 0
6396; GFX8-NEXT:    v_mov_b32_e32 v3, s6
6397; GFX8-NEXT:  .LBB19_1: ; %atomicrmw.start
6398; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
6399; GFX8-NEXT:    s_waitcnt vmcnt(0)
6400; GFX8-NEXT:    v_mov_b32_e32 v5, v0
6401; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
6402; GFX8-NEXT:    v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
6403; GFX8-NEXT:    v_add_f16_e32 v1, v5, v2
6404; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
6405; GFX8-NEXT:    v_or_b32_e32 v4, v1, v0
6406; GFX8-NEXT:    v_mov_b32_e32 v0, v4
6407; GFX8-NEXT:    v_mov_b32_e32 v1, v5
6408; GFX8-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
6409; GFX8-NEXT:    s_waitcnt vmcnt(0)
6410; GFX8-NEXT:    buffer_wbinvl1
6411; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
6412; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6413; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6414; GFX8-NEXT:    s_cbranch_execnz .LBB19_1
6415; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
6416; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
6417; GFX8-NEXT:    s_setpc_b64 s[30:31]
6418;
6419; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
6420; GFX7:       ; %bb.0:
6421; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6422; GFX7-NEXT:    v_mov_b32_e32 v2, s20
6423; GFX7-NEXT:    buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
6424; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
6425; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v0
6426; GFX7-NEXT:    s_add_i32 s6, s20, 0x400
6427; GFX7-NEXT:    s_mov_b64 s[4:5], 0
6428; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v1
6429; GFX7-NEXT:    s_waitcnt vmcnt(0)
6430; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
6431; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v3
6432; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
6433; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v4
6434; GFX7-NEXT:    v_mov_b32_e32 v4, s6
6435; GFX7-NEXT:  .LBB19_1: ; %atomicrmw.start
6436; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
6437; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
6438; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
6439; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v1
6440; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v0
6441; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
6442; GFX7-NEXT:    v_add_f32_e32 v5, v5, v2
6443; GFX7-NEXT:    v_add_f32_e32 v6, v6, v3
6444; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
6445; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v6
6446; GFX7-NEXT:    v_or_b32_e32 v6, v0, v1
6447; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
6448; GFX7-NEXT:    v_or_b32_e32 v5, v7, v0
6449; GFX7-NEXT:    v_mov_b32_e32 v8, v6
6450; GFX7-NEXT:    v_mov_b32_e32 v7, v5
6451; GFX7-NEXT:    buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc
6452; GFX7-NEXT:    s_waitcnt vmcnt(0)
6453; GFX7-NEXT:    buffer_wbinvl1
6454; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
6455; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v7
6456; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
6457; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
6458; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6459; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6460; GFX7-NEXT:    s_cbranch_execnz .LBB19_1
6461; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
6462; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
6463; GFX7-NEXT:    s_setpc_b64 s[30:31]
6464;
6465; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
6466; GFX6:       ; %bb.0:
6467; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6468; GFX6-NEXT:    v_mov_b32_e32 v2, s20
6469; GFX6-NEXT:    buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
6470; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
6471; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v0
6472; GFX6-NEXT:    s_add_i32 s6, s20, 0x400
6473; GFX6-NEXT:    s_mov_b64 s[4:5], 0
6474; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v1
6475; GFX6-NEXT:    s_waitcnt vmcnt(0)
6476; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
6477; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v3
6478; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
6479; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v4
6480; GFX6-NEXT:    v_mov_b32_e32 v4, s6
6481; GFX6-NEXT:  .LBB19_1: ; %atomicrmw.start
6482; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
6483; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
6484; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
6485; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v1
6486; GFX6-NEXT:    v_cvt_f32_f16_e32 v6, v0
6487; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
6488; GFX6-NEXT:    v_add_f32_e32 v5, v5, v2
6489; GFX6-NEXT:    v_add_f32_e32 v6, v6, v3
6490; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v5
6491; GFX6-NEXT:    s_waitcnt expcnt(0)
6492; GFX6-NEXT:    v_cvt_f16_f32_e32 v7, v6
6493; GFX6-NEXT:    v_or_b32_e32 v6, v0, v1
6494; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
6495; GFX6-NEXT:    v_or_b32_e32 v5, v7, v0
6496; GFX6-NEXT:    v_mov_b32_e32 v8, v6
6497; GFX6-NEXT:    v_mov_b32_e32 v7, v5
6498; GFX6-NEXT:    buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc
6499; GFX6-NEXT:    s_waitcnt vmcnt(0)
6500; GFX6-NEXT:    buffer_wbinvl1
6501; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
6502; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v7
6503; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
6504; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
6505; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6506; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6507; GFX6-NEXT:    s_cbranch_execnz .LBB19_1
6508; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
6509; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
6510; GFX6-NEXT:    s_waitcnt expcnt(0)
6511; GFX6-NEXT:    s_setpc_b64 s[30:31]
6512  %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256
6513  %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
6514  ret <2 x half> %result
6515}
6516
; Agent-scope, seq_cst `atomicrmw fadd` of a <2 x half> value through a
; `ptr addrspace(7)` argument passed `inreg`. The old value returned by the
; atomic is never used (the "noret" form) and the operation carries the
; !amdgpu.no.fine.grained.memory annotation.
;
; The address is %ptr advanced by 256 <2 x half> elements, which appears in the
; expected output below as a 1024-byte (0x400) offset.
;
; Expected lowering per RUN line, as recorded in the checks that follow:
;   * gfx1200, gfx940, gfx90a, gfx908 -- a single buffer_atomic_pk_add_f16.
;   * gfx1100, gfx1010 -- a buffer_atomic_cmpswap retry loop around v_pk_add_f16.
;   * tonga -- a buffer_atomic_cmpswap retry loop; the two halves are added
;     separately (v_add_f16 and v_add_f16_sdwa) and then re-packed.
;   * hawaii, tahiti -- a buffer_atomic_cmpswap retry loop; each half is
;     converted to f32, added with v_add_f32, and converted back to f16.
;
; The assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
6517define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 {
6518; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
6519; GFX12:       ; %bb.0:
6520; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
6521; GFX12-NEXT:    s_wait_expcnt 0x0
6522; GFX12-NEXT:    s_wait_samplecnt 0x0
6523; GFX12-NEXT:    s_wait_bvhcnt 0x0
6524; GFX12-NEXT:    s_wait_kmcnt 0x0
6525; GFX12-NEXT:    v_mov_b32_e32 v1, s16
6526; GFX12-NEXT:    s_wait_storecnt 0x0
6527; GFX12-NEXT:    buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024
6528; GFX12-NEXT:    s_wait_storecnt 0x0
6529; GFX12-NEXT:    global_inv scope:SCOPE_DEV
6530; GFX12-NEXT:    s_setpc_b64 s[30:31]
6531;
6532; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
6533; GFX940:       ; %bb.0:
6534; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6535; GFX940-NEXT:    v_mov_b32_e32 v1, s16
6536; GFX940-NEXT:    buffer_wbl2 sc1
6537; GFX940-NEXT:    buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024
6538; GFX940-NEXT:    s_waitcnt vmcnt(0)
6539; GFX940-NEXT:    buffer_inv sc1
6540; GFX940-NEXT:    s_setpc_b64 s[30:31]
6541;
6542; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
6543; GFX11:       ; %bb.0:
6544; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6545; GFX11-NEXT:    v_mov_b32_e32 v1, s16
6546; GFX11-NEXT:    s_add_i32 s4, s16, 0x400
6547; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
6548; GFX11-NEXT:    v_mov_b32_e32 v3, s4
6549; GFX11-NEXT:    s_mov_b32 s4, 0
6550; GFX11-NEXT:    buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024
6551; GFX11-NEXT:  .LBB20_1: ; %atomicrmw.start
6552; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
6553; GFX11-NEXT:    s_waitcnt vmcnt(0)
6554; GFX11-NEXT:    v_pk_add_f16 v1, v2, v0
6555; GFX11-NEXT:    v_mov_b32_e32 v5, v2
6556; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
6557; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
6558; GFX11-NEXT:    v_mov_b32_e32 v4, v1
6559; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
6560; GFX11-NEXT:    s_waitcnt vmcnt(0)
6561; GFX11-NEXT:    buffer_gl1_inv
6562; GFX11-NEXT:    buffer_gl0_inv
6563; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v2
6564; GFX11-NEXT:    v_mov_b32_e32 v2, v4
6565; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
6566; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
6567; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
6568; GFX11-NEXT:    s_cbranch_execnz .LBB20_1
6569; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
6570; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
6571; GFX11-NEXT:    s_setpc_b64 s[30:31]
6572;
6573; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
6574; GFX10:       ; %bb.0:
6575; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6576; GFX10-NEXT:    v_mov_b32_e32 v1, s20
6577; GFX10-NEXT:    s_add_i32 s4, s20, 0x400
6578; GFX10-NEXT:    v_mov_b32_e32 v3, s4
6579; GFX10-NEXT:    s_mov_b32 s4, 0
6580; GFX10-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
6581; GFX10-NEXT:  .LBB20_1: ; %atomicrmw.start
6582; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
6583; GFX10-NEXT:    s_waitcnt vmcnt(0)
6584; GFX10-NEXT:    v_pk_add_f16 v1, v2, v0
6585; GFX10-NEXT:    v_mov_b32_e32 v5, v2
6586; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
6587; GFX10-NEXT:    v_mov_b32_e32 v4, v1
6588; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
6589; GFX10-NEXT:    s_waitcnt vmcnt(0)
6590; GFX10-NEXT:    buffer_gl1_inv
6591; GFX10-NEXT:    buffer_gl0_inv
6592; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v2
6593; GFX10-NEXT:    v_mov_b32_e32 v2, v4
6594; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
6595; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
6596; GFX10-NEXT:    s_cbranch_execnz .LBB20_1
6597; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
6598; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
6599; GFX10-NEXT:    s_setpc_b64 s[30:31]
6600;
6601; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
6602; GFX90A:       ; %bb.0:
6603; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6604; GFX90A-NEXT:    v_mov_b32_e32 v1, s20
6605; GFX90A-NEXT:    buffer_atomic_pk_add_f16 v0, v1, s[16:19], 0 offen offset:1024
6606; GFX90A-NEXT:    s_waitcnt vmcnt(0)
6607; GFX90A-NEXT:    buffer_wbinvl1
6608; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6609;
6610; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
6611; GFX908:       ; %bb.0:
6612; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6613; GFX908-NEXT:    v_mov_b32_e32 v1, s20
6614; GFX908-NEXT:    buffer_atomic_pk_add_f16 v0, v1, s[16:19], 0 offen offset:1024
6615; GFX908-NEXT:    s_waitcnt vmcnt(0)
6616; GFX908-NEXT:    buffer_wbinvl1
6617; GFX908-NEXT:    s_setpc_b64 s[30:31]
6618;
6619; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
6620; GFX8:       ; %bb.0:
6621; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6622; GFX8-NEXT:    v_mov_b32_e32 v1, s20
6623; GFX8-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
6624; GFX8-NEXT:    s_add_i32 s6, s20, 0x400
6625; GFX8-NEXT:    s_mov_b64 s[4:5], 0
6626; GFX8-NEXT:    v_mov_b32_e32 v3, s6
6627; GFX8-NEXT:  .LBB20_1: ; %atomicrmw.start
6628; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
6629; GFX8-NEXT:    s_waitcnt vmcnt(0)
6630; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
6631; GFX8-NEXT:    v_add_f16_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
6632; GFX8-NEXT:    v_add_f16_e32 v4, v2, v0
6633; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
6634; GFX8-NEXT:    v_or_b32_e32 v1, v4, v1
6635; GFX8-NEXT:    v_mov_b32_e32 v5, v2
6636; GFX8-NEXT:    v_mov_b32_e32 v4, v1
6637; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
6638; GFX8-NEXT:    s_waitcnt vmcnt(0)
6639; GFX8-NEXT:    buffer_wbinvl1
6640; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v2
6641; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6642; GFX8-NEXT:    v_mov_b32_e32 v2, v4
6643; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6644; GFX8-NEXT:    s_cbranch_execnz .LBB20_1
6645; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
6646; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
6647; GFX8-NEXT:    s_setpc_b64 s[30:31]
6648;
6649; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
6650; GFX7:       ; %bb.0:
6651; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6652; GFX7-NEXT:    v_mov_b32_e32 v2, s20
6653; GFX7-NEXT:    buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
6654; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
6655; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v0
6656; GFX7-NEXT:    s_add_i32 s6, s20, 0x400
6657; GFX7-NEXT:    s_mov_b64 s[4:5], 0
6658; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v1
6659; GFX7-NEXT:    s_waitcnt vmcnt(0)
6660; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
6661; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v2
6662; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v1
6663; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v5
6664; GFX7-NEXT:    v_mov_b32_e32 v2, s6
6665; GFX7-NEXT:  .LBB20_1: ; %atomicrmw.start
6666; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
6667; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
6668; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
6669; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v4
6670; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v3
6671; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
6672; GFX7-NEXT:    v_add_f32_e32 v5, v5, v0
6673; GFX7-NEXT:    v_add_f32_e32 v6, v6, v1
6674; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v5
6675; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
6676; GFX7-NEXT:    v_or_b32_e32 v5, v3, v4
6677; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
6678; GFX7-NEXT:    v_or_b32_e32 v4, v6, v3
6679; GFX7-NEXT:    v_mov_b32_e32 v7, v5
6680; GFX7-NEXT:    v_mov_b32_e32 v6, v4
6681; GFX7-NEXT:    buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc
6682; GFX7-NEXT:    s_waitcnt vmcnt(0)
6683; GFX7-NEXT:    buffer_wbinvl1
6684; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v6
6685; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v6
6686; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
6687; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
6688; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6689; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6690; GFX7-NEXT:    s_cbranch_execnz .LBB20_1
6691; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
6692; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
6693; GFX7-NEXT:    s_setpc_b64 s[30:31]
6694;
6695; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
6696; GFX6:       ; %bb.0:
6697; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6698; GFX6-NEXT:    v_mov_b32_e32 v2, s20
6699; GFX6-NEXT:    buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
6700; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
6701; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v0
6702; GFX6-NEXT:    s_add_i32 s6, s20, 0x400
6703; GFX6-NEXT:    s_mov_b64 s[4:5], 0
6704; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v1
6705; GFX6-NEXT:    s_waitcnt vmcnt(0)
6706; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
6707; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v2
6708; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v1
6709; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v5
6710; GFX6-NEXT:    v_mov_b32_e32 v2, s6
6711; GFX6-NEXT:  .LBB20_1: ; %atomicrmw.start
6712; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
6713; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
6714; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
6715; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v4
6716; GFX6-NEXT:    s_waitcnt expcnt(0)
6717; GFX6-NEXT:    v_cvt_f32_f16_e32 v6, v3
6718; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
6719; GFX6-NEXT:    v_add_f32_e32 v5, v5, v0
6720; GFX6-NEXT:    v_add_f32_e32 v6, v6, v1
6721; GFX6-NEXT:    v_cvt_f16_f32_e32 v7, v5
6722; GFX6-NEXT:    v_cvt_f16_f32_e32 v6, v6
6723; GFX6-NEXT:    v_or_b32_e32 v5, v3, v4
6724; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
6725; GFX6-NEXT:    v_or_b32_e32 v4, v6, v3
6726; GFX6-NEXT:    v_mov_b32_e32 v7, v5
6727; GFX6-NEXT:    v_mov_b32_e32 v6, v4
6728; GFX6-NEXT:    buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc
6729; GFX6-NEXT:    s_waitcnt vmcnt(0)
6730; GFX6-NEXT:    buffer_wbinvl1
6731; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 16, v6
6732; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v6
6733; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
6734; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
6735; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6736; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6737; GFX6-NEXT:    s_cbranch_execnz .LBB20_1
6738; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
6739; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
6740; GFX6-NEXT:    s_waitcnt expcnt(0)
6741; GFX6-NEXT:    s_setpc_b64 s[30:31]
; Step %ptr forward by 256 <2 x half> elements; this is the 1024-byte offset
; that the expected instructions above encode.
6742  %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256
; The old value produced by the atomic is left unused and the function returns
; void, so this exercises the no-return form of the operation.
6743  %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
6744  ret void
6745}
6746
6747define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, <2 x half> %val) #0 {
6748; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
6749; GFX12:       ; %bb.0:
6750; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
6751; GFX12-NEXT:    s_wait_expcnt 0x0
6752; GFX12-NEXT:    s_wait_samplecnt 0x0
6753; GFX12-NEXT:    s_wait_bvhcnt 0x0
6754; GFX12-NEXT:    s_wait_kmcnt 0x0
6755; GFX12-NEXT:    s_mov_b32 s1, exec_lo
6756; GFX12-NEXT:    s_wait_storecnt 0x0
6757; GFX12-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
6758; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
6759; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
6760; GFX12-NEXT:    v_readfirstlane_b32 s6, v2
6761; GFX12-NEXT:    v_readfirstlane_b32 s7, v3
6762; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
6763; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
6764; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
6765; GFX12-NEXT:    s_wait_alu 0xfffe
6766; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6767; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
6768; GFX12-NEXT:    s_wait_alu 0xfffe
6769; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
6770; GFX12-NEXT:    s_wait_loadcnt 0x0
6771; GFX12-NEXT:    buffer_atomic_pk_add_f16 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN
6772; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
6773; GFX12-NEXT:    ; implicit-def: $vgpr4
6774; GFX12-NEXT:    s_wait_alu 0xfffe
6775; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
6776; GFX12-NEXT:    s_cbranch_execnz .LBB21_1
6777; GFX12-NEXT:  ; %bb.2:
6778; GFX12-NEXT:    s_mov_b32 exec_lo, s1
6779; GFX12-NEXT:    s_wait_loadcnt 0x0
6780; GFX12-NEXT:    v_mov_b32_e32 v0, v5
6781; GFX12-NEXT:    global_inv scope:SCOPE_DEV
6782; GFX12-NEXT:    s_wait_alu 0xfffe
6783; GFX12-NEXT:    s_setpc_b64 s[30:31]
6784;
6785; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
6786; GFX940:       ; %bb.0:
6787; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6788; GFX940-NEXT:    s_mov_b64 s[2:3], exec
6789; GFX940-NEXT:    buffer_wbl2 sc1
6790; GFX940-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
6791; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
6792; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
6793; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
6794; GFX940-NEXT:    v_readfirstlane_b32 s7, v3
6795; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
6796; GFX940-NEXT:    s_nop 0
6797; GFX940-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
6798; GFX940-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
6799; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
6800; GFX940-NEXT:    s_waitcnt vmcnt(0)
6801; GFX940-NEXT:    buffer_atomic_pk_add_f16 v5, v4, s[4:7], 0 offen offset:1024 sc0
6802; GFX940-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
6803; GFX940-NEXT:    ; implicit-def: $vgpr4
6804; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
6805; GFX940-NEXT:    s_cbranch_execnz .LBB21_1
6806; GFX940-NEXT:  ; %bb.2:
6807; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
6808; GFX940-NEXT:    s_waitcnt vmcnt(0)
6809; GFX940-NEXT:    v_mov_b32_e32 v0, v5
6810; GFX940-NEXT:    buffer_inv sc1
6811; GFX940-NEXT:    s_setpc_b64 s[30:31]
6812;
6813; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
6814; GFX11:       ; %bb.0:
6815; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6816; GFX11-NEXT:    v_add_nc_u32_e32 v9, 0x400, v4
6817; GFX11-NEXT:    s_mov_b32 s1, 0
6818; GFX11-NEXT:    s_mov_b32 s2, exec_lo
6819; GFX11-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
6820; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
6821; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
6822; GFX11-NEXT:    v_readfirstlane_b32 s6, v2
6823; GFX11-NEXT:    v_readfirstlane_b32 s7, v3
6824; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
6825; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
6826; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
6827; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
6828; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
6829; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
6830; GFX11-NEXT:    buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024
6831; GFX11-NEXT:    ; implicit-def: $vgpr4
6832; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
6833; GFX11-NEXT:    s_cbranch_execnz .LBB21_1
6834; GFX11-NEXT:  ; %bb.2:
6835; GFX11-NEXT:    s_mov_b32 exec_lo, s2
6836; GFX11-NEXT:    .p2align 6
6837; GFX11-NEXT:  .LBB21_3: ; %atomicrmw.start
6838; GFX11-NEXT:    ; =>This Loop Header: Depth=1
6839; GFX11-NEXT:    ; Child Loop BB21_4 Depth 2
6840; GFX11-NEXT:    s_waitcnt vmcnt(0)
6841; GFX11-NEXT:    v_pk_add_f16 v7, v8, v5
6842; GFX11-NEXT:    s_mov_b32 s2, exec_lo
6843; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
6844; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6845; GFX11-NEXT:    v_mov_b32_e32 v6, v7
6846; GFX11-NEXT:    v_mov_b32_e32 v7, v8
6847; GFX11-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
6848; GFX11-NEXT:    ; => This Inner Loop Header: Depth=2
6849; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
6850; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
6851; GFX11-NEXT:    v_readfirstlane_b32 s6, v2
6852; GFX11-NEXT:    v_readfirstlane_b32 s7, v3
6853; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
6854; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
6855; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
6856; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
6857; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
6858; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
6859; GFX11-NEXT:    s_waitcnt vmcnt(0)
6860; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc
6861; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
6862; GFX11-NEXT:    s_cbranch_execnz .LBB21_4
6863; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
6864; GFX11-NEXT:    s_mov_b32 exec_lo, s2
6865; GFX11-NEXT:    s_waitcnt vmcnt(0)
6866; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v6, v8
6867; GFX11-NEXT:    v_mov_b32_e32 v8, v6
6868; GFX11-NEXT:    buffer_gl1_inv
6869; GFX11-NEXT:    buffer_gl0_inv
6870; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
6871; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
6872; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
6873; GFX11-NEXT:    s_cbranch_execnz .LBB21_3
6874; GFX11-NEXT:  ; %bb.6: ; %atomicrmw.end
6875; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
6876; GFX11-NEXT:    v_mov_b32_e32 v0, v6
6877; GFX11-NEXT:    s_setpc_b64 s[30:31]
6878;
6879; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
6880; GFX10:       ; %bb.0:
6881; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6882; GFX10-NEXT:    v_add_nc_u32_e32 v9, 0x400, v4
6883; GFX10-NEXT:    s_mov_b32 s5, 0
6884; GFX10-NEXT:    s_mov_b32 s6, exec_lo
6885; GFX10-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
6886; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
6887; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
6888; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
6889; GFX10-NEXT:    v_readfirstlane_b32 s11, v3
6890; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
6891; GFX10-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
6892; GFX10-NEXT:    s_and_b32 s4, vcc_lo, s4
6893; GFX10-NEXT:    s_and_saveexec_b32 s4, s4
6894; GFX10-NEXT:    buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
6895; GFX10-NEXT:    ; implicit-def: $vgpr4
6896; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
6897; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
6898; GFX10-NEXT:    s_cbranch_execnz .LBB21_1
6899; GFX10-NEXT:  ; %bb.2:
6900; GFX10-NEXT:    s_mov_b32 exec_lo, s6
6901; GFX10-NEXT:  .LBB21_3: ; %atomicrmw.start
6902; GFX10-NEXT:    ; =>This Loop Header: Depth=1
6903; GFX10-NEXT:    ; Child Loop BB21_4 Depth 2
6904; GFX10-NEXT:    s_waitcnt vmcnt(0)
6905; GFX10-NEXT:    v_pk_add_f16 v7, v8, v5
6906; GFX10-NEXT:    s_mov_b32 s6, exec_lo
6907; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
6908; GFX10-NEXT:    v_mov_b32_e32 v6, v7
6909; GFX10-NEXT:    v_mov_b32_e32 v7, v8
6910; GFX10-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
6911; GFX10-NEXT:    ; => This Inner Loop Header: Depth=2
6912; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
6913; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
6914; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
6915; GFX10-NEXT:    v_readfirstlane_b32 s11, v3
6916; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
6917; GFX10-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
6918; GFX10-NEXT:    s_and_b32 s4, vcc_lo, s4
6919; GFX10-NEXT:    s_and_saveexec_b32 s4, s4
6920; GFX10-NEXT:    s_waitcnt vmcnt(0)
6921; GFX10-NEXT:    buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
6922; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
6923; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
6924; GFX10-NEXT:    s_cbranch_execnz .LBB21_4
6925; GFX10-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
6926; GFX10-NEXT:    s_mov_b32 exec_lo, s6
6927; GFX10-NEXT:    s_waitcnt vmcnt(0)
6928; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v6, v8
6929; GFX10-NEXT:    v_mov_b32_e32 v8, v6
6930; GFX10-NEXT:    buffer_gl1_inv
6931; GFX10-NEXT:    buffer_gl0_inv
6932; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
6933; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
6934; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
6935; GFX10-NEXT:    s_cbranch_execnz .LBB21_3
6936; GFX10-NEXT:  ; %bb.6: ; %atomicrmw.end
6937; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
6938; GFX10-NEXT:    v_mov_b32_e32 v0, v6
6939; GFX10-NEXT:    s_setpc_b64 s[30:31]
6940;
6941; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
6942; GFX90A:       ; %bb.0:
6943; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6944; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
6945; GFX90A-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
6946; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
6947; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
6948; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
6949; GFX90A-NEXT:    v_readfirstlane_b32 s11, v3
6950; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
6951; GFX90A-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
6952; GFX90A-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
6953; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
6954; GFX90A-NEXT:    s_waitcnt vmcnt(0)
6955; GFX90A-NEXT:    buffer_atomic_pk_add_f16 v5, v4, s[8:11], 0 offen offset:1024 glc
6956; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
6957; GFX90A-NEXT:    ; implicit-def: $vgpr4
6958; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
6959; GFX90A-NEXT:    s_cbranch_execnz .LBB21_1
6960; GFX90A-NEXT:  ; %bb.2:
6961; GFX90A-NEXT:    s_mov_b64 exec, s[6:7]
6962; GFX90A-NEXT:    s_waitcnt vmcnt(0)
6963; GFX90A-NEXT:    v_mov_b32_e32 v0, v5
6964; GFX90A-NEXT:    buffer_wbinvl1
6965; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6966;
6967; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
6968; GFX908:       ; %bb.0:
6969; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6970; GFX908-NEXT:    v_add_u32_e32 v9, 0x400, v4
6971; GFX908-NEXT:    s_mov_b64 s[6:7], exec
6972; GFX908-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
6973; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
6974; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
6975; GFX908-NEXT:    v_readfirstlane_b32 s10, v2
6976; GFX908-NEXT:    v_readfirstlane_b32 s11, v3
6977; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
6978; GFX908-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
6979; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
6980; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
6981; GFX908-NEXT:    s_nop 0
6982; GFX908-NEXT:    buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
6983; GFX908-NEXT:    ; implicit-def: $vgpr4
6984; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
6985; GFX908-NEXT:    s_cbranch_execnz .LBB21_1
6986; GFX908-NEXT:  ; %bb.2:
6987; GFX908-NEXT:    s_mov_b64 exec, s[6:7]
6988; GFX908-NEXT:    s_mov_b64 s[6:7], 0
6989; GFX908-NEXT:  .LBB21_3: ; %atomicrmw.start
6990; GFX908-NEXT:    ; =>This Loop Header: Depth=1
6991; GFX908-NEXT:    ; Child Loop BB21_4 Depth 2
6992; GFX908-NEXT:    s_waitcnt vmcnt(0)
6993; GFX908-NEXT:    v_pk_add_f16 v7, v8, v5
6994; GFX908-NEXT:    v_mov_b32_e32 v6, v7
6995; GFX908-NEXT:    s_mov_b64 s[12:13], exec
6996; GFX908-NEXT:    v_mov_b32_e32 v7, v8
6997; GFX908-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
6998; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
6999; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
7000; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
7001; GFX908-NEXT:    v_readfirstlane_b32 s10, v2
7002; GFX908-NEXT:    v_readfirstlane_b32 s11, v3
7003; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
7004; GFX908-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
7005; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
7006; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
7007; GFX908-NEXT:    s_waitcnt vmcnt(0)
7008; GFX908-NEXT:    buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
7009; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
7010; GFX908-NEXT:    s_cbranch_execnz .LBB21_4
7011; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
7012; GFX908-NEXT:    s_mov_b64 exec, s[12:13]
7013; GFX908-NEXT:    s_waitcnt vmcnt(0)
7014; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v8
7015; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
7016; GFX908-NEXT:    v_mov_b32_e32 v8, v6
7017; GFX908-NEXT:    buffer_wbinvl1
7018; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
7019; GFX908-NEXT:    s_cbranch_execnz .LBB21_3
7020; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.end
7021; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
7022; GFX908-NEXT:    v_mov_b32_e32 v0, v6
7023; GFX908-NEXT:    s_setpc_b64 s[30:31]
7024;
7025; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
7026; GFX8:       ; %bb.0:
7027; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7028; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x400, v4
7029; GFX8-NEXT:    s_mov_b64 s[6:7], exec
7030; GFX8-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
7031; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
7032; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
7033; GFX8-NEXT:    v_readfirstlane_b32 s10, v2
7034; GFX8-NEXT:    v_readfirstlane_b32 s11, v3
7035; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
7036; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
7037; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
7038; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
7039; GFX8-NEXT:    s_nop 0
7040; GFX8-NEXT:    buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
7041; GFX8-NEXT:    ; implicit-def: $vgpr4
7042; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
7043; GFX8-NEXT:    s_cbranch_execnz .LBB21_1
7044; GFX8-NEXT:  ; %bb.2:
7045; GFX8-NEXT:    s_mov_b64 exec, s[6:7]
7046; GFX8-NEXT:    s_mov_b64 s[6:7], 0
7047; GFX8-NEXT:  .LBB21_3: ; %atomicrmw.start
7048; GFX8-NEXT:    ; =>This Loop Header: Depth=1
7049; GFX8-NEXT:    ; Child Loop BB21_4 Depth 2
7050; GFX8-NEXT:    s_waitcnt vmcnt(0)
7051; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v8
7052; GFX8-NEXT:    v_add_f16_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
7053; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
7054; GFX8-NEXT:    v_add_f16_e32 v6, v8, v5
7055; GFX8-NEXT:    v_or_b32_e32 v7, v6, v4
7056; GFX8-NEXT:    v_mov_b32_e32 v6, v7
7057; GFX8-NEXT:    s_mov_b64 s[12:13], exec
7058; GFX8-NEXT:    v_mov_b32_e32 v7, v8
7059; GFX8-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
7060; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
7061; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
7062; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
7063; GFX8-NEXT:    v_readfirstlane_b32 s10, v2
7064; GFX8-NEXT:    v_readfirstlane_b32 s11, v3
7065; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
7066; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
7067; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
7068; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
7069; GFX8-NEXT:    s_waitcnt vmcnt(0)
7070; GFX8-NEXT:    buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
7071; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
7072; GFX8-NEXT:    s_cbranch_execnz .LBB21_4
7073; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
7074; GFX8-NEXT:    s_mov_b64 exec, s[12:13]
7075; GFX8-NEXT:    s_waitcnt vmcnt(0)
7076; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v8
7077; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
7078; GFX8-NEXT:    v_mov_b32_e32 v8, v6
7079; GFX8-NEXT:    buffer_wbinvl1
7080; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
7081; GFX8-NEXT:    s_cbranch_execnz .LBB21_3
7082; GFX8-NEXT:  ; %bb.6: ; %atomicrmw.end
7083; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
7084; GFX8-NEXT:    v_mov_b32_e32 v0, v6
7085; GFX8-NEXT:    s_setpc_b64 s[30:31]
7086;
7087; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
7088; GFX7:       ; %bb.0:
7089; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7090; GFX7-NEXT:    v_add_i32_e32 v9, vcc, 0x400, v4
7091; GFX7-NEXT:    s_mov_b64 s[6:7], exec
7092; GFX7-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
7093; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
7094; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
7095; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
7096; GFX7-NEXT:    v_readfirstlane_b32 s11, v3
7097; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
7098; GFX7-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
7099; GFX7-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
7100; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
7101; GFX7-NEXT:    buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
7102; GFX7-NEXT:    ; implicit-def: $vgpr4
7103; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
7104; GFX7-NEXT:    s_cbranch_execnz .LBB21_1
7105; GFX7-NEXT:  ; %bb.2:
7106; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
7107; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
7108; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v5
7109; GFX7-NEXT:    s_waitcnt vmcnt(0)
7110; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
7111; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v7
7112; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
7113; GFX7-NEXT:    v_cvt_f32_f16_e32 v10, v6
7114; GFX7-NEXT:    v_cvt_f32_f16_e32 v11, v8
7115; GFX7-NEXT:    s_mov_b64 s[6:7], 0
7116; GFX7-NEXT:  .LBB21_3: ; %atomicrmw.start
7117; GFX7-NEXT:    ; =>This Loop Header: Depth=1
7118; GFX7-NEXT:    ; Child Loop BB21_4 Depth 2
7119; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
7120; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
7121; GFX7-NEXT:    s_mov_b64 s[12:13], exec
7122; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v5
7123; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v4
7124; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
7125; GFX7-NEXT:    v_add_f32_e32 v6, v6, v10
7126; GFX7-NEXT:    v_add_f32_e32 v7, v7, v11
7127; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v6
7128; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
7129; GFX7-NEXT:    v_or_b32_e32 v6, v4, v5
7130; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
7131; GFX7-NEXT:    v_or_b32_e32 v5, v7, v4
7132; GFX7-NEXT:    v_mov_b32_e32 v8, v6
7133; GFX7-NEXT:    v_mov_b32_e32 v7, v5
7134; GFX7-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
7135; GFX7-NEXT:    ; => This Inner Loop Header: Depth=2
7136; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
7137; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
7138; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
7139; GFX7-NEXT:    v_readfirstlane_b32 s11, v3
7140; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
7141; GFX7-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
7142; GFX7-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
7143; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
7144; GFX7-NEXT:    s_waitcnt vmcnt(0)
7145; GFX7-NEXT:    buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc
7146; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
7147; GFX7-NEXT:    s_cbranch_execnz .LBB21_4
7148; GFX7-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
7149; GFX7-NEXT:    s_mov_b64 exec, s[12:13]
7150; GFX7-NEXT:    s_waitcnt vmcnt(0)
7151; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
7152; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v7
7153; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
7154; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
7155; GFX7-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
7156; GFX7-NEXT:    buffer_wbinvl1
7157; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
7158; GFX7-NEXT:    s_cbranch_execnz .LBB21_3
7159; GFX7-NEXT:  ; %bb.6: ; %atomicrmw.end
7160; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
7161; GFX7-NEXT:    v_mov_b32_e32 v0, v4
7162; GFX7-NEXT:    v_mov_b32_e32 v1, v5
7163; GFX7-NEXT:    s_setpc_b64 s[30:31]
7164;
7165; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
7166; GFX6:       ; %bb.0:
7167; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7168; GFX6-NEXT:    v_add_i32_e32 v9, vcc, 0x400, v4
7169; GFX6-NEXT:    s_mov_b64 s[6:7], exec
7170; GFX6-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
7171; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
7172; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
7173; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
7174; GFX6-NEXT:    v_readfirstlane_b32 s11, v3
7175; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
7176; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
7177; GFX6-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
7178; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
7179; GFX6-NEXT:    buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
7180; GFX6-NEXT:    ; implicit-def: $vgpr4
7181; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
7182; GFX6-NEXT:    s_cbranch_execnz .LBB21_1
7183; GFX6-NEXT:  ; %bb.2:
7184; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
7185; GFX6-NEXT:    v_cvt_f16_f32_e32 v6, v6
7186; GFX6-NEXT:    v_cvt_f16_f32_e32 v8, v5
7187; GFX6-NEXT:    s_waitcnt vmcnt(0)
7188; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
7189; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v7
7190; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
7191; GFX6-NEXT:    v_cvt_f32_f16_e32 v10, v6
7192; GFX6-NEXT:    v_cvt_f32_f16_e32 v11, v8
7193; GFX6-NEXT:    s_mov_b64 s[6:7], 0
7194; GFX6-NEXT:  .LBB21_3: ; %atomicrmw.start
7195; GFX6-NEXT:    ; =>This Loop Header: Depth=1
7196; GFX6-NEXT:    ; Child Loop BB21_4 Depth 2
7197; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v5
7198; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
7199; GFX6-NEXT:    s_mov_b64 s[12:13], exec
7200; GFX6-NEXT:    v_cvt_f32_f16_e32 v6, v5
7201; GFX6-NEXT:    s_waitcnt expcnt(0)
7202; GFX6-NEXT:    v_cvt_f32_f16_e32 v7, v4
7203; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
7204; GFX6-NEXT:    v_add_f32_e32 v6, v6, v10
7205; GFX6-NEXT:    v_add_f32_e32 v7, v7, v11
7206; GFX6-NEXT:    v_cvt_f16_f32_e32 v8, v6
7207; GFX6-NEXT:    v_cvt_f16_f32_e32 v7, v7
7208; GFX6-NEXT:    v_or_b32_e32 v6, v4, v5
7209; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
7210; GFX6-NEXT:    v_or_b32_e32 v5, v7, v4
7211; GFX6-NEXT:    v_mov_b32_e32 v8, v6
7212; GFX6-NEXT:    v_mov_b32_e32 v7, v5
7213; GFX6-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
7214; GFX6-NEXT:    ; => This Inner Loop Header: Depth=2
7215; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
7216; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
7217; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
7218; GFX6-NEXT:    v_readfirstlane_b32 s11, v3
7219; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
7220; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
7221; GFX6-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
7222; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
7223; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
7224; GFX6-NEXT:    buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc
7225; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
7226; GFX6-NEXT:    s_cbranch_execnz .LBB21_4
7227; GFX6-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
7228; GFX6-NEXT:    s_mov_b64 exec, s[12:13]
7229; GFX6-NEXT:    s_waitcnt vmcnt(0)
7230; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
7231; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v7
7232; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
7233; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
7234; GFX6-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
7235; GFX6-NEXT:    buffer_wbinvl1
7236; GFX6-NEXT:    s_andn2_b64 exec, exec, s[6:7]
7237; GFX6-NEXT:    s_cbranch_execnz .LBB21_3
7238; GFX6-NEXT:  ; %bb.6: ; %atomicrmw.end
7239; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
7240; GFX6-NEXT:    v_mov_b32_e32 v0, v4
7241; GFX6-NEXT:    v_mov_b32_e32 v1, v5
7242; GFX6-NEXT:    s_waitcnt expcnt(0)
7243; GFX6-NEXT:    s_setpc_b64 s[30:31]
7244  %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256
7245  %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
7246  ret <2 x half> %result
7247}
7248
; Returning <2 x half> atomicrmw fadd through a buffer fat pointer
; (ptr addrspace(7)), agent syncscope, seq_cst, with no metadata attached to
; the atomicrmw itself.
; Per the checks below: the gfx1200 and gfx940 runs select
; buffer_atomic_pk_add_f16 directly; every other run expands to an initial
; buffer load followed by a buffer_atomic_cmpswap retry loop. In the hawaii and
; tahiti runs the two halves are widened with v_cvt_f32_f16, added with
; v_add_f32 and narrowed again with v_cvt_f16_f32 inside that loop.
7249define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrspace(7) inreg %ptr, <2 x half> %val) {
7250; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
7251; GFX12:       ; %bb.0:
7252; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
7253; GFX12-NEXT:    s_wait_expcnt 0x0
7254; GFX12-NEXT:    s_wait_samplecnt 0x0
7255; GFX12-NEXT:    s_wait_bvhcnt 0x0
7256; GFX12-NEXT:    s_wait_kmcnt 0x0
7257; GFX12-NEXT:    v_mov_b32_e32 v1, s16
7258; GFX12-NEXT:    s_wait_storecnt 0x0
7259; GFX12-NEXT:    buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
7260; GFX12-NEXT:    s_wait_loadcnt 0x0
7261; GFX12-NEXT:    global_inv scope:SCOPE_DEV
7262; GFX12-NEXT:    s_setpc_b64 s[30:31]
7263;
7264; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
7265; GFX940:       ; %bb.0:
7266; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7267; GFX940-NEXT:    v_mov_b32_e32 v1, s16
7268; GFX940-NEXT:    buffer_wbl2 sc1
7269; GFX940-NEXT:    buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 sc0
7270; GFX940-NEXT:    s_waitcnt vmcnt(0)
7271; GFX940-NEXT:    buffer_inv sc1
7272; GFX940-NEXT:    s_setpc_b64 s[30:31]
7273;
7274; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
7275; GFX11:       ; %bb.0:
7276; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7277; GFX11-NEXT:    s_add_i32 s4, s16, 0x400
7278; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
7279; GFX11-NEXT:    v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4
7280; GFX11-NEXT:    v_mov_b32_e32 v0, s16
7281; GFX11-NEXT:    s_mov_b32 s4, 0
7282; GFX11-NEXT:    buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
7283; GFX11-NEXT:  .LBB22_1: ; %atomicrmw.start
7284; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
7285; GFX11-NEXT:    s_waitcnt vmcnt(0)
7286; GFX11-NEXT:    v_mov_b32_e32 v5, v0
7287; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
7288; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7289; GFX11-NEXT:    v_pk_add_f16 v4, v5, v2
7290; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
7291; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
7292; GFX11-NEXT:    s_waitcnt vmcnt(0)
7293; GFX11-NEXT:    buffer_gl1_inv
7294; GFX11-NEXT:    buffer_gl0_inv
7295; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
7296; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
7297; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
7298; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
7299; GFX11-NEXT:    s_cbranch_execnz .LBB22_1
7300; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
7301; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
7302; GFX11-NEXT:    s_setpc_b64 s[30:31]
7303;
7304; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
7305; GFX10:       ; %bb.0:
7306; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7307; GFX10-NEXT:    v_mov_b32_e32 v2, v0
7308; GFX10-NEXT:    v_mov_b32_e32 v0, s20
7309; GFX10-NEXT:    s_add_i32 s4, s20, 0x400
7310; GFX10-NEXT:    v_mov_b32_e32 v3, s4
7311; GFX10-NEXT:    s_mov_b32 s4, 0
7312; GFX10-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
7313; GFX10-NEXT:  .LBB22_1: ; %atomicrmw.start
7314; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
7315; GFX10-NEXT:    s_waitcnt vmcnt(0)
7316; GFX10-NEXT:    v_mov_b32_e32 v5, v0
7317; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
7318; GFX10-NEXT:    v_pk_add_f16 v4, v5, v2
7319; GFX10-NEXT:    v_mov_b32_e32 v0, v4
7320; GFX10-NEXT:    v_mov_b32_e32 v1, v5
7321; GFX10-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
7322; GFX10-NEXT:    s_waitcnt vmcnt(0)
7323; GFX10-NEXT:    buffer_gl1_inv
7324; GFX10-NEXT:    buffer_gl0_inv
7325; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
7326; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
7327; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
7328; GFX10-NEXT:    s_cbranch_execnz .LBB22_1
7329; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
7330; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
7331; GFX10-NEXT:    s_setpc_b64 s[30:31]
7332;
7333; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
7334; GFX90A:       ; %bb.0:
7335; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7336; GFX90A-NEXT:    v_mov_b32_e32 v2, v0
7337; GFX90A-NEXT:    v_mov_b32_e32 v0, s20
7338; GFX90A-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
7339; GFX90A-NEXT:    s_add_i32 s6, s20, 0x400
7340; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
7341; GFX90A-NEXT:    v_mov_b32_e32 v3, s6
7342; GFX90A-NEXT:  .LBB22_1: ; %atomicrmw.start
7343; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
7344; GFX90A-NEXT:    s_waitcnt vmcnt(0)
7345; GFX90A-NEXT:    v_mov_b32_e32 v5, v0
7346; GFX90A-NEXT:    v_pk_add_f16 v4, v5, v2
7347; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
7348; GFX90A-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
7349; GFX90A-NEXT:    s_waitcnt vmcnt(0)
7350; GFX90A-NEXT:    buffer_wbinvl1
7351; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
7352; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7353; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7354; GFX90A-NEXT:    s_cbranch_execnz .LBB22_1
7355; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
7356; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
7357; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7358;
7359; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
7360; GFX908:       ; %bb.0:
7361; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7362; GFX908-NEXT:    v_mov_b32_e32 v2, v0
7363; GFX908-NEXT:    v_mov_b32_e32 v0, s20
7364; GFX908-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
7365; GFX908-NEXT:    s_add_i32 s6, s20, 0x400
7366; GFX908-NEXT:    s_mov_b64 s[4:5], 0
7367; GFX908-NEXT:    v_mov_b32_e32 v3, s6
7368; GFX908-NEXT:  .LBB22_1: ; %atomicrmw.start
7369; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
7370; GFX908-NEXT:    s_waitcnt vmcnt(0)
7371; GFX908-NEXT:    v_mov_b32_e32 v5, v0
7372; GFX908-NEXT:    v_pk_add_f16 v4, v5, v2
7373; GFX908-NEXT:    v_mov_b32_e32 v0, v4
7374; GFX908-NEXT:    v_mov_b32_e32 v1, v5
7375; GFX908-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
7376; GFX908-NEXT:    s_waitcnt vmcnt(0)
7377; GFX908-NEXT:    buffer_wbinvl1
7378; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
7379; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7380; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7381; GFX908-NEXT:    s_cbranch_execnz .LBB22_1
7382; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
7383; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
7384; GFX908-NEXT:    s_setpc_b64 s[30:31]
7385;
7386; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
7387; GFX8:       ; %bb.0:
7388; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7389; GFX8-NEXT:    v_mov_b32_e32 v2, v0
7390; GFX8-NEXT:    v_mov_b32_e32 v0, s20
7391; GFX8-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
7392; GFX8-NEXT:    s_add_i32 s6, s20, 0x400
7393; GFX8-NEXT:    s_mov_b64 s[4:5], 0
7394; GFX8-NEXT:    v_mov_b32_e32 v3, s6
7395; GFX8-NEXT:  .LBB22_1: ; %atomicrmw.start
7396; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
7397; GFX8-NEXT:    s_waitcnt vmcnt(0)
7398; GFX8-NEXT:    v_mov_b32_e32 v5, v0
7399; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
7400; GFX8-NEXT:    v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
7401; GFX8-NEXT:    v_add_f16_e32 v1, v5, v2
7402; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
7403; GFX8-NEXT:    v_or_b32_e32 v4, v1, v0
7404; GFX8-NEXT:    v_mov_b32_e32 v0, v4
7405; GFX8-NEXT:    v_mov_b32_e32 v1, v5
7406; GFX8-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
7407; GFX8-NEXT:    s_waitcnt vmcnt(0)
7408; GFX8-NEXT:    buffer_wbinvl1
7409; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
7410; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7411; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7412; GFX8-NEXT:    s_cbranch_execnz .LBB22_1
7413; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
7414; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
7415; GFX8-NEXT:    s_setpc_b64 s[30:31]
7416;
7417; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
7418; GFX7:       ; %bb.0:
7419; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7420; GFX7-NEXT:    v_mov_b32_e32 v2, s20
7421; GFX7-NEXT:    buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
7422; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
7423; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v0
7424; GFX7-NEXT:    s_add_i32 s6, s20, 0x400
7425; GFX7-NEXT:    s_mov_b64 s[4:5], 0
7426; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v1
7427; GFX7-NEXT:    s_waitcnt vmcnt(0)
7428; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
7429; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v3
7430; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
7431; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v4
7432; GFX7-NEXT:    v_mov_b32_e32 v4, s6
7433; GFX7-NEXT:  .LBB22_1: ; %atomicrmw.start
7434; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
7435; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
7436; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
7437; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v1
7438; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v0
7439; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
7440; GFX7-NEXT:    v_add_f32_e32 v5, v5, v2
7441; GFX7-NEXT:    v_add_f32_e32 v6, v6, v3
7442; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
7443; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v6
7444; GFX7-NEXT:    v_or_b32_e32 v6, v0, v1
7445; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
7446; GFX7-NEXT:    v_or_b32_e32 v5, v7, v0
7447; GFX7-NEXT:    v_mov_b32_e32 v8, v6
7448; GFX7-NEXT:    v_mov_b32_e32 v7, v5
7449; GFX7-NEXT:    buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc
7450; GFX7-NEXT:    s_waitcnt vmcnt(0)
7451; GFX7-NEXT:    buffer_wbinvl1
7452; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
7453; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v7
7454; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
7455; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
7456; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7457; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7458; GFX7-NEXT:    s_cbranch_execnz .LBB22_1
7459; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
7460; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
7461; GFX7-NEXT:    s_setpc_b64 s[30:31]
7462;
7463; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
7464; GFX6:       ; %bb.0:
7465; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7466; GFX6-NEXT:    v_mov_b32_e32 v2, s20
7467; GFX6-NEXT:    buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
7468; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
7469; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v0
7470; GFX6-NEXT:    s_add_i32 s6, s20, 0x400
7471; GFX6-NEXT:    s_mov_b64 s[4:5], 0
7472; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v1
7473; GFX6-NEXT:    s_waitcnt vmcnt(0)
7474; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
7475; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v3
7476; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
7477; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v4
7478; GFX6-NEXT:    v_mov_b32_e32 v4, s6
7479; GFX6-NEXT:  .LBB22_1: ; %atomicrmw.start
7480; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
7481; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
7482; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
7483; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v1
7484; GFX6-NEXT:    v_cvt_f32_f16_e32 v6, v0
7485; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
7486; GFX6-NEXT:    v_add_f32_e32 v5, v5, v2
7487; GFX6-NEXT:    v_add_f32_e32 v6, v6, v3
7488; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v5
7489; GFX6-NEXT:    s_waitcnt expcnt(0)
7490; GFX6-NEXT:    v_cvt_f16_f32_e32 v7, v6
7491; GFX6-NEXT:    v_or_b32_e32 v6, v0, v1
7492; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
7493; GFX6-NEXT:    v_or_b32_e32 v5, v7, v0
7494; GFX6-NEXT:    v_mov_b32_e32 v8, v6
7495; GFX6-NEXT:    v_mov_b32_e32 v7, v5
7496; GFX6-NEXT:    buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc
7497; GFX6-NEXT:    s_waitcnt vmcnt(0)
7498; GFX6-NEXT:    buffer_wbinvl1
7499; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
7500; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v7
7501; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
7502; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
7503; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7504; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7505; GFX6-NEXT:    s_cbranch_execnz .LBB22_1
7506; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
7507; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
7508; GFX6-NEXT:    s_waitcnt expcnt(0)
7509; GFX6-NEXT:    s_setpc_b64 s[30:31]
; Index 256 over <2 x half> elements (4 bytes each) is a 1024-byte (0x400)
; displacement, which is the offset value appearing in the checks above.
7510  %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256
7511  %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst
7512  ret <2 x half> %result
7513}
7514
; No-return form of the <2 x half> agent-scope seq_cst atomicrmw fadd on a
; buffer fat pointer (ptr addrspace(7)): the atomicrmw result (%unused) is
; never read and the function returns void.
; Per the checks below: the gfx1200 and gfx940 runs emit
; buffer_atomic_pk_add_f16 with neither a th:TH_ATOMIC_RETURN nor an sc0
; operand; every other run still expands to an initial buffer load followed by
; a "buffer_atomic_cmpswap ... glc" retry loop.
7515define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(7) inreg %ptr, <2 x half> %val) {
7516; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset:
7517; GFX12:       ; %bb.0:
7518; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
7519; GFX12-NEXT:    s_wait_expcnt 0x0
7520; GFX12-NEXT:    s_wait_samplecnt 0x0
7521; GFX12-NEXT:    s_wait_bvhcnt 0x0
7522; GFX12-NEXT:    s_wait_kmcnt 0x0
7523; GFX12-NEXT:    v_mov_b32_e32 v1, s16
7524; GFX12-NEXT:    s_wait_storecnt 0x0
7525; GFX12-NEXT:    buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024
7526; GFX12-NEXT:    s_wait_storecnt 0x0
7527; GFX12-NEXT:    global_inv scope:SCOPE_DEV
7528; GFX12-NEXT:    s_setpc_b64 s[30:31]
7529;
7530; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset:
7531; GFX940:       ; %bb.0:
7532; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7533; GFX940-NEXT:    v_mov_b32_e32 v1, s16
7534; GFX940-NEXT:    buffer_wbl2 sc1
7535; GFX940-NEXT:    buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024
7536; GFX940-NEXT:    s_waitcnt vmcnt(0)
7537; GFX940-NEXT:    buffer_inv sc1
7538; GFX940-NEXT:    s_setpc_b64 s[30:31]
7539;
7540; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset:
7541; GFX11:       ; %bb.0:
7542; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7543; GFX11-NEXT:    v_mov_b32_e32 v1, s16
7544; GFX11-NEXT:    s_add_i32 s4, s16, 0x400
7545; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
7546; GFX11-NEXT:    v_mov_b32_e32 v3, s4
7547; GFX11-NEXT:    s_mov_b32 s4, 0
7548; GFX11-NEXT:    buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024
7549; GFX11-NEXT:  .LBB23_1: ; %atomicrmw.start
7550; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
7551; GFX11-NEXT:    s_waitcnt vmcnt(0)
7552; GFX11-NEXT:    v_pk_add_f16 v1, v2, v0
7553; GFX11-NEXT:    v_mov_b32_e32 v5, v2
7554; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
7555; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
7556; GFX11-NEXT:    v_mov_b32_e32 v4, v1
7557; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
7558; GFX11-NEXT:    s_waitcnt vmcnt(0)
7559; GFX11-NEXT:    buffer_gl1_inv
7560; GFX11-NEXT:    buffer_gl0_inv
7561; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v2
7562; GFX11-NEXT:    v_mov_b32_e32 v2, v4
7563; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
7564; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
7565; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
7566; GFX11-NEXT:    s_cbranch_execnz .LBB23_1
7567; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
7568; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
7569; GFX11-NEXT:    s_setpc_b64 s[30:31]
7570;
7571; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset:
7572; GFX10:       ; %bb.0:
7573; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7574; GFX10-NEXT:    v_mov_b32_e32 v1, s20
7575; GFX10-NEXT:    s_add_i32 s4, s20, 0x400
7576; GFX10-NEXT:    v_mov_b32_e32 v3, s4
7577; GFX10-NEXT:    s_mov_b32 s4, 0
7578; GFX10-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
7579; GFX10-NEXT:  .LBB23_1: ; %atomicrmw.start
7580; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
7581; GFX10-NEXT:    s_waitcnt vmcnt(0)
7582; GFX10-NEXT:    v_pk_add_f16 v1, v2, v0
7583; GFX10-NEXT:    v_mov_b32_e32 v5, v2
7584; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
7585; GFX10-NEXT:    v_mov_b32_e32 v4, v1
7586; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
7587; GFX10-NEXT:    s_waitcnt vmcnt(0)
7588; GFX10-NEXT:    buffer_gl1_inv
7589; GFX10-NEXT:    buffer_gl0_inv
7590; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v2
7591; GFX10-NEXT:    v_mov_b32_e32 v2, v4
7592; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
7593; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
7594; GFX10-NEXT:    s_cbranch_execnz .LBB23_1
7595; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
7596; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
7597; GFX10-NEXT:    s_setpc_b64 s[30:31]
7598;
7599; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset:
7600; GFX90A:       ; %bb.0:
7601; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7602; GFX90A-NEXT:    v_mov_b32_e32 v1, s20
7603; GFX90A-NEXT:    buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024
7604; GFX90A-NEXT:    s_add_i32 s6, s20, 0x400
7605; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
7606; GFX90A-NEXT:    v_mov_b32_e32 v1, s6
7607; GFX90A-NEXT:  .LBB23_1: ; %atomicrmw.start
7608; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
7609; GFX90A-NEXT:    s_waitcnt vmcnt(0)
7610; GFX90A-NEXT:    v_pk_add_f16 v2, v3, v0
7611; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
7612; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc
7613; GFX90A-NEXT:    s_waitcnt vmcnt(0)
7614; GFX90A-NEXT:    buffer_wbinvl1
7615; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
7616; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7617; GFX90A-NEXT:    v_mov_b32_e32 v3, v4
7618; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7619; GFX90A-NEXT:    s_cbranch_execnz .LBB23_1
7620; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
7621; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
7622; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7623;
7624; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset:
7625; GFX908:       ; %bb.0:
7626; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7627; GFX908-NEXT:    v_mov_b32_e32 v1, s20
7628; GFX908-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
7629; GFX908-NEXT:    s_add_i32 s6, s20, 0x400
7630; GFX908-NEXT:    s_mov_b64 s[4:5], 0
7631; GFX908-NEXT:    v_mov_b32_e32 v3, s6
7632; GFX908-NEXT:  .LBB23_1: ; %atomicrmw.start
7633; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
7634; GFX908-NEXT:    s_waitcnt vmcnt(0)
7635; GFX908-NEXT:    v_pk_add_f16 v1, v2, v0
7636; GFX908-NEXT:    v_mov_b32_e32 v5, v2
7637; GFX908-NEXT:    v_mov_b32_e32 v4, v1
7638; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
7639; GFX908-NEXT:    s_waitcnt vmcnt(0)
7640; GFX908-NEXT:    buffer_wbinvl1
7641; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v2
7642; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7643; GFX908-NEXT:    v_mov_b32_e32 v2, v4
7644; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7645; GFX908-NEXT:    s_cbranch_execnz .LBB23_1
7646; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
7647; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
7648; GFX908-NEXT:    s_setpc_b64 s[30:31]
7649;
7650; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset:
7651; GFX8:       ; %bb.0:
7652; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7653; GFX8-NEXT:    v_mov_b32_e32 v1, s20
7654; GFX8-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
7655; GFX8-NEXT:    s_add_i32 s6, s20, 0x400
7656; GFX8-NEXT:    s_mov_b64 s[4:5], 0
7657; GFX8-NEXT:    v_mov_b32_e32 v3, s6
7658; GFX8-NEXT:  .LBB23_1: ; %atomicrmw.start
7659; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
7660; GFX8-NEXT:    s_waitcnt vmcnt(0)
7661; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
7662; GFX8-NEXT:    v_add_f16_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
7663; GFX8-NEXT:    v_add_f16_e32 v4, v2, v0
7664; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
7665; GFX8-NEXT:    v_or_b32_e32 v1, v4, v1
7666; GFX8-NEXT:    v_mov_b32_e32 v5, v2
7667; GFX8-NEXT:    v_mov_b32_e32 v4, v1
7668; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
7669; GFX8-NEXT:    s_waitcnt vmcnt(0)
7670; GFX8-NEXT:    buffer_wbinvl1
7671; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v2
7672; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7673; GFX8-NEXT:    v_mov_b32_e32 v2, v4
7674; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7675; GFX8-NEXT:    s_cbranch_execnz .LBB23_1
7676; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
7677; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
7678; GFX8-NEXT:    s_setpc_b64 s[30:31]
7679;
7680; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset:
7681; GFX7:       ; %bb.0:
7682; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7683; GFX7-NEXT:    v_mov_b32_e32 v2, s20
7684; GFX7-NEXT:    buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
7685; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
7686; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v0
7687; GFX7-NEXT:    s_add_i32 s6, s20, 0x400
7688; GFX7-NEXT:    s_mov_b64 s[4:5], 0
7689; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v1
7690; GFX7-NEXT:    s_waitcnt vmcnt(0)
7691; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
7692; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v2
7693; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v1
7694; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v5
7695; GFX7-NEXT:    v_mov_b32_e32 v2, s6
7696; GFX7-NEXT:  .LBB23_1: ; %atomicrmw.start
7697; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
7698; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
7699; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
7700; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v4
7701; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v3
7702; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
7703; GFX7-NEXT:    v_add_f32_e32 v5, v5, v0
7704; GFX7-NEXT:    v_add_f32_e32 v6, v6, v1
7705; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v5
7706; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
7707; GFX7-NEXT:    v_or_b32_e32 v5, v3, v4
7708; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
7709; GFX7-NEXT:    v_or_b32_e32 v4, v6, v3
7710; GFX7-NEXT:    v_mov_b32_e32 v7, v5
7711; GFX7-NEXT:    v_mov_b32_e32 v6, v4
7712; GFX7-NEXT:    buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc
7713; GFX7-NEXT:    s_waitcnt vmcnt(0)
7714; GFX7-NEXT:    buffer_wbinvl1
7715; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v6
7716; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v6
7717; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
7718; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
7719; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7720; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7721; GFX7-NEXT:    s_cbranch_execnz .LBB23_1
7722; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
7723; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
7724; GFX7-NEXT:    s_setpc_b64 s[30:31]
7725;
7726; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset:
7727; GFX6:       ; %bb.0:
7728; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7729; GFX6-NEXT:    v_mov_b32_e32 v2, s20
7730; GFX6-NEXT:    buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
7731; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
7732; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v0
7733; GFX6-NEXT:    s_add_i32 s6, s20, 0x400
7734; GFX6-NEXT:    s_mov_b64 s[4:5], 0
7735; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v1
7736; GFX6-NEXT:    s_waitcnt vmcnt(0)
7737; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
7738; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v2
7739; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v1
7740; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v5
7741; GFX6-NEXT:    v_mov_b32_e32 v2, s6
7742; GFX6-NEXT:  .LBB23_1: ; %atomicrmw.start
7743; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
7744; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
7745; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
7746; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v4
7747; GFX6-NEXT:    s_waitcnt expcnt(0)
7748; GFX6-NEXT:    v_cvt_f32_f16_e32 v6, v3
7749; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
7750; GFX6-NEXT:    v_add_f32_e32 v5, v5, v0
7751; GFX6-NEXT:    v_add_f32_e32 v6, v6, v1
7752; GFX6-NEXT:    v_cvt_f16_f32_e32 v7, v5
7753; GFX6-NEXT:    v_cvt_f16_f32_e32 v6, v6
7754; GFX6-NEXT:    v_or_b32_e32 v5, v3, v4
7755; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
7756; GFX6-NEXT:    v_or_b32_e32 v4, v6, v3
7757; GFX6-NEXT:    v_mov_b32_e32 v7, v5
7758; GFX6-NEXT:    v_mov_b32_e32 v6, v4
7759; GFX6-NEXT:    buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc
7760; GFX6-NEXT:    s_waitcnt vmcnt(0)
7761; GFX6-NEXT:    buffer_wbinvl1
7762; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 16, v6
7763; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v6
7764; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
7765; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
7766; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7767; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7768; GFX6-NEXT:    s_cbranch_execnz .LBB23_1
7769; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
7770; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
7771; GFX6-NEXT:    s_waitcnt expcnt(0)
7772; GFX6-NEXT:    s_setpc_b64 s[30:31]
; Index 256 over <2 x half> elements (4 bytes each) is a 1024-byte (0x400)
; displacement, which is the offset value appearing in the checks above.
7773  %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256
7774  %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst
7775  ret void
7776}
7777
; Returning <2 x half> atomic fadd through a buffer fat pointer (addrspace 7):
; agent syncscope, seq_cst, tagged with !amdgpu.no.remote.memory. The address
; is %ptr advanced by 256 <2 x half> elements, which shows up in the checks
; below as the 1024-byte (0x400) offset.
; Per the checks below: gfx1200 and gfx940 emit a single
; buffer_atomic_pk_add_f16 (with th:TH_ATOMIC_RETURN / sc0 respectively);
; gfx1100, gfx1010, gfx90a, gfx908, tonga, hawaii and tahiti emit a
; buffer_load followed by a buffer_atomic_cmpswap retry loop, and the
; hawaii/tahiti loops perform the add in f32 via v_cvt conversions.
; The check lines are autogenerated by update_llc_test_checks.py; regenerate
; them instead of editing by hand.
7778define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 {
7779; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory:
7780; GFX12:       ; %bb.0:
7781; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
7782; GFX12-NEXT:    s_wait_expcnt 0x0
7783; GFX12-NEXT:    s_wait_samplecnt 0x0
7784; GFX12-NEXT:    s_wait_bvhcnt 0x0
7785; GFX12-NEXT:    s_wait_kmcnt 0x0
7786; GFX12-NEXT:    v_mov_b32_e32 v1, s16
7787; GFX12-NEXT:    s_wait_storecnt 0x0
7788; GFX12-NEXT:    buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
7789; GFX12-NEXT:    s_wait_loadcnt 0x0
7790; GFX12-NEXT:    global_inv scope:SCOPE_DEV
7791; GFX12-NEXT:    s_setpc_b64 s[30:31]
7792;
7793; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory:
7794; GFX940:       ; %bb.0:
7795; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7796; GFX940-NEXT:    v_mov_b32_e32 v1, s16
7797; GFX940-NEXT:    buffer_wbl2 sc1
7798; GFX940-NEXT:    buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 sc0
7799; GFX940-NEXT:    s_waitcnt vmcnt(0)
7800; GFX940-NEXT:    buffer_inv sc1
7801; GFX940-NEXT:    s_setpc_b64 s[30:31]
7802;
7803; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory:
7804; GFX11:       ; %bb.0:
7805; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7806; GFX11-NEXT:    s_add_i32 s4, s16, 0x400
7807; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
7808; GFX11-NEXT:    v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4
7809; GFX11-NEXT:    v_mov_b32_e32 v0, s16
7810; GFX11-NEXT:    s_mov_b32 s4, 0
7811; GFX11-NEXT:    buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
7812; GFX11-NEXT:  .LBB24_1: ; %atomicrmw.start
7813; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
7814; GFX11-NEXT:    s_waitcnt vmcnt(0)
7815; GFX11-NEXT:    v_mov_b32_e32 v5, v0
7816; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
7817; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7818; GFX11-NEXT:    v_pk_add_f16 v4, v5, v2
7819; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
7820; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
7821; GFX11-NEXT:    s_waitcnt vmcnt(0)
7822; GFX11-NEXT:    buffer_gl1_inv
7823; GFX11-NEXT:    buffer_gl0_inv
7824; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
7825; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
7826; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
7827; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
7828; GFX11-NEXT:    s_cbranch_execnz .LBB24_1
7829; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
7830; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
7831; GFX11-NEXT:    s_setpc_b64 s[30:31]
7832;
7833; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory:
7834; GFX10:       ; %bb.0:
7835; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7836; GFX10-NEXT:    v_mov_b32_e32 v2, v0
7837; GFX10-NEXT:    v_mov_b32_e32 v0, s20
7838; GFX10-NEXT:    s_add_i32 s4, s20, 0x400
7839; GFX10-NEXT:    v_mov_b32_e32 v3, s4
7840; GFX10-NEXT:    s_mov_b32 s4, 0
7841; GFX10-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
7842; GFX10-NEXT:  .LBB24_1: ; %atomicrmw.start
7843; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
7844; GFX10-NEXT:    s_waitcnt vmcnt(0)
7845; GFX10-NEXT:    v_mov_b32_e32 v5, v0
7846; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
7847; GFX10-NEXT:    v_pk_add_f16 v4, v5, v2
7848; GFX10-NEXT:    v_mov_b32_e32 v0, v4
7849; GFX10-NEXT:    v_mov_b32_e32 v1, v5
7850; GFX10-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
7851; GFX10-NEXT:    s_waitcnt vmcnt(0)
7852; GFX10-NEXT:    buffer_gl1_inv
7853; GFX10-NEXT:    buffer_gl0_inv
7854; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
7855; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
7856; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
7857; GFX10-NEXT:    s_cbranch_execnz .LBB24_1
7858; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
7859; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
7860; GFX10-NEXT:    s_setpc_b64 s[30:31]
7861;
7862; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory:
7863; GFX90A:       ; %bb.0:
7864; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7865; GFX90A-NEXT:    v_mov_b32_e32 v2, v0
7866; GFX90A-NEXT:    v_mov_b32_e32 v0, s20
7867; GFX90A-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
7868; GFX90A-NEXT:    s_add_i32 s6, s20, 0x400
7869; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
7870; GFX90A-NEXT:    v_mov_b32_e32 v3, s6
7871; GFX90A-NEXT:  .LBB24_1: ; %atomicrmw.start
7872; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
7873; GFX90A-NEXT:    s_waitcnt vmcnt(0)
7874; GFX90A-NEXT:    v_mov_b32_e32 v5, v0
7875; GFX90A-NEXT:    v_pk_add_f16 v4, v5, v2
7876; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
7877; GFX90A-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
7878; GFX90A-NEXT:    s_waitcnt vmcnt(0)
7879; GFX90A-NEXT:    buffer_wbinvl1
7880; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
7881; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7882; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7883; GFX90A-NEXT:    s_cbranch_execnz .LBB24_1
7884; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
7885; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
7886; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7887;
7888; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory:
7889; GFX908:       ; %bb.0:
7890; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7891; GFX908-NEXT:    v_mov_b32_e32 v2, v0
7892; GFX908-NEXT:    v_mov_b32_e32 v0, s20
7893; GFX908-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
7894; GFX908-NEXT:    s_add_i32 s6, s20, 0x400
7895; GFX908-NEXT:    s_mov_b64 s[4:5], 0
7896; GFX908-NEXT:    v_mov_b32_e32 v3, s6
7897; GFX908-NEXT:  .LBB24_1: ; %atomicrmw.start
7898; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
7899; GFX908-NEXT:    s_waitcnt vmcnt(0)
7900; GFX908-NEXT:    v_mov_b32_e32 v5, v0
7901; GFX908-NEXT:    v_pk_add_f16 v4, v5, v2
7902; GFX908-NEXT:    v_mov_b32_e32 v0, v4
7903; GFX908-NEXT:    v_mov_b32_e32 v1, v5
7904; GFX908-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
7905; GFX908-NEXT:    s_waitcnt vmcnt(0)
7906; GFX908-NEXT:    buffer_wbinvl1
7907; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
7908; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7909; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7910; GFX908-NEXT:    s_cbranch_execnz .LBB24_1
7911; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
7912; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
7913; GFX908-NEXT:    s_setpc_b64 s[30:31]
7914;
7915; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory:
7916; GFX8:       ; %bb.0:
7917; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7918; GFX8-NEXT:    v_mov_b32_e32 v2, v0
7919; GFX8-NEXT:    v_mov_b32_e32 v0, s20
7920; GFX8-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
7921; GFX8-NEXT:    s_add_i32 s6, s20, 0x400
7922; GFX8-NEXT:    s_mov_b64 s[4:5], 0
7923; GFX8-NEXT:    v_mov_b32_e32 v3, s6
7924; GFX8-NEXT:  .LBB24_1: ; %atomicrmw.start
7925; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
7926; GFX8-NEXT:    s_waitcnt vmcnt(0)
7927; GFX8-NEXT:    v_mov_b32_e32 v5, v0
7928; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
7929; GFX8-NEXT:    v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
7930; GFX8-NEXT:    v_add_f16_e32 v1, v5, v2
7931; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
7932; GFX8-NEXT:    v_or_b32_e32 v4, v1, v0
7933; GFX8-NEXT:    v_mov_b32_e32 v0, v4
7934; GFX8-NEXT:    v_mov_b32_e32 v1, v5
7935; GFX8-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
7936; GFX8-NEXT:    s_waitcnt vmcnt(0)
7937; GFX8-NEXT:    buffer_wbinvl1
7938; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
7939; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7940; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7941; GFX8-NEXT:    s_cbranch_execnz .LBB24_1
7942; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
7943; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
7944; GFX8-NEXT:    s_setpc_b64 s[30:31]
7945;
7946; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory:
7947; GFX7:       ; %bb.0:
7948; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7949; GFX7-NEXT:    v_mov_b32_e32 v2, s20
7950; GFX7-NEXT:    buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
7951; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
7952; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v0
7953; GFX7-NEXT:    s_add_i32 s6, s20, 0x400
7954; GFX7-NEXT:    s_mov_b64 s[4:5], 0
7955; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v1
7956; GFX7-NEXT:    s_waitcnt vmcnt(0)
7957; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
7958; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v3
7959; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
7960; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v4
7961; GFX7-NEXT:    v_mov_b32_e32 v4, s6
7962; GFX7-NEXT:  .LBB24_1: ; %atomicrmw.start
7963; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
7964; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
7965; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
7966; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v1
7967; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v0
7968; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
7969; GFX7-NEXT:    v_add_f32_e32 v5, v5, v2
7970; GFX7-NEXT:    v_add_f32_e32 v6, v6, v3
7971; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
7972; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v6
7973; GFX7-NEXT:    v_or_b32_e32 v6, v0, v1
7974; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
7975; GFX7-NEXT:    v_or_b32_e32 v5, v7, v0
7976; GFX7-NEXT:    v_mov_b32_e32 v8, v6
7977; GFX7-NEXT:    v_mov_b32_e32 v7, v5
7978; GFX7-NEXT:    buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc
7979; GFX7-NEXT:    s_waitcnt vmcnt(0)
7980; GFX7-NEXT:    buffer_wbinvl1
7981; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
7982; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v7
7983; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
7984; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
7985; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7986; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7987; GFX7-NEXT:    s_cbranch_execnz .LBB24_1
7988; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
7989; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
7990; GFX7-NEXT:    s_setpc_b64 s[30:31]
7991;
7992; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory:
7993; GFX6:       ; %bb.0:
7994; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7995; GFX6-NEXT:    v_mov_b32_e32 v2, s20
7996; GFX6-NEXT:    buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
7997; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
7998; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v0
7999; GFX6-NEXT:    s_add_i32 s6, s20, 0x400
8000; GFX6-NEXT:    s_mov_b64 s[4:5], 0
8001; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v1
8002; GFX6-NEXT:    s_waitcnt vmcnt(0)
8003; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
8004; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v3
8005; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
8006; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v4
8007; GFX6-NEXT:    v_mov_b32_e32 v4, s6
8008; GFX6-NEXT:  .LBB24_1: ; %atomicrmw.start
8009; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
8010; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
8011; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
8012; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v1
8013; GFX6-NEXT:    v_cvt_f32_f16_e32 v6, v0
8014; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
8015; GFX6-NEXT:    v_add_f32_e32 v5, v5, v2
8016; GFX6-NEXT:    v_add_f32_e32 v6, v6, v3
8017; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v5
8018; GFX6-NEXT:    s_waitcnt expcnt(0)
8019; GFX6-NEXT:    v_cvt_f16_f32_e32 v7, v6
8020; GFX6-NEXT:    v_or_b32_e32 v6, v0, v1
8021; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
8022; GFX6-NEXT:    v_or_b32_e32 v5, v7, v0
8023; GFX6-NEXT:    v_mov_b32_e32 v8, v6
8024; GFX6-NEXT:    v_mov_b32_e32 v7, v5
8025; GFX6-NEXT:    buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc
8026; GFX6-NEXT:    s_waitcnt vmcnt(0)
8027; GFX6-NEXT:    buffer_wbinvl1
8028; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
8029; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v7
8030; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
8031; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
8032; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8033; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8034; GFX6-NEXT:    s_cbranch_execnz .LBB24_1
8035; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
8036; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
8037; GFX6-NEXT:    s_waitcnt expcnt(0)
8038; GFX6-NEXT:    s_setpc_b64 s[30:31]
; IR under test: index 256 of <2 x half> (4 bytes each) is 1024 bytes past
; %ptr; the value previously in memory is returned to the caller.
8039  %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256
8040  %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
8041  ret <2 x half> %result
8042}
8043
; No-return variant of the <2 x half> atomic fadd through a buffer fat pointer
; (addrspace 7): agent syncscope, seq_cst, tagged with
; !amdgpu.no.remote.memory. The atomicrmw result is left unused and the
; function returns void. The address is %ptr advanced by 256 <2 x half>
; elements, which shows up in the checks below as the 1024-byte (0x400) offset.
; Per the checks below: gfx1200 and gfx940 emit a single
; buffer_atomic_pk_add_f16 without the th:TH_ATOMIC_RETURN / sc0 modifier;
; gfx1100, gfx1010, gfx90a, gfx908, tonga, hawaii and tahiti emit a
; buffer_load followed by a buffer_atomic_cmpswap retry loop, and the
; hawaii/tahiti loops perform the add in f32 via v_cvt conversions.
; The check lines are autogenerated by update_llc_test_checks.py; regenerate
; them instead of editing by hand.
8044define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 {
8045; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory:
8046; GFX12:       ; %bb.0:
8047; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
8048; GFX12-NEXT:    s_wait_expcnt 0x0
8049; GFX12-NEXT:    s_wait_samplecnt 0x0
8050; GFX12-NEXT:    s_wait_bvhcnt 0x0
8051; GFX12-NEXT:    s_wait_kmcnt 0x0
8052; GFX12-NEXT:    v_mov_b32_e32 v1, s16
8053; GFX12-NEXT:    s_wait_storecnt 0x0
8054; GFX12-NEXT:    buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024
8055; GFX12-NEXT:    s_wait_storecnt 0x0
8056; GFX12-NEXT:    global_inv scope:SCOPE_DEV
8057; GFX12-NEXT:    s_setpc_b64 s[30:31]
8058;
8059; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory:
8060; GFX940:       ; %bb.0:
8061; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8062; GFX940-NEXT:    v_mov_b32_e32 v1, s16
8063; GFX940-NEXT:    buffer_wbl2 sc1
8064; GFX940-NEXT:    buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024
8065; GFX940-NEXT:    s_waitcnt vmcnt(0)
8066; GFX940-NEXT:    buffer_inv sc1
8067; GFX940-NEXT:    s_setpc_b64 s[30:31]
8068;
8069; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory:
8070; GFX11:       ; %bb.0:
8071; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8072; GFX11-NEXT:    v_mov_b32_e32 v1, s16
8073; GFX11-NEXT:    s_add_i32 s4, s16, 0x400
8074; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
8075; GFX11-NEXT:    v_mov_b32_e32 v3, s4
8076; GFX11-NEXT:    s_mov_b32 s4, 0
8077; GFX11-NEXT:    buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024
8078; GFX11-NEXT:  .LBB25_1: ; %atomicrmw.start
8079; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
8080; GFX11-NEXT:    s_waitcnt vmcnt(0)
8081; GFX11-NEXT:    v_pk_add_f16 v1, v2, v0
8082; GFX11-NEXT:    v_mov_b32_e32 v5, v2
8083; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
8084; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
8085; GFX11-NEXT:    v_mov_b32_e32 v4, v1
8086; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
8087; GFX11-NEXT:    s_waitcnt vmcnt(0)
8088; GFX11-NEXT:    buffer_gl1_inv
8089; GFX11-NEXT:    buffer_gl0_inv
8090; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v2
8091; GFX11-NEXT:    v_mov_b32_e32 v2, v4
8092; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
8093; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
8094; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
8095; GFX11-NEXT:    s_cbranch_execnz .LBB25_1
8096; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
8097; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
8098; GFX11-NEXT:    s_setpc_b64 s[30:31]
8099;
8100; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory:
8101; GFX10:       ; %bb.0:
8102; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8103; GFX10-NEXT:    v_mov_b32_e32 v1, s20
8104; GFX10-NEXT:    s_add_i32 s4, s20, 0x400
8105; GFX10-NEXT:    v_mov_b32_e32 v3, s4
8106; GFX10-NEXT:    s_mov_b32 s4, 0
8107; GFX10-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
8108; GFX10-NEXT:  .LBB25_1: ; %atomicrmw.start
8109; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
8110; GFX10-NEXT:    s_waitcnt vmcnt(0)
8111; GFX10-NEXT:    v_pk_add_f16 v1, v2, v0
8112; GFX10-NEXT:    v_mov_b32_e32 v5, v2
8113; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
8114; GFX10-NEXT:    v_mov_b32_e32 v4, v1
8115; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
8116; GFX10-NEXT:    s_waitcnt vmcnt(0)
8117; GFX10-NEXT:    buffer_gl1_inv
8118; GFX10-NEXT:    buffer_gl0_inv
8119; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v2
8120; GFX10-NEXT:    v_mov_b32_e32 v2, v4
8121; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
8122; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
8123; GFX10-NEXT:    s_cbranch_execnz .LBB25_1
8124; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
8125; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
8126; GFX10-NEXT:    s_setpc_b64 s[30:31]
8127;
8128; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory:
8129; GFX90A:       ; %bb.0:
8130; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8131; GFX90A-NEXT:    v_mov_b32_e32 v1, s20
8132; GFX90A-NEXT:    buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024
8133; GFX90A-NEXT:    s_add_i32 s6, s20, 0x400
8134; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
8135; GFX90A-NEXT:    v_mov_b32_e32 v1, s6
8136; GFX90A-NEXT:  .LBB25_1: ; %atomicrmw.start
8137; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
8138; GFX90A-NEXT:    s_waitcnt vmcnt(0)
8139; GFX90A-NEXT:    v_pk_add_f16 v2, v3, v0
8140; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
8141; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc
8142; GFX90A-NEXT:    s_waitcnt vmcnt(0)
8143; GFX90A-NEXT:    buffer_wbinvl1
8144; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
8145; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8146; GFX90A-NEXT:    v_mov_b32_e32 v3, v4
8147; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8148; GFX90A-NEXT:    s_cbranch_execnz .LBB25_1
8149; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
8150; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
8151; GFX90A-NEXT:    s_setpc_b64 s[30:31]
8152;
8153; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory:
8154; GFX908:       ; %bb.0:
8155; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8156; GFX908-NEXT:    v_mov_b32_e32 v1, s20
8157; GFX908-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
8158; GFX908-NEXT:    s_add_i32 s6, s20, 0x400
8159; GFX908-NEXT:    s_mov_b64 s[4:5], 0
8160; GFX908-NEXT:    v_mov_b32_e32 v3, s6
8161; GFX908-NEXT:  .LBB25_1: ; %atomicrmw.start
8162; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
8163; GFX908-NEXT:    s_waitcnt vmcnt(0)
8164; GFX908-NEXT:    v_pk_add_f16 v1, v2, v0
8165; GFX908-NEXT:    v_mov_b32_e32 v5, v2
8166; GFX908-NEXT:    v_mov_b32_e32 v4, v1
8167; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
8168; GFX908-NEXT:    s_waitcnt vmcnt(0)
8169; GFX908-NEXT:    buffer_wbinvl1
8170; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v2
8171; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8172; GFX908-NEXT:    v_mov_b32_e32 v2, v4
8173; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8174; GFX908-NEXT:    s_cbranch_execnz .LBB25_1
8175; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
8176; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
8177; GFX908-NEXT:    s_setpc_b64 s[30:31]
8178;
8179; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory:
8180; GFX8:       ; %bb.0:
8181; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8182; GFX8-NEXT:    v_mov_b32_e32 v1, s20
8183; GFX8-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
8184; GFX8-NEXT:    s_add_i32 s6, s20, 0x400
8185; GFX8-NEXT:    s_mov_b64 s[4:5], 0
8186; GFX8-NEXT:    v_mov_b32_e32 v3, s6
8187; GFX8-NEXT:  .LBB25_1: ; %atomicrmw.start
8188; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
8189; GFX8-NEXT:    s_waitcnt vmcnt(0)
8190; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
8191; GFX8-NEXT:    v_add_f16_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
8192; GFX8-NEXT:    v_add_f16_e32 v4, v2, v0
8193; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
8194; GFX8-NEXT:    v_or_b32_e32 v1, v4, v1
8195; GFX8-NEXT:    v_mov_b32_e32 v5, v2
8196; GFX8-NEXT:    v_mov_b32_e32 v4, v1
8197; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
8198; GFX8-NEXT:    s_waitcnt vmcnt(0)
8199; GFX8-NEXT:    buffer_wbinvl1
8200; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v2
8201; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8202; GFX8-NEXT:    v_mov_b32_e32 v2, v4
8203; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8204; GFX8-NEXT:    s_cbranch_execnz .LBB25_1
8205; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
8206; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
8207; GFX8-NEXT:    s_setpc_b64 s[30:31]
8208;
8209; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory:
8210; GFX7:       ; %bb.0:
8211; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8212; GFX7-NEXT:    v_mov_b32_e32 v2, s20
8213; GFX7-NEXT:    buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
8214; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
8215; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v0
8216; GFX7-NEXT:    s_add_i32 s6, s20, 0x400
8217; GFX7-NEXT:    s_mov_b64 s[4:5], 0
8218; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v1
8219; GFX7-NEXT:    s_waitcnt vmcnt(0)
8220; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
8221; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v2
8222; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v1
8223; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v5
8224; GFX7-NEXT:    v_mov_b32_e32 v2, s6
8225; GFX7-NEXT:  .LBB25_1: ; %atomicrmw.start
8226; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
8227; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
8228; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
8229; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v4
8230; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v3
8231; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
8232; GFX7-NEXT:    v_add_f32_e32 v5, v5, v0
8233; GFX7-NEXT:    v_add_f32_e32 v6, v6, v1
8234; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v5
8235; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
8236; GFX7-NEXT:    v_or_b32_e32 v5, v3, v4
8237; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
8238; GFX7-NEXT:    v_or_b32_e32 v4, v6, v3
8239; GFX7-NEXT:    v_mov_b32_e32 v7, v5
8240; GFX7-NEXT:    v_mov_b32_e32 v6, v4
8241; GFX7-NEXT:    buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc
8242; GFX7-NEXT:    s_waitcnt vmcnt(0)
8243; GFX7-NEXT:    buffer_wbinvl1
8244; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v6
8245; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v6
8246; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
8247; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
8248; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8249; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8250; GFX7-NEXT:    s_cbranch_execnz .LBB25_1
8251; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
8252; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
8253; GFX7-NEXT:    s_setpc_b64 s[30:31]
8254;
8255; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory:
8256; GFX6:       ; %bb.0:
8257; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8258; GFX6-NEXT:    v_mov_b32_e32 v2, s20
8259; GFX6-NEXT:    buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
8260; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
8261; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v0
8262; GFX6-NEXT:    s_add_i32 s6, s20, 0x400
8263; GFX6-NEXT:    s_mov_b64 s[4:5], 0
8264; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v1
8265; GFX6-NEXT:    s_waitcnt vmcnt(0)
8266; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
8267; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v2
8268; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v1
8269; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v5
8270; GFX6-NEXT:    v_mov_b32_e32 v2, s6
8271; GFX6-NEXT:  .LBB25_1: ; %atomicrmw.start
8272; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
8273; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
8274; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
8275; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v4
8276; GFX6-NEXT:    s_waitcnt expcnt(0)
8277; GFX6-NEXT:    v_cvt_f32_f16_e32 v6, v3
8278; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
8279; GFX6-NEXT:    v_add_f32_e32 v5, v5, v0
8280; GFX6-NEXT:    v_add_f32_e32 v6, v6, v1
8281; GFX6-NEXT:    v_cvt_f16_f32_e32 v7, v5
8282; GFX6-NEXT:    v_cvt_f16_f32_e32 v6, v6
8283; GFX6-NEXT:    v_or_b32_e32 v5, v3, v4
8284; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
8285; GFX6-NEXT:    v_or_b32_e32 v4, v6, v3
8286; GFX6-NEXT:    v_mov_b32_e32 v7, v5
8287; GFX6-NEXT:    v_mov_b32_e32 v6, v4
8288; GFX6-NEXT:    buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc
8289; GFX6-NEXT:    s_waitcnt vmcnt(0)
8290; GFX6-NEXT:    buffer_wbinvl1
8291; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 16, v6
8292; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v6
8293; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
8294; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
8295; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8296; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8297; GFX6-NEXT:    s_cbranch_execnz .LBB25_1
8298; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
8299; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
8300; GFX6-NEXT:    s_waitcnt expcnt(0)
8301; GFX6-NEXT:    s_setpc_b64 s[30:31]
; IR under test: index 256 of <2 x half> (4 bytes each) is 1024 bytes past
; %ptr; the atomicrmw result is intentionally left unused.
8302  %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256
8303  %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
8304  ret void
8305}
8306
8307; --------------------------------------------------------------------
8308; <2 x bfloat>
8309; --------------------------------------------------------------------
8310
8311define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 {
8312; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
8313; GFX12:       ; %bb.0:
8314; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
8315; GFX12-NEXT:    s_wait_expcnt 0x0
8316; GFX12-NEXT:    s_wait_samplecnt 0x0
8317; GFX12-NEXT:    s_wait_bvhcnt 0x0
8318; GFX12-NEXT:    s_wait_kmcnt 0x0
8319; GFX12-NEXT:    v_mov_b32_e32 v1, s16
8320; GFX12-NEXT:    s_wait_storecnt 0x0
8321; GFX12-NEXT:    buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
8322; GFX12-NEXT:    s_wait_loadcnt 0x0
8323; GFX12-NEXT:    global_inv scope:SCOPE_DEV
8324; GFX12-NEXT:    s_setpc_b64 s[30:31]
8325;
8326; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
8327; GFX940:       ; %bb.0:
8328; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8329; GFX940-NEXT:    v_mov_b32_e32 v1, v0
8330; GFX940-NEXT:    v_mov_b32_e32 v0, s16
8331; GFX940-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
8332; GFX940-NEXT:    s_add_i32 s4, s16, 0x400
8333; GFX940-NEXT:    s_mov_b64 s[6:7], 0
8334; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
8335; GFX940-NEXT:    s_movk_i32 s8, 0x7fff
8336; GFX940-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
8337; GFX940-NEXT:    s_mov_b32 s9, 0x7060302
8338; GFX940-NEXT:    v_mov_b32_e32 v4, s4
8339; GFX940-NEXT:  .LBB26_1: ; %atomicrmw.start
8340; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
8341; GFX940-NEXT:    s_waitcnt vmcnt(0)
8342; GFX940-NEXT:    v_mov_b32_e32 v7, v0
8343; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 16, v7
8344; GFX940-NEXT:    v_and_b32_e32 v1, 0xffff0000, v7
8345; GFX940-NEXT:    v_add_f32_e32 v0, v0, v2
8346; GFX940-NEXT:    v_add_f32_e32 v1, v1, v3
8347; GFX940-NEXT:    v_bfe_u32 v5, v0, 16, 1
8348; GFX940-NEXT:    v_bfe_u32 v8, v1, 16, 1
8349; GFX940-NEXT:    v_or_b32_e32 v6, 0x400000, v0
8350; GFX940-NEXT:    v_or_b32_e32 v9, 0x400000, v1
8351; GFX940-NEXT:    v_add3_u32 v5, v5, v0, s8
8352; GFX940-NEXT:    v_add3_u32 v8, v8, v1, s8
8353; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
8354; GFX940-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
8355; GFX940-NEXT:    buffer_wbl2 sc1
8356; GFX940-NEXT:    v_cndmask_b32_e32 v1, v8, v9, vcc
8357; GFX940-NEXT:    v_cndmask_b32_e64 v0, v5, v6, s[4:5]
8358; GFX940-NEXT:    v_perm_b32 v6, v1, v0, s9
8359; GFX940-NEXT:    v_mov_b64_e32 v[0:1], v[6:7]
8360; GFX940-NEXT:    buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0
8361; GFX940-NEXT:    s_waitcnt vmcnt(0)
8362; GFX940-NEXT:    buffer_inv sc1
8363; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v7
8364; GFX940-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
8365; GFX940-NEXT:    s_andn2_b64 exec, exec, s[6:7]
8366; GFX940-NEXT:    s_cbranch_execnz .LBB26_1
8367; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
8368; GFX940-NEXT:    s_or_b64 exec, exec, s[6:7]
8369; GFX940-NEXT:    s_setpc_b64 s[30:31]
8370;
8371; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
8372; GFX11:       ; %bb.0:
8373; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8374; GFX11-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
8375; GFX11-NEXT:    s_add_i32 s4, s16, 0x400
8376; GFX11-NEXT:    s_mov_b32 s5, 0
8377; GFX11-NEXT:    v_mov_b32_e32 v4, s4
8378; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
8379; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
8380; GFX11-NEXT:    buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
8381; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
8382; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
8383; GFX11-NEXT:    .p2align 6
8384; GFX11-NEXT:  .LBB26_1: ; %atomicrmw.start
8385; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
8386; GFX11-NEXT:    s_waitcnt vmcnt(0)
8387; GFX11-NEXT:    v_mov_b32_e32 v6, v0
8388; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
8389; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8390; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v6
8391; GFX11-NEXT:    v_add_f32_e32 v1, v1, v3
8392; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
8393; GFX11-NEXT:    v_bfe_u32 v7, v1, 16, 1
8394; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v1
8395; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
8396; GFX11-NEXT:    v_add3_u32 v7, v7, v1, 0x7fff
8397; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8398; GFX11-NEXT:    v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6
8399; GFX11-NEXT:    v_add_f32_e32 v0, v0, v2
8400; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
8401; GFX11-NEXT:    v_bfe_u32 v5, v0, 16, 1
8402; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v0
8403; GFX11-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
8404; GFX11-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
8405; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8406; GFX11-NEXT:    v_cndmask_b32_e64 v0, v5, v8, s4
8407; GFX11-NEXT:    v_perm_b32 v5, v1, v0, 0x7060302
8408; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
8409; GFX11-NEXT:    v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
8410; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
8411; GFX11-NEXT:    s_waitcnt vmcnt(0)
8412; GFX11-NEXT:    buffer_gl1_inv
8413; GFX11-NEXT:    buffer_gl0_inv
8414; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
8415; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
8416; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
8417; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
8418; GFX11-NEXT:    s_cbranch_execnz .LBB26_1
8419; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
8420; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
8421; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s5
8422; GFX11-NEXT:    s_setpc_b64 s[30:31]
8423;
8424; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
8425; GFX10:       ; %bb.0:
8426; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8427; GFX10-NEXT:    v_mov_b32_e32 v1, v0
8428; GFX10-NEXT:    v_mov_b32_e32 v0, s20
8429; GFX10-NEXT:    s_add_i32 s4, s20, 0x400
8430; GFX10-NEXT:    s_mov_b32 s5, 0
8431; GFX10-NEXT:    v_mov_b32_e32 v4, s4
8432; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
8433; GFX10-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
8434; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
8435; GFX10-NEXT:  .LBB26_1: ; %atomicrmw.start
8436; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
8437; GFX10-NEXT:    s_waitcnt vmcnt(0)
8438; GFX10-NEXT:    v_mov_b32_e32 v6, v0
8439; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
8440; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
8441; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v6
8442; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
8443; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
8444; GFX10-NEXT:    v_bfe_u32 v5, v0, 16, 1
8445; GFX10-NEXT:    v_bfe_u32 v7, v1, 16, 1
8446; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
8447; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v1
8448; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
8449; GFX10-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
8450; GFX10-NEXT:    v_add3_u32 v7, v7, v1, 0x7fff
8451; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
8452; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v9, vcc_lo
8453; GFX10-NEXT:    v_cndmask_b32_e64 v0, v5, v8, s4
8454; GFX10-NEXT:    v_perm_b32 v5, v1, v0, 0x7060302
8455; GFX10-NEXT:    v_mov_b32_e32 v0, v5
8456; GFX10-NEXT:    v_mov_b32_e32 v1, v6
8457; GFX10-NEXT:    buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
8458; GFX10-NEXT:    s_waitcnt vmcnt(0)
8459; GFX10-NEXT:    buffer_gl1_inv
8460; GFX10-NEXT:    buffer_gl0_inv
8461; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
8462; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
8463; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
8464; GFX10-NEXT:    s_cbranch_execnz .LBB26_1
8465; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
8466; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
8467; GFX10-NEXT:    s_setpc_b64 s[30:31]
8468;
8469; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
8470; GFX90A:       ; %bb.0:
8471; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8472; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
8473; GFX90A-NEXT:    v_mov_b32_e32 v0, s20
8474; GFX90A-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
8475; GFX90A-NEXT:    s_add_i32 s4, s20, 0x400
8476; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
8477; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
8478; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
8479; GFX90A-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
8480; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
8481; GFX90A-NEXT:    v_mov_b32_e32 v4, s4
8482; GFX90A-NEXT:  .LBB26_1: ; %atomicrmw.start
8483; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
8484; GFX90A-NEXT:    s_waitcnt vmcnt(0)
8485; GFX90A-NEXT:    v_mov_b32_e32 v7, v0
8486; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v7
8487; GFX90A-NEXT:    v_and_b32_e32 v1, 0xffff0000, v7
8488; GFX90A-NEXT:    v_add_f32_e32 v0, v0, v2
8489; GFX90A-NEXT:    v_add_f32_e32 v1, v1, v3
8490; GFX90A-NEXT:    v_bfe_u32 v5, v0, 16, 1
8491; GFX90A-NEXT:    v_bfe_u32 v8, v1, 16, 1
8492; GFX90A-NEXT:    v_or_b32_e32 v6, 0x400000, v0
8493; GFX90A-NEXT:    v_or_b32_e32 v9, 0x400000, v1
8494; GFX90A-NEXT:    v_add3_u32 v5, v5, v0, s8
8495; GFX90A-NEXT:    v_add3_u32 v8, v8, v1, s8
8496; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
8497; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
8498; GFX90A-NEXT:    v_cndmask_b32_e64 v0, v5, v6, s[4:5]
8499; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v8, v9, vcc
8500; GFX90A-NEXT:    v_perm_b32 v6, v1, v0, s9
8501; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1]
8502; GFX90A-NEXT:    buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
8503; GFX90A-NEXT:    s_waitcnt vmcnt(0)
8504; GFX90A-NEXT:    buffer_wbinvl1
8505; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v7
8506; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
8507; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
8508; GFX90A-NEXT:    s_cbranch_execnz .LBB26_1
8509; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
8510; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
8511; GFX90A-NEXT:    s_setpc_b64 s[30:31]
8512;
8513; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
8514; GFX908:       ; %bb.0:
8515; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8516; GFX908-NEXT:    v_mov_b32_e32 v1, v0
8517; GFX908-NEXT:    v_mov_b32_e32 v0, s20
8518; GFX908-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
8519; GFX908-NEXT:    s_add_i32 s4, s20, 0x400
8520; GFX908-NEXT:    s_mov_b64 s[6:7], 0
8521; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
8522; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
8523; GFX908-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
8524; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
8525; GFX908-NEXT:    v_mov_b32_e32 v4, s4
8526; GFX908-NEXT:  .LBB26_1: ; %atomicrmw.start
8527; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
8528; GFX908-NEXT:    s_waitcnt vmcnt(0)
8529; GFX908-NEXT:    v_mov_b32_e32 v6, v0
8530; GFX908-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
8531; GFX908-NEXT:    v_and_b32_e32 v1, 0xffff0000, v6
8532; GFX908-NEXT:    v_add_f32_e32 v0, v0, v2
8533; GFX908-NEXT:    v_add_f32_e32 v1, v1, v3
8534; GFX908-NEXT:    v_bfe_u32 v5, v0, 16, 1
8535; GFX908-NEXT:    v_bfe_u32 v8, v1, 16, 1
8536; GFX908-NEXT:    v_or_b32_e32 v7, 0x400000, v0
8537; GFX908-NEXT:    v_or_b32_e32 v9, 0x400000, v1
8538; GFX908-NEXT:    v_add3_u32 v5, v5, v0, s8
8539; GFX908-NEXT:    v_add3_u32 v8, v8, v1, s8
8540; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
8541; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
8542; GFX908-NEXT:    v_cndmask_b32_e64 v0, v5, v7, s[4:5]
8543; GFX908-NEXT:    v_cndmask_b32_e32 v1, v8, v9, vcc
8544; GFX908-NEXT:    v_perm_b32 v5, v1, v0, s9
8545; GFX908-NEXT:    v_mov_b32_e32 v0, v5
8546; GFX908-NEXT:    v_mov_b32_e32 v1, v6
8547; GFX908-NEXT:    buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
8548; GFX908-NEXT:    s_waitcnt vmcnt(0)
8549; GFX908-NEXT:    buffer_wbinvl1
8550; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
8551; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
8552; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
8553; GFX908-NEXT:    s_cbranch_execnz .LBB26_1
8554; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
8555; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
8556; GFX908-NEXT:    s_setpc_b64 s[30:31]
8557;
8558; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
8559; GFX8:       ; %bb.0:
8560; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8561; GFX8-NEXT:    v_mov_b32_e32 v1, v0
8562; GFX8-NEXT:    v_mov_b32_e32 v0, s20
8563; GFX8-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
8564; GFX8-NEXT:    s_add_i32 s4, s20, 0x400
8565; GFX8-NEXT:    s_mov_b64 s[6:7], 0
8566; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
8567; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
8568; GFX8-NEXT:    v_mov_b32_e32 v4, s4
8569; GFX8-NEXT:  .LBB26_1: ; %atomicrmw.start
8570; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
8571; GFX8-NEXT:    s_waitcnt vmcnt(0)
8572; GFX8-NEXT:    v_mov_b32_e32 v6, v0
8573; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
8574; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v6
8575; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
8576; GFX8-NEXT:    v_add_f32_e32 v1, v1, v3
8577; GFX8-NEXT:    v_bfe_u32 v5, v0, 16, 1
8578; GFX8-NEXT:    v_bfe_u32 v8, v1, 16, 1
8579; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v0
8580; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v1
8581; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
8582; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x7fff, v8
8583; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v1
8584; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
8585; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v0
8586; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
8587; GFX8-NEXT:    v_cndmask_b32_e32 v1, v8, v9, vcc
8588; GFX8-NEXT:    v_cndmask_b32_e64 v0, v5, v7, s[4:5]
8589; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
8590; GFX8-NEXT:    v_alignbit_b32 v5, v1, v0, 16
8591; GFX8-NEXT:    v_mov_b32_e32 v0, v5
8592; GFX8-NEXT:    v_mov_b32_e32 v1, v6
8593; GFX8-NEXT:    buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
8594; GFX8-NEXT:    s_waitcnt vmcnt(0)
8595; GFX8-NEXT:    buffer_wbinvl1
8596; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
8597; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
8598; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
8599; GFX8-NEXT:    s_cbranch_execnz .LBB26_1
8600; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
8601; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
8602; GFX8-NEXT:    s_setpc_b64 s[30:31]
8603;
8604; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
8605; GFX7:       ; %bb.0:
8606; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8607; GFX7-NEXT:    v_mov_b32_e32 v2, s20
8608; GFX7-NEXT:    buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024
8609; GFX7-NEXT:    s_add_i32 s6, s20, 0x400
8610; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
8611; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
8612; GFX7-NEXT:    s_mov_b64 s[4:5], 0
8613; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
8614; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
8615; GFX7-NEXT:    s_waitcnt vmcnt(0)
8616; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
8617; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
8618; GFX7-NEXT:    v_mov_b32_e32 v4, s6
8619; GFX7-NEXT:  .LBB26_1: ; %atomicrmw.start
8620; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
8621; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
8622; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
8623; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
8624; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
8625; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
8626; GFX7-NEXT:    v_add_f32_e32 v6, v6, v3
8627; GFX7-NEXT:    v_add_f32_e32 v5, v5, v2
8628; GFX7-NEXT:    v_alignbit_b32 v1, v1, v0, 16
8629; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
8630; GFX7-NEXT:    v_alignbit_b32 v0, v0, v5, 16
8631; GFX7-NEXT:    v_mov_b32_e32 v6, v1
8632; GFX7-NEXT:    v_mov_b32_e32 v5, v0
8633; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
8634; GFX7-NEXT:    s_waitcnt vmcnt(0)
8635; GFX7-NEXT:    buffer_wbinvl1
8636; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v1
8637; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v5
8638; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8639; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
8640; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8641; GFX7-NEXT:    s_cbranch_execnz .LBB26_1
8642; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
8643; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
8644; GFX7-NEXT:    s_setpc_b64 s[30:31]
8645;
8646; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
8647; GFX6:       ; %bb.0:
8648; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8649; GFX6-NEXT:    v_mov_b32_e32 v2, s20
8650; GFX6-NEXT:    buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024
8651; GFX6-NEXT:    s_add_i32 s6, s20, 0x400
8652; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v0
8653; GFX6-NEXT:    v_mul_f32_e32 v1, 1.0, v1
8654; GFX6-NEXT:    s_mov_b64 s[4:5], 0
8655; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
8656; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
8657; GFX6-NEXT:    s_waitcnt vmcnt(0)
8658; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
8659; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
8660; GFX6-NEXT:    v_mov_b32_e32 v4, s6
8661; GFX6-NEXT:  .LBB26_1: ; %atomicrmw.start
8662; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
8663; GFX6-NEXT:    v_mul_f32_e32 v1, 1.0, v1
8664; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v0
8665; GFX6-NEXT:    s_waitcnt expcnt(0)
8666; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
8667; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
8668; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
8669; GFX6-NEXT:    v_add_f32_e32 v6, v6, v3
8670; GFX6-NEXT:    v_add_f32_e32 v5, v5, v2
8671; GFX6-NEXT:    v_alignbit_b32 v1, v1, v0, 16
8672; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
8673; GFX6-NEXT:    v_alignbit_b32 v0, v0, v5, 16
8674; GFX6-NEXT:    v_mov_b32_e32 v6, v1
8675; GFX6-NEXT:    v_mov_b32_e32 v5, v0
8676; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
8677; GFX6-NEXT:    s_waitcnt vmcnt(0)
8678; GFX6-NEXT:    buffer_wbinvl1
8679; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v1
8680; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff0000, v5
8681; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8682; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
8683; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8684; GFX6-NEXT:    s_cbranch_execnz .LBB26_1
8685; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
8686; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
8687; GFX6-NEXT:    s_waitcnt expcnt(0)
8688; GFX6-NEXT:    s_setpc_b64 s[30:31]
8689  %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256
8690  %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
8691  ret <2 x bfloat> %result
8692}
8693
; No-return variant of the agent-scope <2 x bfloat> atomic fadd test on a
; buffer fat pointer (ptr addrspace(7)) passed inreg.
;
; The IR body performs a seq_cst, syncscope("agent") atomicrmw fadd tagged with
; !amdgpu.no.fine.grained.memory at a constant offset from %ptr and discards the
; result. Per the assertions below, GFX12 selects a single
; buffer_atomic_pk_add_bf16, while every other checked target (GFX940, GFX11,
; GFX10, GFX90A, GFX908, GFX8, GFX7, GFX6) expands the operation into a
; buffer_atomic_cmpswap retry loop (.LBB27_1).
;
; The per-target assertions are autogenerated by utils/update_llc_test_checks.py
; (see the NOTE at the top of the file); regenerate them instead of hand-editing.
8694define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 {
8695; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
8696; GFX12:       ; %bb.0:
8697; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
8698; GFX12-NEXT:    s_wait_expcnt 0x0
8699; GFX12-NEXT:    s_wait_samplecnt 0x0
8700; GFX12-NEXT:    s_wait_bvhcnt 0x0
8701; GFX12-NEXT:    s_wait_kmcnt 0x0
8702; GFX12-NEXT:    v_mov_b32_e32 v1, s16
8703; GFX12-NEXT:    s_wait_storecnt 0x0
8704; GFX12-NEXT:    buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024
8705; GFX12-NEXT:    s_wait_storecnt 0x0
8706; GFX12-NEXT:    global_inv scope:SCOPE_DEV
8707; GFX12-NEXT:    s_setpc_b64 s[30:31]
8708;
8709; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
8710; GFX940:       ; %bb.0:
8711; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8712; GFX940-NEXT:    v_mov_b32_e32 v1, s16
8713; GFX940-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
8714; GFX940-NEXT:    s_add_i32 s4, s16, 0x400
8715; GFX940-NEXT:    s_mov_b64 s[6:7], 0
8716; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
8717; GFX940-NEXT:    s_movk_i32 s8, 0x7fff
8718; GFX940-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
8719; GFX940-NEXT:    s_mov_b32 s9, 0x7060302
8720; GFX940-NEXT:    v_mov_b32_e32 v4, s4
8721; GFX940-NEXT:  .LBB27_1: ; %atomicrmw.start
8722; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
8723; GFX940-NEXT:    s_waitcnt vmcnt(0)
8724; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
8725; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
8726; GFX940-NEXT:    v_add_f32_e32 v0, v0, v2
8727; GFX940-NEXT:    v_add_f32_e32 v5, v5, v3
8728; GFX940-NEXT:    v_bfe_u32 v6, v0, 16, 1
8729; GFX940-NEXT:    v_bfe_u32 v8, v5, 16, 1
8730; GFX940-NEXT:    v_or_b32_e32 v7, 0x400000, v0
8731; GFX940-NEXT:    v_or_b32_e32 v9, 0x400000, v5
8732; GFX940-NEXT:    v_add3_u32 v6, v6, v0, s8
8733; GFX940-NEXT:    v_add3_u32 v8, v8, v5, s8
8734; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
8735; GFX940-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
8736; GFX940-NEXT:    buffer_wbl2 sc1
8737; GFX940-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
8738; GFX940-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
8739; GFX940-NEXT:    v_perm_b32 v0, v5, v0, s9
8740; GFX940-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
8741; GFX940-NEXT:    buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0
8742; GFX940-NEXT:    s_waitcnt vmcnt(0)
8743; GFX940-NEXT:    buffer_inv sc1
8744; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v1
8745; GFX940-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
8746; GFX940-NEXT:    v_mov_b32_e32 v1, v6
8747; GFX940-NEXT:    s_andn2_b64 exec, exec, s[6:7]
8748; GFX940-NEXT:    s_cbranch_execnz .LBB27_1
8749; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
8750; GFX940-NEXT:    s_or_b64 exec, exec, s[6:7]
8751; GFX940-NEXT:    s_setpc_b64 s[30:31]
8752;
8753; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
8754; GFX11:       ; %bb.0:
8755; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8756; GFX11-NEXT:    v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
8757; GFX11-NEXT:    s_add_i32 s4, s16, 0x400
8758; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
8759; GFX11-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
8760; GFX11-NEXT:    buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
8761; GFX11-NEXT:    s_mov_b32 s5, 0
8762; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
8763; GFX11-NEXT:    .p2align 6
8764; GFX11-NEXT:  .LBB27_1: ; %atomicrmw.start
8765; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
8766; GFX11-NEXT:    s_waitcnt vmcnt(0)
8767; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
8768; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
8769; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
8770; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8771; GFX11-NEXT:    v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2
8772; GFX11-NEXT:    v_bfe_u32 v7, v5, 16, 1
8773; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
8774; GFX11-NEXT:    v_bfe_u32 v6, v0, 16, 1
8775; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v0
8776; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v5
8777; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
8778; GFX11-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
8779; GFX11-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
8780; GFX11-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
8781; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
8782; GFX11-NEXT:    v_cndmask_b32_e32 v5, v7, v9, vcc_lo
8783; GFX11-NEXT:    v_cndmask_b32_e64 v0, v6, v8, s4
8784; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8785; GFX11-NEXT:    v_perm_b32 v0, v5, v0, 0x7060302
8786; GFX11-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
8787; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
8788; GFX11-NEXT:    s_waitcnt vmcnt(0)
8789; GFX11-NEXT:    buffer_gl1_inv
8790; GFX11-NEXT:    buffer_gl0_inv
8791; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v1
8792; GFX11-NEXT:    v_mov_b32_e32 v1, v5
8793; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
8794; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
8795; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
8796; GFX11-NEXT:    s_cbranch_execnz .LBB27_1
8797; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
8798; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
8799; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s5
8800; GFX11-NEXT:    s_setpc_b64 s[30:31]
8801;
8802; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
8803; GFX10:       ; %bb.0:
8804; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8805; GFX10-NEXT:    v_mov_b32_e32 v1, s20
8806; GFX10-NEXT:    s_add_i32 s4, s20, 0x400
8807; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
8808; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
8809; GFX10-NEXT:    v_mov_b32_e32 v4, s4
8810; GFX10-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
8811; GFX10-NEXT:    s_mov_b32 s5, 0
8812; GFX10-NEXT:  .LBB27_1: ; %atomicrmw.start
8813; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
8814; GFX10-NEXT:    s_waitcnt vmcnt(0)
8815; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
8816; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
8817; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
8818; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
8819; GFX10-NEXT:    v_add_f32_e32 v5, v5, v3
8820; GFX10-NEXT:    v_bfe_u32 v6, v0, 16, 1
8821; GFX10-NEXT:    v_bfe_u32 v7, v5, 16, 1
8822; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
8823; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v5
8824; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
8825; GFX10-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
8826; GFX10-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
8827; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
8828; GFX10-NEXT:    v_cndmask_b32_e32 v5, v7, v9, vcc_lo
8829; GFX10-NEXT:    v_cndmask_b32_e64 v0, v6, v8, s4
8830; GFX10-NEXT:    v_perm_b32 v0, v5, v0, 0x7060302
8831; GFX10-NEXT:    v_mov_b32_e32 v6, v1
8832; GFX10-NEXT:    v_mov_b32_e32 v5, v0
8833; GFX10-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
8834; GFX10-NEXT:    s_waitcnt vmcnt(0)
8835; GFX10-NEXT:    buffer_gl1_inv
8836; GFX10-NEXT:    buffer_gl0_inv
8837; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v1
8838; GFX10-NEXT:    v_mov_b32_e32 v1, v5
8839; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
8840; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
8841; GFX10-NEXT:    s_cbranch_execnz .LBB27_1
8842; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
8843; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
8844; GFX10-NEXT:    s_setpc_b64 s[30:31]
8845;
8846; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
8847; GFX90A:       ; %bb.0:
8848; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8849; GFX90A-NEXT:    v_mov_b32_e32 v1, s20
8850; GFX90A-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
8851; GFX90A-NEXT:    s_add_i32 s4, s20, 0x400
8852; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
8853; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
8854; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
8855; GFX90A-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
8856; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
8857; GFX90A-NEXT:    v_mov_b32_e32 v4, s4
8858; GFX90A-NEXT:  .LBB27_1: ; %atomicrmw.start
8859; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
8860; GFX90A-NEXT:    s_waitcnt vmcnt(0)
8861; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
8862; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
8863; GFX90A-NEXT:    v_add_f32_e32 v0, v0, v2
8864; GFX90A-NEXT:    v_add_f32_e32 v5, v5, v3
8865; GFX90A-NEXT:    v_bfe_u32 v6, v0, 16, 1
8866; GFX90A-NEXT:    v_bfe_u32 v8, v5, 16, 1
8867; GFX90A-NEXT:    v_or_b32_e32 v7, 0x400000, v0
8868; GFX90A-NEXT:    v_or_b32_e32 v9, 0x400000, v5
8869; GFX90A-NEXT:    v_add3_u32 v6, v6, v0, s8
8870; GFX90A-NEXT:    v_add3_u32 v8, v8, v5, s8
8871; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
8872; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
8873; GFX90A-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
8874; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
8875; GFX90A-NEXT:    v_perm_b32 v0, v5, v0, s9
8876; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
8877; GFX90A-NEXT:    buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc
8878; GFX90A-NEXT:    s_waitcnt vmcnt(0)
8879; GFX90A-NEXT:    buffer_wbinvl1
8880; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v1
8881; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
8882; GFX90A-NEXT:    v_mov_b32_e32 v1, v6
8883; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
8884; GFX90A-NEXT:    s_cbranch_execnz .LBB27_1
8885; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
8886; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
8887; GFX90A-NEXT:    s_setpc_b64 s[30:31]
8888;
8889; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
8890; GFX908:       ; %bb.0:
8891; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8892; GFX908-NEXT:    v_mov_b32_e32 v1, s20
8893; GFX908-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
8894; GFX908-NEXT:    s_add_i32 s4, s20, 0x400
8895; GFX908-NEXT:    s_mov_b64 s[6:7], 0
8896; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
8897; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
8898; GFX908-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
8899; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
8900; GFX908-NEXT:    v_mov_b32_e32 v4, s4
8901; GFX908-NEXT:  .LBB27_1: ; %atomicrmw.start
8902; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
8903; GFX908-NEXT:    s_waitcnt vmcnt(0)
8904; GFX908-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
8905; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
8906; GFX908-NEXT:    v_add_f32_e32 v0, v0, v2
8907; GFX908-NEXT:    v_add_f32_e32 v5, v5, v3
8908; GFX908-NEXT:    v_bfe_u32 v6, v0, 16, 1
8909; GFX908-NEXT:    v_bfe_u32 v8, v5, 16, 1
8910; GFX908-NEXT:    v_or_b32_e32 v7, 0x400000, v0
8911; GFX908-NEXT:    v_or_b32_e32 v9, 0x400000, v5
8912; GFX908-NEXT:    v_add3_u32 v6, v6, v0, s8
8913; GFX908-NEXT:    v_add3_u32 v8, v8, v5, s8
8914; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
8915; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
8916; GFX908-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
8917; GFX908-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
8918; GFX908-NEXT:    v_perm_b32 v0, v5, v0, s9
8919; GFX908-NEXT:    v_mov_b32_e32 v6, v1
8920; GFX908-NEXT:    v_mov_b32_e32 v5, v0
8921; GFX908-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
8922; GFX908-NEXT:    s_waitcnt vmcnt(0)
8923; GFX908-NEXT:    buffer_wbinvl1
8924; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v1
8925; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
8926; GFX908-NEXT:    v_mov_b32_e32 v1, v5
8927; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
8928; GFX908-NEXT:    s_cbranch_execnz .LBB27_1
8929; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
8930; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
8931; GFX908-NEXT:    s_setpc_b64 s[30:31]
8932;
8933; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
8934; GFX8:       ; %bb.0:
8935; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8936; GFX8-NEXT:    v_mov_b32_e32 v1, s20
8937; GFX8-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
8938; GFX8-NEXT:    s_add_i32 s4, s20, 0x400
8939; GFX8-NEXT:    s_mov_b64 s[6:7], 0
8940; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
8941; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
8942; GFX8-NEXT:    v_mov_b32_e32 v4, s4
8943; GFX8-NEXT:  .LBB27_1: ; %atomicrmw.start
8944; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
8945; GFX8-NEXT:    s_waitcnt vmcnt(0)
8946; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
8947; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
8948; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
8949; GFX8-NEXT:    v_add_f32_e32 v5, v5, v3
8950; GFX8-NEXT:    v_bfe_u32 v6, v0, 16, 1
8951; GFX8-NEXT:    v_bfe_u32 v8, v5, 16, 1
8952; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v0
8953; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v5
8954; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x7fff, v6
8955; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x7fff, v8
8956; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v5
8957; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
8958; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v0
8959; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
8960; GFX8-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
8961; GFX8-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
8962; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
8963; GFX8-NEXT:    v_alignbit_b32 v0, v5, v0, 16
8964; GFX8-NEXT:    v_mov_b32_e32 v6, v1
8965; GFX8-NEXT:    v_mov_b32_e32 v5, v0
8966; GFX8-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
8967; GFX8-NEXT:    s_waitcnt vmcnt(0)
8968; GFX8-NEXT:    buffer_wbinvl1
8969; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v1
8970; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
8971; GFX8-NEXT:    v_mov_b32_e32 v1, v5
8972; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
8973; GFX8-NEXT:    s_cbranch_execnz .LBB27_1
8974; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
8975; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
8976; GFX8-NEXT:    s_setpc_b64 s[30:31]
8977;
8978; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
8979; GFX7:       ; %bb.0:
8980; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8981; GFX7-NEXT:    v_mov_b32_e32 v2, s20
8982; GFX7-NEXT:    buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
8983; GFX7-NEXT:    s_add_i32 s6, s20, 0x400
8984; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
8985; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
8986; GFX7-NEXT:    s_mov_b64 s[4:5], 0
8987; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
8988; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
8989; GFX7-NEXT:    s_waitcnt vmcnt(0)
8990; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
8991; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
8992; GFX7-NEXT:    v_mov_b32_e32 v2, s6
8993; GFX7-NEXT:  .LBB27_1: ; %atomicrmw.start
8994; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
8995; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
8996; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
8997; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
8998; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
8999; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
9000; GFX7-NEXT:    v_add_f32_e32 v6, v6, v1
9001; GFX7-NEXT:    v_add_f32_e32 v5, v5, v0
9002; GFX7-NEXT:    v_alignbit_b32 v4, v3, v4, 16
9003; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v6
9004; GFX7-NEXT:    v_alignbit_b32 v3, v3, v5, 16
9005; GFX7-NEXT:    v_mov_b32_e32 v6, v4
9006; GFX7-NEXT:    v_mov_b32_e32 v5, v3
9007; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
9008; GFX7-NEXT:    s_waitcnt vmcnt(0)
9009; GFX7-NEXT:    buffer_wbinvl1
9010; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
9011; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
9012; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
9013; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
9014; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
9015; GFX7-NEXT:    s_cbranch_execnz .LBB27_1
9016; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
9017; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
9018; GFX7-NEXT:    s_setpc_b64 s[30:31]
9019;
9020; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
9021; GFX6:       ; %bb.0:
9022; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9023; GFX6-NEXT:    v_mov_b32_e32 v2, s20
9024; GFX6-NEXT:    buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
9025; GFX6-NEXT:    s_add_i32 s6, s20, 0x400
9026; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v0
9027; GFX6-NEXT:    v_mul_f32_e32 v1, 1.0, v1
9028; GFX6-NEXT:    s_mov_b64 s[4:5], 0
9029; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
9030; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
9031; GFX6-NEXT:    s_waitcnt vmcnt(0)
9032; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
9033; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
9034; GFX6-NEXT:    v_mov_b32_e32 v2, s6
9035; GFX6-NEXT:  .LBB27_1: ; %atomicrmw.start
9036; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
9037; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
9038; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v4
9039; GFX6-NEXT:    s_waitcnt expcnt(0)
9040; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
9041; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
9042; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
9043; GFX6-NEXT:    v_add_f32_e32 v6, v6, v1
9044; GFX6-NEXT:    v_add_f32_e32 v5, v5, v0
9045; GFX6-NEXT:    v_alignbit_b32 v4, v3, v4, 16
9046; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v6
9047; GFX6-NEXT:    v_alignbit_b32 v3, v3, v5, 16
9048; GFX6-NEXT:    v_mov_b32_e32 v6, v4
9049; GFX6-NEXT:    v_mov_b32_e32 v5, v3
9050; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
9051; GFX6-NEXT:    s_waitcnt vmcnt(0)
9052; GFX6-NEXT:    buffer_wbinvl1
9053; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
9054; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
9055; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
9056; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
9057; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
9058; GFX6-NEXT:    s_cbranch_execnz .LBB27_1
9059; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
9060; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
9061; GFX6-NEXT:    s_waitcnt expcnt(0)
9062; GFX6-NEXT:    s_setpc_b64 s[30:31]
; Index 256 of <2 x bfloat> (two 16-bit elements, i.e. 4 bytes each) is the
; 1024-byte (0x400) offset that appears in the assertions above.
9063  %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256
; The atomicrmw result is never read and the function returns void, so this
; covers the no-return form of the operation.
9064  %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
9065  ret void
9066}
9067
; Test: agent-scope, seq_cst `atomicrmw fadd` of a <2 x bfloat> through a
; buffer fat pointer (addrspace 7), returning the value produced by the
; atomicrmw. The access is tagged !amdgpu.no.fine.grained.memory and is made
; at a constant offset of 256 <2 x bfloat> elements; the checks below show
; this as `offset:1024` (or an explicit add of 0x400 to the offset VGPR).
;
; Unlike the `inreg` variants in this file, %ptr is passed without `inreg`,
; so the checks have the resource descriptor arriving in v[0:3]. Every
; buffer instruction is therefore wrapped in a readfirstlane / v_cmp_eq_u64 /
; s_and_saveexec loop that copies v[0:3] into SGPRs (the "waterfall" in the
; function name).
;
; Expected lowering, per the assertions below:
;   * GFX12: one buffer_atomic_pk_add_bf16 ... th:TH_ATOMIC_RETURN inside a
;     single waterfall loop.
;   * All other targets: an initial buffer_load_dword (waterfall loop 1),
;     then an %atomicrmw.start compare-and-swap loop whose
;     buffer_atomic_cmpswap is wrapped in a nested waterfall loop (Depth 2).
;
; NOTE: the "; GFX*" lines are autogenerated (see the NOTE on the first line
; of this file); regenerate them with utils/update_llc_test_checks.py rather
; than editing by hand. Attribute group #0 is defined outside this excerpt.
9068define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, <2 x bfloat> %val) #0 {
9069; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
9070; GFX12:       ; %bb.0:
9071; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
9072; GFX12-NEXT:    s_wait_expcnt 0x0
9073; GFX12-NEXT:    s_wait_samplecnt 0x0
9074; GFX12-NEXT:    s_wait_bvhcnt 0x0
9075; GFX12-NEXT:    s_wait_kmcnt 0x0
9076; GFX12-NEXT:    s_mov_b32 s1, exec_lo
9077; GFX12-NEXT:    s_wait_storecnt 0x0
9078; GFX12-NEXT:  .LBB28_1: ; =>This Inner Loop Header: Depth=1
9079; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
9080; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
9081; GFX12-NEXT:    v_readfirstlane_b32 s6, v2
9082; GFX12-NEXT:    v_readfirstlane_b32 s7, v3
9083; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
9084; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
9085; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
9086; GFX12-NEXT:    s_wait_alu 0xfffe
9087; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
9088; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
9089; GFX12-NEXT:    s_wait_alu 0xfffe
9090; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
9091; GFX12-NEXT:    s_wait_loadcnt 0x0
9092; GFX12-NEXT:    buffer_atomic_pk_add_bf16 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN
9093; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
9094; GFX12-NEXT:    ; implicit-def: $vgpr4
9095; GFX12-NEXT:    s_wait_alu 0xfffe
9096; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
9097; GFX12-NEXT:    s_cbranch_execnz .LBB28_1
9098; GFX12-NEXT:  ; %bb.2:
9099; GFX12-NEXT:    s_mov_b32 exec_lo, s1
9100; GFX12-NEXT:    s_wait_loadcnt 0x0
9101; GFX12-NEXT:    v_mov_b32_e32 v0, v5
9102; GFX12-NEXT:    global_inv scope:SCOPE_DEV
9103; GFX12-NEXT:    s_wait_alu 0xfffe
9104; GFX12-NEXT:    s_setpc_b64 s[30:31]
9105;
9106; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
9107; GFX940:       ; %bb.0:
9108; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9109; GFX940-NEXT:    v_add_u32_e32 v8, 0x400, v4
9110; GFX940-NEXT:    s_mov_b64 s[2:3], exec
9111; GFX940-NEXT:  .LBB28_1: ; =>This Inner Loop Header: Depth=1
9112; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
9113; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
9114; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
9115; GFX940-NEXT:    v_readfirstlane_b32 s7, v3
9116; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
9117; GFX940-NEXT:    s_nop 0
9118; GFX940-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
9119; GFX940-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
9120; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
9121; GFX940-NEXT:    buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024
9122; GFX940-NEXT:    ; implicit-def: $vgpr4
9123; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
9124; GFX940-NEXT:    s_cbranch_execnz .LBB28_1
9125; GFX940-NEXT:  ; %bb.2:
9126; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
9127; GFX940-NEXT:    s_mov_b64 s[2:3], 0
9128; GFX940-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
9129; GFX940-NEXT:    s_movk_i32 s10, 0x7fff
9130; GFX940-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
9131; GFX940-NEXT:    s_mov_b32 s11, 0x7060302
9132; GFX940-NEXT:  .LBB28_3: ; %atomicrmw.start
9133; GFX940-NEXT:    ; =>This Loop Header: Depth=1
9134; GFX940-NEXT:    ; Child Loop BB28_4 Depth 2
9135; GFX940-NEXT:    s_waitcnt vmcnt(0)
9136; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
9137; GFX940-NEXT:    v_add_f32_e32 v4, v4, v9
9138; GFX940-NEXT:    v_bfe_u32 v5, v4, 16, 1
9139; GFX940-NEXT:    v_add3_u32 v5, v5, v4, s10
9140; GFX940-NEXT:    v_or_b32_e32 v6, 0x400000, v4
9141; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
9142; GFX940-NEXT:    s_mov_b64 s[8:9], exec
9143; GFX940-NEXT:    buffer_wbl2 sc1
9144; GFX940-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
9145; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
9146; GFX940-NEXT:    v_add_f32_e32 v5, v5, v10
9147; GFX940-NEXT:    v_bfe_u32 v6, v5, 16, 1
9148; GFX940-NEXT:    v_add3_u32 v6, v6, v5, s10
9149; GFX940-NEXT:    v_or_b32_e32 v11, 0x400000, v5
9150; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
9151; GFX940-NEXT:    s_nop 1
9152; GFX940-NEXT:    v_cndmask_b32_e32 v5, v6, v11, vcc
9153; GFX940-NEXT:    v_perm_b32 v6, v5, v4, s11
9154; GFX940-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
9155; GFX940-NEXT:  .LBB28_4: ; Parent Loop BB28_3 Depth=1
9156; GFX940-NEXT:    ; => This Inner Loop Header: Depth=2
9157; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
9158; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
9159; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
9160; GFX940-NEXT:    v_readfirstlane_b32 s7, v3
9161; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
9162; GFX940-NEXT:    s_nop 0
9163; GFX940-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
9164; GFX940-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
9165; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
9166; GFX940-NEXT:    s_waitcnt vmcnt(0)
9167; GFX940-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
9168; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
9169; GFX940-NEXT:    s_cbranch_execnz .LBB28_4
9170; GFX940-NEXT:  ; %bb.5: ; in Loop: Header=BB28_3 Depth=1
9171; GFX940-NEXT:    s_mov_b64 exec, s[8:9]
9172; GFX940-NEXT:    s_waitcnt vmcnt(0)
9173; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
9174; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
9175; GFX940-NEXT:    v_mov_b32_e32 v7, v4
9176; GFX940-NEXT:    buffer_inv sc1
9177; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
9178; GFX940-NEXT:    s_cbranch_execnz .LBB28_3
9179; GFX940-NEXT:  ; %bb.6: ; %atomicrmw.end
9180; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
9181; GFX940-NEXT:    v_mov_b32_e32 v0, v4
9182; GFX940-NEXT:    s_setpc_b64 s[30:31]
9183;
9184; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
9185; GFX11:       ; %bb.0:
9186; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9187; GFX11-NEXT:    v_add_nc_u32_e32 v7, 0x400, v4
9188; GFX11-NEXT:    s_mov_b32 s1, 0
9189; GFX11-NEXT:    s_mov_b32 s2, exec_lo
9190; GFX11-NEXT:  .LBB28_1: ; =>This Inner Loop Header: Depth=1
9191; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
9192; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
9193; GFX11-NEXT:    v_readfirstlane_b32 s6, v2
9194; GFX11-NEXT:    v_readfirstlane_b32 s7, v3
9195; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
9196; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
9197; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
9198; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
9199; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
9200; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
9201; GFX11-NEXT:    buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
9202; GFX11-NEXT:    ; implicit-def: $vgpr4
9203; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
9204; GFX11-NEXT:    s_cbranch_execnz .LBB28_1
9205; GFX11-NEXT:  ; %bb.2:
9206; GFX11-NEXT:    s_mov_b32 exec_lo, s2
9207; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
9208; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
9209; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
9210; GFX11-NEXT:    .p2align 6
9211; GFX11-NEXT:  .LBB28_3: ; %atomicrmw.start
9212; GFX11-NEXT:    ; =>This Loop Header: Depth=1
9213; GFX11-NEXT:    ; Child Loop BB28_4 Depth 2
9214; GFX11-NEXT:    s_waitcnt vmcnt(0)
9215; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
9216; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
9217; GFX11-NEXT:    s_mov_b32 s2, exec_lo
9218; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
9219; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9220; GFX11-NEXT:    v_dual_add_f32 v5, v5, v9 :: v_dual_add_f32 v4, v4, v8
9221; GFX11-NEXT:    v_bfe_u32 v11, v5, 16, 1
9222; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
9223; GFX11-NEXT:    v_bfe_u32 v10, v4, 16, 1
9224; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v4
9225; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
9226; GFX11-NEXT:    v_or_b32_e32 v13, 0x400000, v5
9227; GFX11-NEXT:    v_add3_u32 v11, v11, v5, 0x7fff
9228; GFX11-NEXT:    v_add3_u32 v10, v10, v4, 0x7fff
9229; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
9230; GFX11-NEXT:    v_cndmask_b32_e32 v4, v10, v12, vcc_lo
9231; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
9232; GFX11-NEXT:    v_cndmask_b32_e32 v5, v11, v13, vcc_lo
9233; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9234; GFX11-NEXT:    v_perm_b32 v5, v5, v4, 0x7060302
9235; GFX11-NEXT:    v_mov_b32_e32 v4, v5
9236; GFX11-NEXT:    v_mov_b32_e32 v5, v6
9237; GFX11-NEXT:  .LBB28_4: ; Parent Loop BB28_3 Depth=1
9238; GFX11-NEXT:    ; => This Inner Loop Header: Depth=2
9239; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
9240; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
9241; GFX11-NEXT:    v_readfirstlane_b32 s6, v2
9242; GFX11-NEXT:    v_readfirstlane_b32 s7, v3
9243; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
9244; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
9245; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
9246; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
9247; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
9248; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
9249; GFX11-NEXT:    s_waitcnt vmcnt(0)
9250; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
9251; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
9252; GFX11-NEXT:    s_cbranch_execnz .LBB28_4
9253; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB28_3 Depth=1
9254; GFX11-NEXT:    s_mov_b32 exec_lo, s2
9255; GFX11-NEXT:    s_waitcnt vmcnt(0)
9256; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
9257; GFX11-NEXT:    v_mov_b32_e32 v6, v4
9258; GFX11-NEXT:    buffer_gl1_inv
9259; GFX11-NEXT:    buffer_gl0_inv
9260; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
9261; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
9262; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
9263; GFX11-NEXT:    s_cbranch_execnz .LBB28_3
9264; GFX11-NEXT:  ; %bb.6: ; %atomicrmw.end
9265; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
9266; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
9267; GFX11-NEXT:    v_mov_b32_e32 v0, v4
9268; GFX11-NEXT:    s_setpc_b64 s[30:31]
9269;
9270; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
9271; GFX10:       ; %bb.0:
9272; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9273; GFX10-NEXT:    v_add_nc_u32_e32 v7, 0x400, v4
9274; GFX10-NEXT:    s_mov_b32 s5, 0
9275; GFX10-NEXT:    s_mov_b32 s6, exec_lo
9276; GFX10-NEXT:  .LBB28_1: ; =>This Inner Loop Header: Depth=1
9277; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
9278; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
9279; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
9280; GFX10-NEXT:    v_readfirstlane_b32 s11, v3
9281; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
9282; GFX10-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
9283; GFX10-NEXT:    s_and_b32 s4, vcc_lo, s4
9284; GFX10-NEXT:    s_and_saveexec_b32 s4, s4
9285; GFX10-NEXT:    buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
9286; GFX10-NEXT:    ; implicit-def: $vgpr4
9287; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
9288; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
9289; GFX10-NEXT:    s_cbranch_execnz .LBB28_1
9290; GFX10-NEXT:  ; %bb.2:
9291; GFX10-NEXT:    s_mov_b32 exec_lo, s6
9292; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
9293; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
9294; GFX10-NEXT:  .LBB28_3: ; %atomicrmw.start
9295; GFX10-NEXT:    ; =>This Loop Header: Depth=1
9296; GFX10-NEXT:    ; Child Loop BB28_4 Depth 2
9297; GFX10-NEXT:    s_waitcnt vmcnt(0)
9298; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
9299; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
9300; GFX10-NEXT:    s_mov_b32 s6, exec_lo
9301; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
9302; GFX10-NEXT:    v_add_f32_e32 v4, v4, v8
9303; GFX10-NEXT:    v_add_f32_e32 v5, v5, v9
9304; GFX10-NEXT:    v_bfe_u32 v10, v4, 16, 1
9305; GFX10-NEXT:    v_bfe_u32 v11, v5, 16, 1
9306; GFX10-NEXT:    v_or_b32_e32 v12, 0x400000, v4
9307; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
9308; GFX10-NEXT:    v_or_b32_e32 v13, 0x400000, v5
9309; GFX10-NEXT:    v_add3_u32 v10, v10, v4, 0x7fff
9310; GFX10-NEXT:    v_add3_u32 v11, v11, v5, 0x7fff
9311; GFX10-NEXT:    v_cndmask_b32_e32 v4, v10, v12, vcc_lo
9312; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
9313; GFX10-NEXT:    v_cndmask_b32_e32 v5, v11, v13, vcc_lo
9314; GFX10-NEXT:    v_perm_b32 v5, v5, v4, 0x7060302
9315; GFX10-NEXT:    v_mov_b32_e32 v4, v5
9316; GFX10-NEXT:    v_mov_b32_e32 v5, v6
9317; GFX10-NEXT:  .LBB28_4: ; Parent Loop BB28_3 Depth=1
9318; GFX10-NEXT:    ; => This Inner Loop Header: Depth=2
9319; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
9320; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
9321; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
9322; GFX10-NEXT:    v_readfirstlane_b32 s11, v3
9323; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
9324; GFX10-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
9325; GFX10-NEXT:    s_and_b32 s4, vcc_lo, s4
9326; GFX10-NEXT:    s_and_saveexec_b32 s4, s4
9327; GFX10-NEXT:    s_waitcnt vmcnt(0)
9328; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
9329; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
9330; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
9331; GFX10-NEXT:    s_cbranch_execnz .LBB28_4
9332; GFX10-NEXT:  ; %bb.5: ; in Loop: Header=BB28_3 Depth=1
9333; GFX10-NEXT:    s_mov_b32 exec_lo, s6
9334; GFX10-NEXT:    s_waitcnt vmcnt(0)
9335; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
9336; GFX10-NEXT:    v_mov_b32_e32 v6, v4
9337; GFX10-NEXT:    buffer_gl1_inv
9338; GFX10-NEXT:    buffer_gl0_inv
9339; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
9340; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
9341; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
9342; GFX10-NEXT:    s_cbranch_execnz .LBB28_3
9343; GFX10-NEXT:  ; %bb.6: ; %atomicrmw.end
9344; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
9345; GFX10-NEXT:    v_mov_b32_e32 v0, v4
9346; GFX10-NEXT:    s_setpc_b64 s[30:31]
9347;
9348; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
9349; GFX90A:       ; %bb.0:
9350; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9351; GFX90A-NEXT:    v_add_u32_e32 v8, 0x400, v4
9352; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
9353; GFX90A-NEXT:  .LBB28_1: ; =>This Inner Loop Header: Depth=1
9354; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
9355; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
9356; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
9357; GFX90A-NEXT:    v_readfirstlane_b32 s11, v3
9358; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
9359; GFX90A-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
9360; GFX90A-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
9361; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
9362; GFX90A-NEXT:    s_nop 0
9363; GFX90A-NEXT:    buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
9364; GFX90A-NEXT:    ; implicit-def: $vgpr4
9365; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
9366; GFX90A-NEXT:    s_cbranch_execnz .LBB28_1
9367; GFX90A-NEXT:  ; %bb.2:
9368; GFX90A-NEXT:    s_mov_b64 exec, s[6:7]
9369; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
9370; GFX90A-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
9371; GFX90A-NEXT:    s_movk_i32 s14, 0x7fff
9372; GFX90A-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
9373; GFX90A-NEXT:    s_mov_b32 s15, 0x7060302
9374; GFX90A-NEXT:  .LBB28_3: ; %atomicrmw.start
9375; GFX90A-NEXT:    ; =>This Loop Header: Depth=1
9376; GFX90A-NEXT:    ; Child Loop BB28_4 Depth 2
9377; GFX90A-NEXT:    s_waitcnt vmcnt(0)
9378; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
9379; GFX90A-NEXT:    v_add_f32_e32 v4, v4, v9
9380; GFX90A-NEXT:    v_bfe_u32 v5, v4, 16, 1
9381; GFX90A-NEXT:    v_add3_u32 v5, v5, v4, s14
9382; GFX90A-NEXT:    v_or_b32_e32 v6, 0x400000, v4
9383; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
9384; GFX90A-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
9385; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
9386; GFX90A-NEXT:    v_add_f32_e32 v5, v5, v10
9387; GFX90A-NEXT:    v_bfe_u32 v6, v5, 16, 1
9388; GFX90A-NEXT:    v_add3_u32 v6, v6, v5, s14
9389; GFX90A-NEXT:    v_or_b32_e32 v11, 0x400000, v5
9390; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
9391; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v6, v11, vcc
9392; GFX90A-NEXT:    v_perm_b32 v6, v5, v4, s15
9393; GFX90A-NEXT:    s_mov_b64 s[12:13], exec
9394; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
9395; GFX90A-NEXT:  .LBB28_4: ; Parent Loop BB28_3 Depth=1
9396; GFX90A-NEXT:    ; => This Inner Loop Header: Depth=2
9397; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
9398; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
9399; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
9400; GFX90A-NEXT:    v_readfirstlane_b32 s11, v3
9401; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
9402; GFX90A-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
9403; GFX90A-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
9404; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
9405; GFX90A-NEXT:    s_waitcnt vmcnt(0)
9406; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
9407; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
9408; GFX90A-NEXT:    s_cbranch_execnz .LBB28_4
9409; GFX90A-NEXT:  ; %bb.5: ; in Loop: Header=BB28_3 Depth=1
9410; GFX90A-NEXT:    s_mov_b64 exec, s[12:13]
9411; GFX90A-NEXT:    s_waitcnt vmcnt(0)
9412; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
9413; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
9414; GFX90A-NEXT:    v_mov_b32_e32 v7, v4
9415; GFX90A-NEXT:    buffer_wbinvl1
9416; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
9417; GFX90A-NEXT:    s_cbranch_execnz .LBB28_3
9418; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.end
9419; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
9420; GFX90A-NEXT:    v_mov_b32_e32 v0, v4
9421; GFX90A-NEXT:    s_setpc_b64 s[30:31]
9422;
9423; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
9424; GFX908:       ; %bb.0:
9425; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9426; GFX908-NEXT:    v_add_u32_e32 v7, 0x400, v4
9427; GFX908-NEXT:    s_mov_b64 s[6:7], exec
9428; GFX908-NEXT:  .LBB28_1: ; =>This Inner Loop Header: Depth=1
9429; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
9430; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
9431; GFX908-NEXT:    v_readfirstlane_b32 s10, v2
9432; GFX908-NEXT:    v_readfirstlane_b32 s11, v3
9433; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
9434; GFX908-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
9435; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
9436; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
9437; GFX908-NEXT:    s_nop 0
9438; GFX908-NEXT:    buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
9439; GFX908-NEXT:    ; implicit-def: $vgpr4
9440; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
9441; GFX908-NEXT:    s_cbranch_execnz .LBB28_1
9442; GFX908-NEXT:  ; %bb.2:
9443; GFX908-NEXT:    s_mov_b64 exec, s[6:7]
9444; GFX908-NEXT:    s_mov_b64 s[6:7], 0
9445; GFX908-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
9446; GFX908-NEXT:    s_movk_i32 s14, 0x7fff
9447; GFX908-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
9448; GFX908-NEXT:    s_mov_b32 s15, 0x7060302
9449; GFX908-NEXT:  .LBB28_3: ; %atomicrmw.start
9450; GFX908-NEXT:    ; =>This Loop Header: Depth=1
9451; GFX908-NEXT:    ; Child Loop BB28_4 Depth 2
9452; GFX908-NEXT:    s_waitcnt vmcnt(0)
9453; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
9454; GFX908-NEXT:    v_add_f32_e32 v4, v4, v8
9455; GFX908-NEXT:    v_bfe_u32 v5, v4, 16, 1
9456; GFX908-NEXT:    v_add3_u32 v5, v5, v4, s14
9457; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v4
9458; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
9459; GFX908-NEXT:    v_cndmask_b32_e32 v4, v5, v10, vcc
9460; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
9461; GFX908-NEXT:    v_add_f32_e32 v5, v5, v9
9462; GFX908-NEXT:    v_bfe_u32 v10, v5, 16, 1
9463; GFX908-NEXT:    v_add3_u32 v10, v10, v5, s14
9464; GFX908-NEXT:    v_or_b32_e32 v11, 0x400000, v5
9465; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
9466; GFX908-NEXT:    v_cndmask_b32_e32 v5, v10, v11, vcc
9467; GFX908-NEXT:    v_perm_b32 v5, v5, v4, s15
9468; GFX908-NEXT:    v_mov_b32_e32 v4, v5
9469; GFX908-NEXT:    s_mov_b64 s[12:13], exec
9470; GFX908-NEXT:    v_mov_b32_e32 v5, v6
9471; GFX908-NEXT:  .LBB28_4: ; Parent Loop BB28_3 Depth=1
9472; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
9473; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
9474; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
9475; GFX908-NEXT:    v_readfirstlane_b32 s10, v2
9476; GFX908-NEXT:    v_readfirstlane_b32 s11, v3
9477; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
9478; GFX908-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
9479; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
9480; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
9481; GFX908-NEXT:    s_waitcnt vmcnt(0)
9482; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
9483; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
9484; GFX908-NEXT:    s_cbranch_execnz .LBB28_4
9485; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB28_3 Depth=1
9486; GFX908-NEXT:    s_mov_b64 exec, s[12:13]
9487; GFX908-NEXT:    s_waitcnt vmcnt(0)
9488; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
9489; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
9490; GFX908-NEXT:    v_mov_b32_e32 v6, v4
9491; GFX908-NEXT:    buffer_wbinvl1
9492; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
9493; GFX908-NEXT:    s_cbranch_execnz .LBB28_3
9494; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.end
9495; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
9496; GFX908-NEXT:    v_mov_b32_e32 v0, v4
9497; GFX908-NEXT:    s_setpc_b64 s[30:31]
9498;
9499; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
9500; GFX8:       ; %bb.0:
9501; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9502; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x400, v4
9503; GFX8-NEXT:    s_mov_b64 s[6:7], exec
9504; GFX8-NEXT:  .LBB28_1: ; =>This Inner Loop Header: Depth=1
9505; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
9506; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
9507; GFX8-NEXT:    v_readfirstlane_b32 s10, v2
9508; GFX8-NEXT:    v_readfirstlane_b32 s11, v3
9509; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
9510; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
9511; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
9512; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
9513; GFX8-NEXT:    s_nop 0
9514; GFX8-NEXT:    buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
9515; GFX8-NEXT:    ; implicit-def: $vgpr4
9516; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
9517; GFX8-NEXT:    s_cbranch_execnz .LBB28_1
9518; GFX8-NEXT:  ; %bb.2:
9519; GFX8-NEXT:    s_mov_b64 exec, s[6:7]
9520; GFX8-NEXT:    s_mov_b64 s[6:7], 0
9521; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
9522; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
9523; GFX8-NEXT:  .LBB28_3: ; %atomicrmw.start
9524; GFX8-NEXT:    ; =>This Loop Header: Depth=1
9525; GFX8-NEXT:    ; Child Loop BB28_4 Depth 2
9526; GFX8-NEXT:    s_waitcnt vmcnt(0)
9527; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
9528; GFX8-NEXT:    v_add_f32_e32 v4, v4, v8
9529; GFX8-NEXT:    v_bfe_u32 v5, v4, 16, 1
9530; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v4
9531; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
9532; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v4
9533; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
9534; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v10, vcc
9535; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
9536; GFX8-NEXT:    v_add_f32_e32 v5, v5, v9
9537; GFX8-NEXT:    v_bfe_u32 v10, v5, 16, 1
9538; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v10, v5
9539; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 0x7fff, v10
9540; GFX8-NEXT:    v_or_b32_e32 v11, 0x400000, v5
9541; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
9542; GFX8-NEXT:    v_cndmask_b32_e32 v5, v10, v11, vcc
9543; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
9544; GFX8-NEXT:    v_alignbit_b32 v5, v5, v4, 16
9545; GFX8-NEXT:    v_mov_b32_e32 v4, v5
9546; GFX8-NEXT:    s_mov_b64 s[12:13], exec
9547; GFX8-NEXT:    v_mov_b32_e32 v5, v6
9548; GFX8-NEXT:  .LBB28_4: ; Parent Loop BB28_3 Depth=1
9549; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
9550; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
9551; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
9552; GFX8-NEXT:    v_readfirstlane_b32 s10, v2
9553; GFX8-NEXT:    v_readfirstlane_b32 s11, v3
9554; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
9555; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
9556; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
9557; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
9558; GFX8-NEXT:    s_waitcnt vmcnt(0)
9559; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
9560; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
9561; GFX8-NEXT:    s_cbranch_execnz .LBB28_4
9562; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB28_3 Depth=1
9563; GFX8-NEXT:    s_mov_b64 exec, s[12:13]
9564; GFX8-NEXT:    s_waitcnt vmcnt(0)
9565; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
9566; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
9567; GFX8-NEXT:    v_mov_b32_e32 v6, v4
9568; GFX8-NEXT:    buffer_wbinvl1
9569; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
9570; GFX8-NEXT:    s_cbranch_execnz .LBB28_3
9571; GFX8-NEXT:  ; %bb.6: ; %atomicrmw.end
9572; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
9573; GFX8-NEXT:    v_mov_b32_e32 v0, v4
9574; GFX8-NEXT:    s_setpc_b64 s[30:31]
9575;
9576; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
9577; GFX7:       ; %bb.0:
9578; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9579; GFX7-NEXT:    v_add_i32_e32 v8, vcc, 0x400, v4
9580; GFX7-NEXT:    s_mov_b64 s[6:7], exec
9581; GFX7-NEXT:  .LBB28_1: ; =>This Inner Loop Header: Depth=1
9582; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
9583; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
9584; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
9585; GFX7-NEXT:    v_readfirstlane_b32 s11, v3
9586; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
9587; GFX7-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
9588; GFX7-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
9589; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
9590; GFX7-NEXT:    buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
9591; GFX7-NEXT:    ; implicit-def: $vgpr4
9592; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
9593; GFX7-NEXT:    s_cbranch_execnz .LBB28_1
9594; GFX7-NEXT:  ; %bb.2:
9595; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
9596; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
9597; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
9598; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v6
9599; GFX7-NEXT:    s_waitcnt vmcnt(0)
9600; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v7
9601; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
9602; GFX7-NEXT:    s_mov_b64 s[6:7], 0
9603; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
9604; GFX7-NEXT:  .LBB28_3: ; %atomicrmw.start
9605; GFX7-NEXT:    ; =>This Loop Header: Depth=1
9606; GFX7-NEXT:    ; Child Loop BB28_4 Depth 2
9607; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v7
9608; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v4
9609; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v7
9610; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v5
9611; GFX7-NEXT:    v_add_f32_e32 v4, v4, v10
9612; GFX7-NEXT:    v_add_f32_e32 v6, v6, v9
9613; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
9614; GFX7-NEXT:    v_alignbit_b32 v4, v4, v6, 16
9615; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v7
9616; GFX7-NEXT:    v_alignbit_b32 v5, v6, v5, 16
9617; GFX7-NEXT:    v_mov_b32_e32 v7, v5
9618; GFX7-NEXT:    s_mov_b64 s[12:13], exec
9619; GFX7-NEXT:    v_mov_b32_e32 v6, v4
9620; GFX7-NEXT:  .LBB28_4: ; Parent Loop BB28_3 Depth=1
9621; GFX7-NEXT:    ; => This Inner Loop Header: Depth=2
9622; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
9623; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
9624; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
9625; GFX7-NEXT:    v_readfirstlane_b32 s11, v3
9626; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
9627; GFX7-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
9628; GFX7-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
9629; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
9630; GFX7-NEXT:    s_waitcnt vmcnt(0)
9631; GFX7-NEXT:    buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc
9632; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
9633; GFX7-NEXT:    s_cbranch_execnz .LBB28_4
9634; GFX7-NEXT:  ; %bb.5: ; in Loop: Header=BB28_3 Depth=1
9635; GFX7-NEXT:    s_mov_b64 exec, s[12:13]
9636; GFX7-NEXT:    s_waitcnt vmcnt(0)
9637; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
9638; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v6
9639; GFX7-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
9640; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
9641; GFX7-NEXT:    buffer_wbinvl1
9642; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
9643; GFX7-NEXT:    s_cbranch_execnz .LBB28_3
9644; GFX7-NEXT:  ; %bb.6: ; %atomicrmw.end
9645; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
9646; GFX7-NEXT:    v_mov_b32_e32 v0, v7
9647; GFX7-NEXT:    v_mov_b32_e32 v1, v4
9648; GFX7-NEXT:    s_setpc_b64 s[30:31]
9649;
9650; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
9651; GFX6:       ; %bb.0:
9652; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9653; GFX6-NEXT:    v_add_i32_e32 v8, vcc, 0x400, v4
9654; GFX6-NEXT:    s_mov_b64 s[6:7], exec
9655; GFX6-NEXT:  .LBB28_1: ; =>This Inner Loop Header: Depth=1
9656; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
9657; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
9658; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
9659; GFX6-NEXT:    v_readfirstlane_b32 s11, v3
9660; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
9661; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
9662; GFX6-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
9663; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
9664; GFX6-NEXT:    buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
9665; GFX6-NEXT:    ; implicit-def: $vgpr4
9666; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
9667; GFX6-NEXT:    s_cbranch_execnz .LBB28_1
9668; GFX6-NEXT:  ; %bb.2:
9669; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
9670; GFX6-NEXT:    v_mul_f32_e32 v5, 1.0, v5
9671; GFX6-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
9672; GFX6-NEXT:    v_mul_f32_e32 v5, 1.0, v6
9673; GFX6-NEXT:    s_waitcnt vmcnt(0)
9674; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v7
9675; GFX6-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
9676; GFX6-NEXT:    s_mov_b64 s[6:7], 0
9677; GFX6-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
9678; GFX6-NEXT:  .LBB28_3: ; %atomicrmw.start
9679; GFX6-NEXT:    ; =>This Loop Header: Depth=1
9680; GFX6-NEXT:    ; Child Loop BB28_4 Depth 2
9681; GFX6-NEXT:    v_mul_f32_e32 v5, 1.0, v7
9682; GFX6-NEXT:    v_mul_f32_e32 v7, 1.0, v4
9683; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v7
9684; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff0000, v5
9685; GFX6-NEXT:    v_add_f32_e32 v4, v4, v10
9686; GFX6-NEXT:    v_add_f32_e32 v6, v6, v9
9687; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
9688; GFX6-NEXT:    v_alignbit_b32 v4, v4, v6, 16
9689; GFX6-NEXT:    v_lshrrev_b32_e32 v6, 16, v7
9690; GFX6-NEXT:    v_alignbit_b32 v5, v6, v5, 16
9691; GFX6-NEXT:    v_mov_b32_e32 v7, v5
9692; GFX6-NEXT:    s_mov_b64 s[12:13], exec
9693; GFX6-NEXT:    v_mov_b32_e32 v6, v4
9694; GFX6-NEXT:  .LBB28_4: ; Parent Loop BB28_3 Depth=1
9695; GFX6-NEXT:    ; => This Inner Loop Header: Depth=2
9696; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
9697; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
9698; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
9699; GFX6-NEXT:    v_readfirstlane_b32 s11, v3
9700; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
9701; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
9702; GFX6-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
9703; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
9704; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
9705; GFX6-NEXT:    buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc
9706; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
9707; GFX6-NEXT:    s_cbranch_execnz .LBB28_4
9708; GFX6-NEXT:  ; %bb.5: ; in Loop: Header=BB28_3 Depth=1
9709; GFX6-NEXT:    s_mov_b64 exec, s[12:13]
9710; GFX6-NEXT:    s_waitcnt vmcnt(0)
9711; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
9712; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v6
9713; GFX6-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
9714; GFX6-NEXT:    s_waitcnt expcnt(0)
9715; GFX6-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
9716; GFX6-NEXT:    buffer_wbinvl1
9717; GFX6-NEXT:    s_andn2_b64 exec, exec, s[6:7]
9718; GFX6-NEXT:    s_cbranch_execnz .LBB28_3
9719; GFX6-NEXT:  ; %bb.6: ; %atomicrmw.end
9720; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
9721; GFX6-NEXT:    v_mov_b32_e32 v0, v7
9722; GFX6-NEXT:    v_mov_b32_e32 v1, v4
9723; GFX6-NEXT:    s_setpc_b64 s[30:31]
9724  %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256
9725  %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
9726  ret <2 x bfloat> %result
9727}
9728
; Value-returning `atomicrmw fadd` of <2 x bfloat> through a buffer fat pointer
; (addrspace(7)) at syncscope("agent") with seq_cst ordering. The atomicrmw in
; this function carries no !amdgpu.no.fine.grained.memory metadata.
;
; What the checks in this function expect:
;   * GFX12: a single buffer_atomic_pk_add_bf16 with th:TH_ATOMIC_RETURN.
;   * All other RUN prefixes (GFX940, GFX11, GFX10, GFX90A, GFX908, GFX8, GFX7,
;     GFX6): an initial buffer load followed by a buffer_atomic_cmpswap retry
;     loop (block %atomicrmw.start) that recomputes the two bf16 lanes as f32.
; The check lines are autogenerated (see the NOTE at the top of the file); do
; not edit them by hand.
9729define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) {
9730; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
9731; GFX12:       ; %bb.0:
9732; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
9733; GFX12-NEXT:    s_wait_expcnt 0x0
9734; GFX12-NEXT:    s_wait_samplecnt 0x0
9735; GFX12-NEXT:    s_wait_bvhcnt 0x0
9736; GFX12-NEXT:    s_wait_kmcnt 0x0
9737; GFX12-NEXT:    v_mov_b32_e32 v1, s16
9738; GFX12-NEXT:    s_wait_storecnt 0x0
9739; GFX12-NEXT:    buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
9740; GFX12-NEXT:    s_wait_loadcnt 0x0
9741; GFX12-NEXT:    global_inv scope:SCOPE_DEV
9742; GFX12-NEXT:    s_setpc_b64 s[30:31]
9743;
9744; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
9745; GFX940:       ; %bb.0:
9746; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9747; GFX940-NEXT:    v_mov_b32_e32 v1, v0
9748; GFX940-NEXT:    v_mov_b32_e32 v0, s16
9749; GFX940-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
9750; GFX940-NEXT:    s_add_i32 s4, s16, 0x400
9751; GFX940-NEXT:    s_mov_b64 s[6:7], 0
9752; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
9753; GFX940-NEXT:    s_movk_i32 s8, 0x7fff
9754; GFX940-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
9755; GFX940-NEXT:    s_mov_b32 s9, 0x7060302
9756; GFX940-NEXT:    v_mov_b32_e32 v4, s4
9757; GFX940-NEXT:  .LBB29_1: ; %atomicrmw.start
9758; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
9759; GFX940-NEXT:    s_waitcnt vmcnt(0)
9760; GFX940-NEXT:    v_mov_b32_e32 v7, v0
9761; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 16, v7
9762; GFX940-NEXT:    v_and_b32_e32 v1, 0xffff0000, v7
9763; GFX940-NEXT:    v_add_f32_e32 v0, v0, v2
9764; GFX940-NEXT:    v_add_f32_e32 v1, v1, v3
9765; GFX940-NEXT:    v_bfe_u32 v5, v0, 16, 1
9766; GFX940-NEXT:    v_bfe_u32 v8, v1, 16, 1
9767; GFX940-NEXT:    v_or_b32_e32 v6, 0x400000, v0
9768; GFX940-NEXT:    v_or_b32_e32 v9, 0x400000, v1
9769; GFX940-NEXT:    v_add3_u32 v5, v5, v0, s8
9770; GFX940-NEXT:    v_add3_u32 v8, v8, v1, s8
9771; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
9772; GFX940-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
9773; GFX940-NEXT:    buffer_wbl2 sc1
9774; GFX940-NEXT:    v_cndmask_b32_e32 v1, v8, v9, vcc
9775; GFX940-NEXT:    v_cndmask_b32_e64 v0, v5, v6, s[4:5]
9776; GFX940-NEXT:    v_perm_b32 v6, v1, v0, s9
9777; GFX940-NEXT:    v_mov_b64_e32 v[0:1], v[6:7]
9778; GFX940-NEXT:    buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0
9779; GFX940-NEXT:    s_waitcnt vmcnt(0)
9780; GFX940-NEXT:    buffer_inv sc1
9781; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v7
9782; GFX940-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
9783; GFX940-NEXT:    s_andn2_b64 exec, exec, s[6:7]
9784; GFX940-NEXT:    s_cbranch_execnz .LBB29_1
9785; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
9786; GFX940-NEXT:    s_or_b64 exec, exec, s[6:7]
9787; GFX940-NEXT:    s_setpc_b64 s[30:31]
9788;
9789; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
9790; GFX11:       ; %bb.0:
9791; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9792; GFX11-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
9793; GFX11-NEXT:    s_add_i32 s4, s16, 0x400
9794; GFX11-NEXT:    s_mov_b32 s5, 0
9795; GFX11-NEXT:    v_mov_b32_e32 v4, s4
9796; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
9797; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
9798; GFX11-NEXT:    buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
9799; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
9800; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
9801; GFX11-NEXT:    .p2align 6
9802; GFX11-NEXT:  .LBB29_1: ; %atomicrmw.start
9803; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
9804; GFX11-NEXT:    s_waitcnt vmcnt(0)
9805; GFX11-NEXT:    v_mov_b32_e32 v6, v0
9806; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
9807; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9808; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v6
9809; GFX11-NEXT:    v_add_f32_e32 v1, v1, v3
9810; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
9811; GFX11-NEXT:    v_bfe_u32 v7, v1, 16, 1
9812; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v1
9813; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
9814; GFX11-NEXT:    v_add3_u32 v7, v7, v1, 0x7fff
9815; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9816; GFX11-NEXT:    v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6
9817; GFX11-NEXT:    v_add_f32_e32 v0, v0, v2
9818; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
9819; GFX11-NEXT:    v_bfe_u32 v5, v0, 16, 1
9820; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v0
9821; GFX11-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
9822; GFX11-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
9823; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9824; GFX11-NEXT:    v_cndmask_b32_e64 v0, v5, v8, s4
9825; GFX11-NEXT:    v_perm_b32 v5, v1, v0, 0x7060302
9826; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
9827; GFX11-NEXT:    v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
9828; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
9829; GFX11-NEXT:    s_waitcnt vmcnt(0)
9830; GFX11-NEXT:    buffer_gl1_inv
9831; GFX11-NEXT:    buffer_gl0_inv
9832; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
9833; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
9834; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
9835; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
9836; GFX11-NEXT:    s_cbranch_execnz .LBB29_1
9837; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
9838; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
9839; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s5
9840; GFX11-NEXT:    s_setpc_b64 s[30:31]
9841;
9842; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
9843; GFX10:       ; %bb.0:
9844; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9845; GFX10-NEXT:    v_mov_b32_e32 v1, v0
9846; GFX10-NEXT:    v_mov_b32_e32 v0, s20
9847; GFX10-NEXT:    s_add_i32 s4, s20, 0x400
9848; GFX10-NEXT:    s_mov_b32 s5, 0
9849; GFX10-NEXT:    v_mov_b32_e32 v4, s4
9850; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
9851; GFX10-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
9852; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
9853; GFX10-NEXT:  .LBB29_1: ; %atomicrmw.start
9854; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
9855; GFX10-NEXT:    s_waitcnt vmcnt(0)
9856; GFX10-NEXT:    v_mov_b32_e32 v6, v0
9857; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
9858; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
9859; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v6
9860; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
9861; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
9862; GFX10-NEXT:    v_bfe_u32 v5, v0, 16, 1
9863; GFX10-NEXT:    v_bfe_u32 v7, v1, 16, 1
9864; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
9865; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v1
9866; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
9867; GFX10-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
9868; GFX10-NEXT:    v_add3_u32 v7, v7, v1, 0x7fff
9869; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
9870; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v9, vcc_lo
9871; GFX10-NEXT:    v_cndmask_b32_e64 v0, v5, v8, s4
9872; GFX10-NEXT:    v_perm_b32 v5, v1, v0, 0x7060302
9873; GFX10-NEXT:    v_mov_b32_e32 v0, v5
9874; GFX10-NEXT:    v_mov_b32_e32 v1, v6
9875; GFX10-NEXT:    buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
9876; GFX10-NEXT:    s_waitcnt vmcnt(0)
9877; GFX10-NEXT:    buffer_gl1_inv
9878; GFX10-NEXT:    buffer_gl0_inv
9879; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
9880; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
9881; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
9882; GFX10-NEXT:    s_cbranch_execnz .LBB29_1
9883; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
9884; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
9885; GFX10-NEXT:    s_setpc_b64 s[30:31]
9886;
9887; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
9888; GFX90A:       ; %bb.0:
9889; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9890; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
9891; GFX90A-NEXT:    v_mov_b32_e32 v0, s20
9892; GFX90A-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
9893; GFX90A-NEXT:    s_add_i32 s4, s20, 0x400
9894; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
9895; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
9896; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
9897; GFX90A-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
9898; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
9899; GFX90A-NEXT:    v_mov_b32_e32 v4, s4
9900; GFX90A-NEXT:  .LBB29_1: ; %atomicrmw.start
9901; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
9902; GFX90A-NEXT:    s_waitcnt vmcnt(0)
9903; GFX90A-NEXT:    v_mov_b32_e32 v7, v0
9904; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v7
9905; GFX90A-NEXT:    v_and_b32_e32 v1, 0xffff0000, v7
9906; GFX90A-NEXT:    v_add_f32_e32 v0, v0, v2
9907; GFX90A-NEXT:    v_add_f32_e32 v1, v1, v3
9908; GFX90A-NEXT:    v_bfe_u32 v5, v0, 16, 1
9909; GFX90A-NEXT:    v_bfe_u32 v8, v1, 16, 1
9910; GFX90A-NEXT:    v_or_b32_e32 v6, 0x400000, v0
9911; GFX90A-NEXT:    v_or_b32_e32 v9, 0x400000, v1
9912; GFX90A-NEXT:    v_add3_u32 v5, v5, v0, s8
9913; GFX90A-NEXT:    v_add3_u32 v8, v8, v1, s8
9914; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
9915; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
9916; GFX90A-NEXT:    v_cndmask_b32_e64 v0, v5, v6, s[4:5]
9917; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v8, v9, vcc
9918; GFX90A-NEXT:    v_perm_b32 v6, v1, v0, s9
9919; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1]
9920; GFX90A-NEXT:    buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
9921; GFX90A-NEXT:    s_waitcnt vmcnt(0)
9922; GFX90A-NEXT:    buffer_wbinvl1
9923; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v7
9924; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
9925; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
9926; GFX90A-NEXT:    s_cbranch_execnz .LBB29_1
9927; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
9928; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
9929; GFX90A-NEXT:    s_setpc_b64 s[30:31]
9930;
9931; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
9932; GFX908:       ; %bb.0:
9933; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9934; GFX908-NEXT:    v_mov_b32_e32 v1, v0
9935; GFX908-NEXT:    v_mov_b32_e32 v0, s20
9936; GFX908-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
9937; GFX908-NEXT:    s_add_i32 s4, s20, 0x400
9938; GFX908-NEXT:    s_mov_b64 s[6:7], 0
9939; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
9940; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
9941; GFX908-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
9942; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
9943; GFX908-NEXT:    v_mov_b32_e32 v4, s4
9944; GFX908-NEXT:  .LBB29_1: ; %atomicrmw.start
9945; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
9946; GFX908-NEXT:    s_waitcnt vmcnt(0)
9947; GFX908-NEXT:    v_mov_b32_e32 v6, v0
9948; GFX908-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
9949; GFX908-NEXT:    v_and_b32_e32 v1, 0xffff0000, v6
9950; GFX908-NEXT:    v_add_f32_e32 v0, v0, v2
9951; GFX908-NEXT:    v_add_f32_e32 v1, v1, v3
9952; GFX908-NEXT:    v_bfe_u32 v5, v0, 16, 1
9953; GFX908-NEXT:    v_bfe_u32 v8, v1, 16, 1
9954; GFX908-NEXT:    v_or_b32_e32 v7, 0x400000, v0
9955; GFX908-NEXT:    v_or_b32_e32 v9, 0x400000, v1
9956; GFX908-NEXT:    v_add3_u32 v5, v5, v0, s8
9957; GFX908-NEXT:    v_add3_u32 v8, v8, v1, s8
9958; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
9959; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
9960; GFX908-NEXT:    v_cndmask_b32_e64 v0, v5, v7, s[4:5]
9961; GFX908-NEXT:    v_cndmask_b32_e32 v1, v8, v9, vcc
9962; GFX908-NEXT:    v_perm_b32 v5, v1, v0, s9
9963; GFX908-NEXT:    v_mov_b32_e32 v0, v5
9964; GFX908-NEXT:    v_mov_b32_e32 v1, v6
9965; GFX908-NEXT:    buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
9966; GFX908-NEXT:    s_waitcnt vmcnt(0)
9967; GFX908-NEXT:    buffer_wbinvl1
9968; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
9969; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
9970; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
9971; GFX908-NEXT:    s_cbranch_execnz .LBB29_1
9972; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
9973; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
9974; GFX908-NEXT:    s_setpc_b64 s[30:31]
9975;
9976; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
9977; GFX8:       ; %bb.0:
9978; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9979; GFX8-NEXT:    v_mov_b32_e32 v1, v0
9980; GFX8-NEXT:    v_mov_b32_e32 v0, s20
9981; GFX8-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
9982; GFX8-NEXT:    s_add_i32 s4, s20, 0x400
9983; GFX8-NEXT:    s_mov_b64 s[6:7], 0
9984; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
9985; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
9986; GFX8-NEXT:    v_mov_b32_e32 v4, s4
9987; GFX8-NEXT:  .LBB29_1: ; %atomicrmw.start
9988; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
9989; GFX8-NEXT:    s_waitcnt vmcnt(0)
9990; GFX8-NEXT:    v_mov_b32_e32 v6, v0
9991; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
9992; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v6
9993; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
9994; GFX8-NEXT:    v_add_f32_e32 v1, v1, v3
9995; GFX8-NEXT:    v_bfe_u32 v5, v0, 16, 1
9996; GFX8-NEXT:    v_bfe_u32 v8, v1, 16, 1
9997; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v0
9998; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v1
9999; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
10000; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x7fff, v8
10001; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v1
10002; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
10003; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v0
10004; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
10005; GFX8-NEXT:    v_cndmask_b32_e32 v1, v8, v9, vcc
10006; GFX8-NEXT:    v_cndmask_b32_e64 v0, v5, v7, s[4:5]
10007; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
10008; GFX8-NEXT:    v_alignbit_b32 v5, v1, v0, 16
10009; GFX8-NEXT:    v_mov_b32_e32 v0, v5
10010; GFX8-NEXT:    v_mov_b32_e32 v1, v6
10011; GFX8-NEXT:    buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
10012; GFX8-NEXT:    s_waitcnt vmcnt(0)
10013; GFX8-NEXT:    buffer_wbinvl1
10014; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
10015; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
10016; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
10017; GFX8-NEXT:    s_cbranch_execnz .LBB29_1
10018; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
10019; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
10020; GFX8-NEXT:    s_setpc_b64 s[30:31]
10021;
10022; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
10023; GFX7:       ; %bb.0:
10024; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10025; GFX7-NEXT:    v_mov_b32_e32 v2, s20
10026; GFX7-NEXT:    buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024
10027; GFX7-NEXT:    s_add_i32 s6, s20, 0x400
10028; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
10029; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
10030; GFX7-NEXT:    s_mov_b64 s[4:5], 0
10031; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
10032; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
10033; GFX7-NEXT:    s_waitcnt vmcnt(0)
10034; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
10035; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
10036; GFX7-NEXT:    v_mov_b32_e32 v4, s6
10037; GFX7-NEXT:  .LBB29_1: ; %atomicrmw.start
10038; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
10039; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
10040; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
10041; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
10042; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
10043; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
10044; GFX7-NEXT:    v_add_f32_e32 v6, v6, v3
10045; GFX7-NEXT:    v_add_f32_e32 v5, v5, v2
10046; GFX7-NEXT:    v_alignbit_b32 v1, v1, v0, 16
10047; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
10048; GFX7-NEXT:    v_alignbit_b32 v0, v0, v5, 16
10049; GFX7-NEXT:    v_mov_b32_e32 v6, v1
10050; GFX7-NEXT:    v_mov_b32_e32 v5, v0
10051; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
10052; GFX7-NEXT:    s_waitcnt vmcnt(0)
10053; GFX7-NEXT:    buffer_wbinvl1
10054; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v1
10055; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v5
10056; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10057; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
10058; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10059; GFX7-NEXT:    s_cbranch_execnz .LBB29_1
10060; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
10061; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
10062; GFX7-NEXT:    s_setpc_b64 s[30:31]
10063;
10064; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
10065; GFX6:       ; %bb.0:
10066; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10067; GFX6-NEXT:    v_mov_b32_e32 v2, s20
10068; GFX6-NEXT:    buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024
10069; GFX6-NEXT:    s_add_i32 s6, s20, 0x400
10070; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v0
10071; GFX6-NEXT:    v_mul_f32_e32 v1, 1.0, v1
10072; GFX6-NEXT:    s_mov_b64 s[4:5], 0
10073; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
10074; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
10075; GFX6-NEXT:    s_waitcnt vmcnt(0)
10076; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
10077; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
10078; GFX6-NEXT:    v_mov_b32_e32 v4, s6
10079; GFX6-NEXT:  .LBB29_1: ; %atomicrmw.start
10080; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
10081; GFX6-NEXT:    v_mul_f32_e32 v1, 1.0, v1
10082; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v0
10083; GFX6-NEXT:    s_waitcnt expcnt(0)
10084; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
10085; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
10086; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
10087; GFX6-NEXT:    v_add_f32_e32 v6, v6, v3
10088; GFX6-NEXT:    v_add_f32_e32 v5, v5, v2
10089; GFX6-NEXT:    v_alignbit_b32 v1, v1, v0, 16
10090; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
10091; GFX6-NEXT:    v_alignbit_b32 v0, v0, v5, 16
10092; GFX6-NEXT:    v_mov_b32_e32 v6, v1
10093; GFX6-NEXT:    v_mov_b32_e32 v5, v0
10094; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
10095; GFX6-NEXT:    s_waitcnt vmcnt(0)
10096; GFX6-NEXT:    buffer_wbinvl1
10097; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v1
10098; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff0000, v5
10099; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10100; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
10101; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10102; GFX6-NEXT:    s_cbranch_execnz .LBB29_1
10103; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
10104; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
10105; GFX6-NEXT:    s_waitcnt expcnt(0)
10106; GFX6-NEXT:    s_setpc_b64 s[30:31]
; Index 256 of <2 x bfloat> (4 bytes per element) is a 1024-byte displacement;
; it appears as `offset:1024` / `0x400` in the expected output above.
10107  %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256
10108  %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
10109  ret <2 x bfloat> %result
10110}
10111
10112define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) {
10113; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset:
10114; GFX12:       ; %bb.0:
10115; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
10116; GFX12-NEXT:    s_wait_expcnt 0x0
10117; GFX12-NEXT:    s_wait_samplecnt 0x0
10118; GFX12-NEXT:    s_wait_bvhcnt 0x0
10119; GFX12-NEXT:    s_wait_kmcnt 0x0
10120; GFX12-NEXT:    v_mov_b32_e32 v1, s16
10121; GFX12-NEXT:    s_wait_storecnt 0x0
10122; GFX12-NEXT:    buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024
10123; GFX12-NEXT:    s_wait_storecnt 0x0
10124; GFX12-NEXT:    global_inv scope:SCOPE_DEV
10125; GFX12-NEXT:    s_setpc_b64 s[30:31]
10126;
10127; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset:
10128; GFX940:       ; %bb.0:
10129; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10130; GFX940-NEXT:    v_mov_b32_e32 v1, s16
10131; GFX940-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
10132; GFX940-NEXT:    s_add_i32 s4, s16, 0x400
10133; GFX940-NEXT:    s_mov_b64 s[6:7], 0
10134; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
10135; GFX940-NEXT:    s_movk_i32 s8, 0x7fff
10136; GFX940-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
10137; GFX940-NEXT:    s_mov_b32 s9, 0x7060302
10138; GFX940-NEXT:    v_mov_b32_e32 v4, s4
10139; GFX940-NEXT:  .LBB30_1: ; %atomicrmw.start
10140; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
10141; GFX940-NEXT:    s_waitcnt vmcnt(0)
10142; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
10143; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
10144; GFX940-NEXT:    v_add_f32_e32 v0, v0, v2
10145; GFX940-NEXT:    v_add_f32_e32 v5, v5, v3
10146; GFX940-NEXT:    v_bfe_u32 v6, v0, 16, 1
10147; GFX940-NEXT:    v_bfe_u32 v8, v5, 16, 1
10148; GFX940-NEXT:    v_or_b32_e32 v7, 0x400000, v0
10149; GFX940-NEXT:    v_or_b32_e32 v9, 0x400000, v5
10150; GFX940-NEXT:    v_add3_u32 v6, v6, v0, s8
10151; GFX940-NEXT:    v_add3_u32 v8, v8, v5, s8
10152; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
10153; GFX940-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
10154; GFX940-NEXT:    buffer_wbl2 sc1
10155; GFX940-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
10156; GFX940-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
10157; GFX940-NEXT:    v_perm_b32 v0, v5, v0, s9
10158; GFX940-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
10159; GFX940-NEXT:    buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0
10160; GFX940-NEXT:    s_waitcnt vmcnt(0)
10161; GFX940-NEXT:    buffer_inv sc1
10162; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v1
10163; GFX940-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
10164; GFX940-NEXT:    v_mov_b32_e32 v1, v6
10165; GFX940-NEXT:    s_andn2_b64 exec, exec, s[6:7]
10166; GFX940-NEXT:    s_cbranch_execnz .LBB30_1
10167; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
10168; GFX940-NEXT:    s_or_b64 exec, exec, s[6:7]
10169; GFX940-NEXT:    s_setpc_b64 s[30:31]
10170;
10171; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset:
10172; GFX11:       ; %bb.0:
10173; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10174; GFX11-NEXT:    v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
10175; GFX11-NEXT:    s_add_i32 s4, s16, 0x400
10176; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
10177; GFX11-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
10178; GFX11-NEXT:    buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
10179; GFX11-NEXT:    s_mov_b32 s5, 0
10180; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
10181; GFX11-NEXT:    .p2align 6
10182; GFX11-NEXT:  .LBB30_1: ; %atomicrmw.start
10183; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
10184; GFX11-NEXT:    s_waitcnt vmcnt(0)
10185; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
10186; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
10187; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
10188; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10189; GFX11-NEXT:    v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2
10190; GFX11-NEXT:    v_bfe_u32 v7, v5, 16, 1
10191; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
10192; GFX11-NEXT:    v_bfe_u32 v6, v0, 16, 1
10193; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v0
10194; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v5
10195; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
10196; GFX11-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
10197; GFX11-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
10198; GFX11-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
10199; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
10200; GFX11-NEXT:    v_cndmask_b32_e32 v5, v7, v9, vcc_lo
10201; GFX11-NEXT:    v_cndmask_b32_e64 v0, v6, v8, s4
10202; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10203; GFX11-NEXT:    v_perm_b32 v0, v5, v0, 0x7060302
10204; GFX11-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
10205; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
10206; GFX11-NEXT:    s_waitcnt vmcnt(0)
10207; GFX11-NEXT:    buffer_gl1_inv
10208; GFX11-NEXT:    buffer_gl0_inv
10209; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v1
10210; GFX11-NEXT:    v_mov_b32_e32 v1, v5
10211; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
10212; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
10213; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
10214; GFX11-NEXT:    s_cbranch_execnz .LBB30_1
10215; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
10216; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
10217; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s5
10218; GFX11-NEXT:    s_setpc_b64 s[30:31]
10219;
10220; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset:
10221; GFX10:       ; %bb.0:
10222; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10223; GFX10-NEXT:    v_mov_b32_e32 v1, s20
10224; GFX10-NEXT:    s_add_i32 s4, s20, 0x400
10225; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
10226; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
10227; GFX10-NEXT:    v_mov_b32_e32 v4, s4
10228; GFX10-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
10229; GFX10-NEXT:    s_mov_b32 s5, 0
10230; GFX10-NEXT:  .LBB30_1: ; %atomicrmw.start
10231; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
10232; GFX10-NEXT:    s_waitcnt vmcnt(0)
10233; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
10234; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
10235; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
10236; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
10237; GFX10-NEXT:    v_add_f32_e32 v5, v5, v3
10238; GFX10-NEXT:    v_bfe_u32 v6, v0, 16, 1
10239; GFX10-NEXT:    v_bfe_u32 v7, v5, 16, 1
10240; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
10241; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v5
10242; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
10243; GFX10-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
10244; GFX10-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
10245; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
10246; GFX10-NEXT:    v_cndmask_b32_e32 v5, v7, v9, vcc_lo
10247; GFX10-NEXT:    v_cndmask_b32_e64 v0, v6, v8, s4
10248; GFX10-NEXT:    v_perm_b32 v0, v5, v0, 0x7060302
10249; GFX10-NEXT:    v_mov_b32_e32 v6, v1
10250; GFX10-NEXT:    v_mov_b32_e32 v5, v0
10251; GFX10-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
10252; GFX10-NEXT:    s_waitcnt vmcnt(0)
10253; GFX10-NEXT:    buffer_gl1_inv
10254; GFX10-NEXT:    buffer_gl0_inv
10255; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v1
10256; GFX10-NEXT:    v_mov_b32_e32 v1, v5
10257; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
10258; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
10259; GFX10-NEXT:    s_cbranch_execnz .LBB30_1
10260; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
10261; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
10262; GFX10-NEXT:    s_setpc_b64 s[30:31]
10263;
10264; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset:
10265; GFX90A:       ; %bb.0:
10266; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10267; GFX90A-NEXT:    v_mov_b32_e32 v1, s20
10268; GFX90A-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
10269; GFX90A-NEXT:    s_add_i32 s4, s20, 0x400
10270; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
10271; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
10272; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
10273; GFX90A-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
10274; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
10275; GFX90A-NEXT:    v_mov_b32_e32 v4, s4
10276; GFX90A-NEXT:  .LBB30_1: ; %atomicrmw.start
10277; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
10278; GFX90A-NEXT:    s_waitcnt vmcnt(0)
10279; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
10280; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
10281; GFX90A-NEXT:    v_add_f32_e32 v0, v0, v2
10282; GFX90A-NEXT:    v_add_f32_e32 v5, v5, v3
10283; GFX90A-NEXT:    v_bfe_u32 v6, v0, 16, 1
10284; GFX90A-NEXT:    v_bfe_u32 v8, v5, 16, 1
10285; GFX90A-NEXT:    v_or_b32_e32 v7, 0x400000, v0
10286; GFX90A-NEXT:    v_or_b32_e32 v9, 0x400000, v5
10287; GFX90A-NEXT:    v_add3_u32 v6, v6, v0, s8
10288; GFX90A-NEXT:    v_add3_u32 v8, v8, v5, s8
10289; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
10290; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
10291; GFX90A-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
10292; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
10293; GFX90A-NEXT:    v_perm_b32 v0, v5, v0, s9
10294; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
10295; GFX90A-NEXT:    buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc
10296; GFX90A-NEXT:    s_waitcnt vmcnt(0)
10297; GFX90A-NEXT:    buffer_wbinvl1
10298; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v1
10299; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
10300; GFX90A-NEXT:    v_mov_b32_e32 v1, v6
10301; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
10302; GFX90A-NEXT:    s_cbranch_execnz .LBB30_1
10303; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
10304; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
10305; GFX90A-NEXT:    s_setpc_b64 s[30:31]
10306;
10307; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset:
10308; GFX908:       ; %bb.0:
10309; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10310; GFX908-NEXT:    v_mov_b32_e32 v1, s20
10311; GFX908-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
10312; GFX908-NEXT:    s_add_i32 s4, s20, 0x400
10313; GFX908-NEXT:    s_mov_b64 s[6:7], 0
10314; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
10315; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
10316; GFX908-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
10317; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
10318; GFX908-NEXT:    v_mov_b32_e32 v4, s4
10319; GFX908-NEXT:  .LBB30_1: ; %atomicrmw.start
10320; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
10321; GFX908-NEXT:    s_waitcnt vmcnt(0)
10322; GFX908-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
10323; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
10324; GFX908-NEXT:    v_add_f32_e32 v0, v0, v2
10325; GFX908-NEXT:    v_add_f32_e32 v5, v5, v3
10326; GFX908-NEXT:    v_bfe_u32 v6, v0, 16, 1
10327; GFX908-NEXT:    v_bfe_u32 v8, v5, 16, 1
10328; GFX908-NEXT:    v_or_b32_e32 v7, 0x400000, v0
10329; GFX908-NEXT:    v_or_b32_e32 v9, 0x400000, v5
10330; GFX908-NEXT:    v_add3_u32 v6, v6, v0, s8
10331; GFX908-NEXT:    v_add3_u32 v8, v8, v5, s8
10332; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
10333; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
10334; GFX908-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
10335; GFX908-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
10336; GFX908-NEXT:    v_perm_b32 v0, v5, v0, s9
10337; GFX908-NEXT:    v_mov_b32_e32 v6, v1
10338; GFX908-NEXT:    v_mov_b32_e32 v5, v0
10339; GFX908-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
10340; GFX908-NEXT:    s_waitcnt vmcnt(0)
10341; GFX908-NEXT:    buffer_wbinvl1
10342; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v1
10343; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
10344; GFX908-NEXT:    v_mov_b32_e32 v1, v5
10345; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
10346; GFX908-NEXT:    s_cbranch_execnz .LBB30_1
10347; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
10348; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
10349; GFX908-NEXT:    s_setpc_b64 s[30:31]
10350;
10351; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset:
10352; GFX8:       ; %bb.0:
10353; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10354; GFX8-NEXT:    v_mov_b32_e32 v1, s20
10355; GFX8-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
10356; GFX8-NEXT:    s_add_i32 s4, s20, 0x400
10357; GFX8-NEXT:    s_mov_b64 s[6:7], 0
10358; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
10359; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
10360; GFX8-NEXT:    v_mov_b32_e32 v4, s4
10361; GFX8-NEXT:  .LBB30_1: ; %atomicrmw.start
10362; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
10363; GFX8-NEXT:    s_waitcnt vmcnt(0)
10364; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
10365; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
10366; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
10367; GFX8-NEXT:    v_add_f32_e32 v5, v5, v3
10368; GFX8-NEXT:    v_bfe_u32 v6, v0, 16, 1
10369; GFX8-NEXT:    v_bfe_u32 v8, v5, 16, 1
10370; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v0
10371; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v5
10372; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x7fff, v6
10373; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x7fff, v8
10374; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v5
10375; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
10376; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v0
10377; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
10378; GFX8-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
10379; GFX8-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
10380; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
10381; GFX8-NEXT:    v_alignbit_b32 v0, v5, v0, 16
10382; GFX8-NEXT:    v_mov_b32_e32 v6, v1
10383; GFX8-NEXT:    v_mov_b32_e32 v5, v0
10384; GFX8-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
10385; GFX8-NEXT:    s_waitcnt vmcnt(0)
10386; GFX8-NEXT:    buffer_wbinvl1
10387; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v1
10388; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
10389; GFX8-NEXT:    v_mov_b32_e32 v1, v5
10390; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
10391; GFX8-NEXT:    s_cbranch_execnz .LBB30_1
10392; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
10393; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
10394; GFX8-NEXT:    s_setpc_b64 s[30:31]
10395;
10396; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset:
10397; GFX7:       ; %bb.0:
10398; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10399; GFX7-NEXT:    v_mov_b32_e32 v2, s20
10400; GFX7-NEXT:    buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
10401; GFX7-NEXT:    s_add_i32 s6, s20, 0x400
10402; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
10403; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
10404; GFX7-NEXT:    s_mov_b64 s[4:5], 0
10405; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
10406; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
10407; GFX7-NEXT:    s_waitcnt vmcnt(0)
10408; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
10409; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
10410; GFX7-NEXT:    v_mov_b32_e32 v2, s6
10411; GFX7-NEXT:  .LBB30_1: ; %atomicrmw.start
10412; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
10413; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
10414; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
10415; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
10416; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
10417; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
10418; GFX7-NEXT:    v_add_f32_e32 v6, v6, v1
10419; GFX7-NEXT:    v_add_f32_e32 v5, v5, v0
10420; GFX7-NEXT:    v_alignbit_b32 v4, v3, v4, 16
10421; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v6
10422; GFX7-NEXT:    v_alignbit_b32 v3, v3, v5, 16
10423; GFX7-NEXT:    v_mov_b32_e32 v6, v4
10424; GFX7-NEXT:    v_mov_b32_e32 v5, v3
10425; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
10426; GFX7-NEXT:    s_waitcnt vmcnt(0)
10427; GFX7-NEXT:    buffer_wbinvl1
10428; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
10429; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
10430; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10431; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
10432; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10433; GFX7-NEXT:    s_cbranch_execnz .LBB30_1
10434; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
10435; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
10436; GFX7-NEXT:    s_setpc_b64 s[30:31]
10437;
10438; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset:
10439; GFX6:       ; %bb.0:
10440; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10441; GFX6-NEXT:    v_mov_b32_e32 v2, s20
10442; GFX6-NEXT:    buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
10443; GFX6-NEXT:    s_add_i32 s6, s20, 0x400
10444; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v0
10445; GFX6-NEXT:    v_mul_f32_e32 v1, 1.0, v1
10446; GFX6-NEXT:    s_mov_b64 s[4:5], 0
10447; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
10448; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
10449; GFX6-NEXT:    s_waitcnt vmcnt(0)
10450; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
10451; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
10452; GFX6-NEXT:    v_mov_b32_e32 v2, s6
10453; GFX6-NEXT:  .LBB30_1: ; %atomicrmw.start
10454; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
10455; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
10456; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v4
10457; GFX6-NEXT:    s_waitcnt expcnt(0)
10458; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
10459; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
10460; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
10461; GFX6-NEXT:    v_add_f32_e32 v6, v6, v1
10462; GFX6-NEXT:    v_add_f32_e32 v5, v5, v0
10463; GFX6-NEXT:    v_alignbit_b32 v4, v3, v4, 16
10464; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v6
10465; GFX6-NEXT:    v_alignbit_b32 v3, v3, v5, 16
10466; GFX6-NEXT:    v_mov_b32_e32 v6, v4
10467; GFX6-NEXT:    v_mov_b32_e32 v5, v3
10468; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
10469; GFX6-NEXT:    s_waitcnt vmcnt(0)
10470; GFX6-NEXT:    buffer_wbinvl1
10471; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
10472; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
10473; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10474; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
10475; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10476; GFX6-NEXT:    s_cbranch_execnz .LBB30_1
10477; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
10478; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
10479; GFX6-NEXT:    s_waitcnt expcnt(0)
10480; GFX6-NEXT:    s_setpc_b64 s[30:31]
10481  %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256
10482  %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
10483  ret void
10484}
10485
; Returning agent-scope seq_cst atomicrmw fadd of a <2 x bfloat> through a
; buffer fat pointer (addrspace 7), tagged !amdgpu.no.remote.memory. The gep
; steps 256 <2 x bfloat> elements, which the checks below show as a 1024-byte
; (0x400) offset. Expected codegen: gfx1200 selects a single returning
; buffer_atomic_pk_add_bf16; every other target in the run lines expands to a
; buffer_atomic_cmpswap retry loop that performs the bf16 add in f32.
; The assertion lines are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than hand-editing.
; NOTE(review): attribute group #0 and metadata node !0 are defined outside this
; excerpt - confirm their contents there.
10486define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 {
10487; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory:
10488; GFX12:       ; %bb.0:
10489; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
10490; GFX12-NEXT:    s_wait_expcnt 0x0
10491; GFX12-NEXT:    s_wait_samplecnt 0x0
10492; GFX12-NEXT:    s_wait_bvhcnt 0x0
10493; GFX12-NEXT:    s_wait_kmcnt 0x0
10494; GFX12-NEXT:    v_mov_b32_e32 v1, s16
10495; GFX12-NEXT:    s_wait_storecnt 0x0
10496; GFX12-NEXT:    buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
10497; GFX12-NEXT:    s_wait_loadcnt 0x0
10498; GFX12-NEXT:    global_inv scope:SCOPE_DEV
10499; GFX12-NEXT:    s_setpc_b64 s[30:31]
10500;
10501; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory:
10502; GFX940:       ; %bb.0:
10503; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10504; GFX940-NEXT:    v_mov_b32_e32 v1, v0
10505; GFX940-NEXT:    v_mov_b32_e32 v0, s16
10506; GFX940-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
10507; GFX940-NEXT:    s_add_i32 s4, s16, 0x400
10508; GFX940-NEXT:    s_mov_b64 s[6:7], 0
10509; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
10510; GFX940-NEXT:    s_movk_i32 s8, 0x7fff
10511; GFX940-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
10512; GFX940-NEXT:    s_mov_b32 s9, 0x7060302
10513; GFX940-NEXT:    v_mov_b32_e32 v4, s4
10514; GFX940-NEXT:  .LBB31_1: ; %atomicrmw.start
10515; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
10516; GFX940-NEXT:    s_waitcnt vmcnt(0)
10517; GFX940-NEXT:    v_mov_b32_e32 v7, v0
10518; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 16, v7
10519; GFX940-NEXT:    v_and_b32_e32 v1, 0xffff0000, v7
10520; GFX940-NEXT:    v_add_f32_e32 v0, v0, v2
10521; GFX940-NEXT:    v_add_f32_e32 v1, v1, v3
10522; GFX940-NEXT:    v_bfe_u32 v5, v0, 16, 1
10523; GFX940-NEXT:    v_bfe_u32 v8, v1, 16, 1
10524; GFX940-NEXT:    v_or_b32_e32 v6, 0x400000, v0
10525; GFX940-NEXT:    v_or_b32_e32 v9, 0x400000, v1
10526; GFX940-NEXT:    v_add3_u32 v5, v5, v0, s8
10527; GFX940-NEXT:    v_add3_u32 v8, v8, v1, s8
10528; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
10529; GFX940-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
10530; GFX940-NEXT:    buffer_wbl2 sc1
10531; GFX940-NEXT:    v_cndmask_b32_e32 v1, v8, v9, vcc
10532; GFX940-NEXT:    v_cndmask_b32_e64 v0, v5, v6, s[4:5]
10533; GFX940-NEXT:    v_perm_b32 v6, v1, v0, s9
10534; GFX940-NEXT:    v_mov_b64_e32 v[0:1], v[6:7]
10535; GFX940-NEXT:    buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0
10536; GFX940-NEXT:    s_waitcnt vmcnt(0)
10537; GFX940-NEXT:    buffer_inv sc1
10538; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v7
10539; GFX940-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
10540; GFX940-NEXT:    s_andn2_b64 exec, exec, s[6:7]
10541; GFX940-NEXT:    s_cbranch_execnz .LBB31_1
10542; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
10543; GFX940-NEXT:    s_or_b64 exec, exec, s[6:7]
10544; GFX940-NEXT:    s_setpc_b64 s[30:31]
10545;
10546; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory:
10547; GFX11:       ; %bb.0:
10548; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10549; GFX11-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
10550; GFX11-NEXT:    s_add_i32 s4, s16, 0x400
10551; GFX11-NEXT:    s_mov_b32 s5, 0
10552; GFX11-NEXT:    v_mov_b32_e32 v4, s4
10553; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
10554; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
10555; GFX11-NEXT:    buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
10556; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
10557; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
10558; GFX11-NEXT:    .p2align 6
10559; GFX11-NEXT:  .LBB31_1: ; %atomicrmw.start
10560; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
10561; GFX11-NEXT:    s_waitcnt vmcnt(0)
10562; GFX11-NEXT:    v_mov_b32_e32 v6, v0
10563; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
10564; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10565; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v6
10566; GFX11-NEXT:    v_add_f32_e32 v1, v1, v3
10567; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
10568; GFX11-NEXT:    v_bfe_u32 v7, v1, 16, 1
10569; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v1
10570; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
10571; GFX11-NEXT:    v_add3_u32 v7, v7, v1, 0x7fff
10572; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10573; GFX11-NEXT:    v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6
10574; GFX11-NEXT:    v_add_f32_e32 v0, v0, v2
10575; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
10576; GFX11-NEXT:    v_bfe_u32 v5, v0, 16, 1
10577; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v0
10578; GFX11-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
10579; GFX11-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
10580; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10581; GFX11-NEXT:    v_cndmask_b32_e64 v0, v5, v8, s4
10582; GFX11-NEXT:    v_perm_b32 v5, v1, v0, 0x7060302
10583; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
10584; GFX11-NEXT:    v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
10585; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
10586; GFX11-NEXT:    s_waitcnt vmcnt(0)
10587; GFX11-NEXT:    buffer_gl1_inv
10588; GFX11-NEXT:    buffer_gl0_inv
10589; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
10590; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
10591; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
10592; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
10593; GFX11-NEXT:    s_cbranch_execnz .LBB31_1
10594; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
10595; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
10596; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s5
10597; GFX11-NEXT:    s_setpc_b64 s[30:31]
10598;
10599; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory:
10600; GFX10:       ; %bb.0:
10601; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10602; GFX10-NEXT:    v_mov_b32_e32 v1, v0
10603; GFX10-NEXT:    v_mov_b32_e32 v0, s20
10604; GFX10-NEXT:    s_add_i32 s4, s20, 0x400
10605; GFX10-NEXT:    s_mov_b32 s5, 0
10606; GFX10-NEXT:    v_mov_b32_e32 v4, s4
10607; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
10608; GFX10-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
10609; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
10610; GFX10-NEXT:  .LBB31_1: ; %atomicrmw.start
10611; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
10612; GFX10-NEXT:    s_waitcnt vmcnt(0)
10613; GFX10-NEXT:    v_mov_b32_e32 v6, v0
10614; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
10615; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
10616; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v6
10617; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
10618; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
10619; GFX10-NEXT:    v_bfe_u32 v5, v0, 16, 1
10620; GFX10-NEXT:    v_bfe_u32 v7, v1, 16, 1
10621; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
10622; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v1
10623; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
10624; GFX10-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
10625; GFX10-NEXT:    v_add3_u32 v7, v7, v1, 0x7fff
10626; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
10627; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v9, vcc_lo
10628; GFX10-NEXT:    v_cndmask_b32_e64 v0, v5, v8, s4
10629; GFX10-NEXT:    v_perm_b32 v5, v1, v0, 0x7060302
10630; GFX10-NEXT:    v_mov_b32_e32 v0, v5
10631; GFX10-NEXT:    v_mov_b32_e32 v1, v6
10632; GFX10-NEXT:    buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
10633; GFX10-NEXT:    s_waitcnt vmcnt(0)
10634; GFX10-NEXT:    buffer_gl1_inv
10635; GFX10-NEXT:    buffer_gl0_inv
10636; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
10637; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
10638; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
10639; GFX10-NEXT:    s_cbranch_execnz .LBB31_1
10640; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
10641; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
10642; GFX10-NEXT:    s_setpc_b64 s[30:31]
10643;
10644; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory:
10645; GFX90A:       ; %bb.0:
10646; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10647; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
10648; GFX90A-NEXT:    v_mov_b32_e32 v0, s20
10649; GFX90A-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
10650; GFX90A-NEXT:    s_add_i32 s4, s20, 0x400
10651; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
10652; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
10653; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
10654; GFX90A-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
10655; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
10656; GFX90A-NEXT:    v_mov_b32_e32 v4, s4
10657; GFX90A-NEXT:  .LBB31_1: ; %atomicrmw.start
10658; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
10659; GFX90A-NEXT:    s_waitcnt vmcnt(0)
10660; GFX90A-NEXT:    v_mov_b32_e32 v7, v0
10661; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v7
10662; GFX90A-NEXT:    v_and_b32_e32 v1, 0xffff0000, v7
10663; GFX90A-NEXT:    v_add_f32_e32 v0, v0, v2
10664; GFX90A-NEXT:    v_add_f32_e32 v1, v1, v3
10665; GFX90A-NEXT:    v_bfe_u32 v5, v0, 16, 1
10666; GFX90A-NEXT:    v_bfe_u32 v8, v1, 16, 1
10667; GFX90A-NEXT:    v_or_b32_e32 v6, 0x400000, v0
10668; GFX90A-NEXT:    v_or_b32_e32 v9, 0x400000, v1
10669; GFX90A-NEXT:    v_add3_u32 v5, v5, v0, s8
10670; GFX90A-NEXT:    v_add3_u32 v8, v8, v1, s8
10671; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
10672; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
10673; GFX90A-NEXT:    v_cndmask_b32_e64 v0, v5, v6, s[4:5]
10674; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v8, v9, vcc
10675; GFX90A-NEXT:    v_perm_b32 v6, v1, v0, s9
10676; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1]
10677; GFX90A-NEXT:    buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
10678; GFX90A-NEXT:    s_waitcnt vmcnt(0)
10679; GFX90A-NEXT:    buffer_wbinvl1
10680; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v7
10681; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
10682; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
10683; GFX90A-NEXT:    s_cbranch_execnz .LBB31_1
10684; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
10685; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
10686; GFX90A-NEXT:    s_setpc_b64 s[30:31]
10687;
10688; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory:
10689; GFX908:       ; %bb.0:
10690; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10691; GFX908-NEXT:    v_mov_b32_e32 v1, v0
10692; GFX908-NEXT:    v_mov_b32_e32 v0, s20
10693; GFX908-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
10694; GFX908-NEXT:    s_add_i32 s4, s20, 0x400
10695; GFX908-NEXT:    s_mov_b64 s[6:7], 0
10696; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
10697; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
10698; GFX908-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
10699; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
10700; GFX908-NEXT:    v_mov_b32_e32 v4, s4
10701; GFX908-NEXT:  .LBB31_1: ; %atomicrmw.start
10702; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
10703; GFX908-NEXT:    s_waitcnt vmcnt(0)
10704; GFX908-NEXT:    v_mov_b32_e32 v6, v0
10705; GFX908-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
10706; GFX908-NEXT:    v_and_b32_e32 v1, 0xffff0000, v6
10707; GFX908-NEXT:    v_add_f32_e32 v0, v0, v2
10708; GFX908-NEXT:    v_add_f32_e32 v1, v1, v3
10709; GFX908-NEXT:    v_bfe_u32 v5, v0, 16, 1
10710; GFX908-NEXT:    v_bfe_u32 v8, v1, 16, 1
10711; GFX908-NEXT:    v_or_b32_e32 v7, 0x400000, v0
10712; GFX908-NEXT:    v_or_b32_e32 v9, 0x400000, v1
10713; GFX908-NEXT:    v_add3_u32 v5, v5, v0, s8
10714; GFX908-NEXT:    v_add3_u32 v8, v8, v1, s8
10715; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
10716; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
10717; GFX908-NEXT:    v_cndmask_b32_e64 v0, v5, v7, s[4:5]
10718; GFX908-NEXT:    v_cndmask_b32_e32 v1, v8, v9, vcc
10719; GFX908-NEXT:    v_perm_b32 v5, v1, v0, s9
10720; GFX908-NEXT:    v_mov_b32_e32 v0, v5
10721; GFX908-NEXT:    v_mov_b32_e32 v1, v6
10722; GFX908-NEXT:    buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
10723; GFX908-NEXT:    s_waitcnt vmcnt(0)
10724; GFX908-NEXT:    buffer_wbinvl1
10725; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
10726; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
10727; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
10728; GFX908-NEXT:    s_cbranch_execnz .LBB31_1
10729; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
10730; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
10731; GFX908-NEXT:    s_setpc_b64 s[30:31]
10732;
10733; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory:
10734; GFX8:       ; %bb.0:
10735; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10736; GFX8-NEXT:    v_mov_b32_e32 v1, v0
10737; GFX8-NEXT:    v_mov_b32_e32 v0, s20
10738; GFX8-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
10739; GFX8-NEXT:    s_add_i32 s4, s20, 0x400
10740; GFX8-NEXT:    s_mov_b64 s[6:7], 0
10741; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
10742; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
10743; GFX8-NEXT:    v_mov_b32_e32 v4, s4
10744; GFX8-NEXT:  .LBB31_1: ; %atomicrmw.start
10745; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
10746; GFX8-NEXT:    s_waitcnt vmcnt(0)
10747; GFX8-NEXT:    v_mov_b32_e32 v6, v0
10748; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
10749; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v6
10750; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
10751; GFX8-NEXT:    v_add_f32_e32 v1, v1, v3
10752; GFX8-NEXT:    v_bfe_u32 v5, v0, 16, 1
10753; GFX8-NEXT:    v_bfe_u32 v8, v1, 16, 1
10754; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v0
10755; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v1
10756; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
10757; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x7fff, v8
10758; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v1
10759; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
10760; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v0
10761; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
10762; GFX8-NEXT:    v_cndmask_b32_e32 v1, v8, v9, vcc
10763; GFX8-NEXT:    v_cndmask_b32_e64 v0, v5, v7, s[4:5]
10764; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
10765; GFX8-NEXT:    v_alignbit_b32 v5, v1, v0, 16
10766; GFX8-NEXT:    v_mov_b32_e32 v0, v5
10767; GFX8-NEXT:    v_mov_b32_e32 v1, v6
10768; GFX8-NEXT:    buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
10769; GFX8-NEXT:    s_waitcnt vmcnt(0)
10770; GFX8-NEXT:    buffer_wbinvl1
10771; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
10772; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
10773; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
10774; GFX8-NEXT:    s_cbranch_execnz .LBB31_1
10775; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
10776; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
10777; GFX8-NEXT:    s_setpc_b64 s[30:31]
10778;
10779; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory:
10780; GFX7:       ; %bb.0:
10781; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10782; GFX7-NEXT:    v_mov_b32_e32 v2, s20
10783; GFX7-NEXT:    buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024
10784; GFX7-NEXT:    s_add_i32 s6, s20, 0x400
10785; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
10786; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
10787; GFX7-NEXT:    s_mov_b64 s[4:5], 0
10788; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
10789; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
10790; GFX7-NEXT:    s_waitcnt vmcnt(0)
10791; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
10792; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
10793; GFX7-NEXT:    v_mov_b32_e32 v4, s6
10794; GFX7-NEXT:  .LBB31_1: ; %atomicrmw.start
10795; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
10796; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
10797; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
10798; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
10799; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
10800; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
10801; GFX7-NEXT:    v_add_f32_e32 v6, v6, v3
10802; GFX7-NEXT:    v_add_f32_e32 v5, v5, v2
10803; GFX7-NEXT:    v_alignbit_b32 v1, v1, v0, 16
10804; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
10805; GFX7-NEXT:    v_alignbit_b32 v0, v0, v5, 16
10806; GFX7-NEXT:    v_mov_b32_e32 v6, v1
10807; GFX7-NEXT:    v_mov_b32_e32 v5, v0
10808; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
10809; GFX7-NEXT:    s_waitcnt vmcnt(0)
10810; GFX7-NEXT:    buffer_wbinvl1
10811; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v1
10812; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v5
10813; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10814; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
10815; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10816; GFX7-NEXT:    s_cbranch_execnz .LBB31_1
10817; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
10818; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
10819; GFX7-NEXT:    s_setpc_b64 s[30:31]
10820;
10821; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory:
10822; GFX6:       ; %bb.0:
10823; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10824; GFX6-NEXT:    v_mov_b32_e32 v2, s20
10825; GFX6-NEXT:    buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024
10826; GFX6-NEXT:    s_add_i32 s6, s20, 0x400
10827; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v0
10828; GFX6-NEXT:    v_mul_f32_e32 v1, 1.0, v1
10829; GFX6-NEXT:    s_mov_b64 s[4:5], 0
10830; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
10831; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
10832; GFX6-NEXT:    s_waitcnt vmcnt(0)
10833; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
10834; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
10835; GFX6-NEXT:    v_mov_b32_e32 v4, s6
10836; GFX6-NEXT:  .LBB31_1: ; %atomicrmw.start
10837; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
10838; GFX6-NEXT:    v_mul_f32_e32 v1, 1.0, v1
10839; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v0
10840; GFX6-NEXT:    s_waitcnt expcnt(0)
10841; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
10842; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
10843; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
10844; GFX6-NEXT:    v_add_f32_e32 v6, v6, v3
10845; GFX6-NEXT:    v_add_f32_e32 v5, v5, v2
10846; GFX6-NEXT:    v_alignbit_b32 v1, v1, v0, 16
10847; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
10848; GFX6-NEXT:    v_alignbit_b32 v0, v0, v5, 16
10849; GFX6-NEXT:    v_mov_b32_e32 v6, v1
10850; GFX6-NEXT:    v_mov_b32_e32 v5, v0
10851; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
10852; GFX6-NEXT:    s_waitcnt vmcnt(0)
10853; GFX6-NEXT:    buffer_wbinvl1
10854; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v1
10855; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff0000, v5
10856; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10857; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
10858; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10859; GFX6-NEXT:    s_cbranch_execnz .LBB31_1
10860; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
10861; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
10862; GFX6-NEXT:    s_waitcnt expcnt(0)
10863; GFX6-NEXT:    s_setpc_b64 s[30:31]
10864  %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256
10865  %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
10866  ret <2 x bfloat> %result
10867}
10868
10869define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 {
10870; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory:
10871; GFX12:       ; %bb.0:
10872; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
10873; GFX12-NEXT:    s_wait_expcnt 0x0
10874; GFX12-NEXT:    s_wait_samplecnt 0x0
10875; GFX12-NEXT:    s_wait_bvhcnt 0x0
10876; GFX12-NEXT:    s_wait_kmcnt 0x0
10877; GFX12-NEXT:    v_mov_b32_e32 v1, s16
10878; GFX12-NEXT:    s_wait_storecnt 0x0
10879; GFX12-NEXT:    buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024
10880; GFX12-NEXT:    s_wait_storecnt 0x0
10881; GFX12-NEXT:    global_inv scope:SCOPE_DEV
10882; GFX12-NEXT:    s_setpc_b64 s[30:31]
10883;
10884; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory:
10885; GFX940:       ; %bb.0:
10886; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10887; GFX940-NEXT:    v_mov_b32_e32 v1, s16
10888; GFX940-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
10889; GFX940-NEXT:    s_add_i32 s4, s16, 0x400
10890; GFX940-NEXT:    s_mov_b64 s[6:7], 0
10891; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
10892; GFX940-NEXT:    s_movk_i32 s8, 0x7fff
10893; GFX940-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
10894; GFX940-NEXT:    s_mov_b32 s9, 0x7060302
10895; GFX940-NEXT:    v_mov_b32_e32 v4, s4
10896; GFX940-NEXT:  .LBB32_1: ; %atomicrmw.start
10897; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
10898; GFX940-NEXT:    s_waitcnt vmcnt(0)
10899; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
10900; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
10901; GFX940-NEXT:    v_add_f32_e32 v0, v0, v2
10902; GFX940-NEXT:    v_add_f32_e32 v5, v5, v3
10903; GFX940-NEXT:    v_bfe_u32 v6, v0, 16, 1
10904; GFX940-NEXT:    v_bfe_u32 v8, v5, 16, 1
10905; GFX940-NEXT:    v_or_b32_e32 v7, 0x400000, v0
10906; GFX940-NEXT:    v_or_b32_e32 v9, 0x400000, v5
10907; GFX940-NEXT:    v_add3_u32 v6, v6, v0, s8
10908; GFX940-NEXT:    v_add3_u32 v8, v8, v5, s8
10909; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
10910; GFX940-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
10911; GFX940-NEXT:    buffer_wbl2 sc1
10912; GFX940-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
10913; GFX940-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
10914; GFX940-NEXT:    v_perm_b32 v0, v5, v0, s9
10915; GFX940-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
10916; GFX940-NEXT:    buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0
10917; GFX940-NEXT:    s_waitcnt vmcnt(0)
10918; GFX940-NEXT:    buffer_inv sc1
10919; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v1
10920; GFX940-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
10921; GFX940-NEXT:    v_mov_b32_e32 v1, v6
10922; GFX940-NEXT:    s_andn2_b64 exec, exec, s[6:7]
10923; GFX940-NEXT:    s_cbranch_execnz .LBB32_1
10924; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
10925; GFX940-NEXT:    s_or_b64 exec, exec, s[6:7]
10926; GFX940-NEXT:    s_setpc_b64 s[30:31]
10927;
10928; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory:
10929; GFX11:       ; %bb.0:
10930; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10931; GFX11-NEXT:    v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
10932; GFX11-NEXT:    s_add_i32 s4, s16, 0x400
10933; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
10934; GFX11-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
10935; GFX11-NEXT:    buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
10936; GFX11-NEXT:    s_mov_b32 s5, 0
10937; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
10938; GFX11-NEXT:    .p2align 6
10939; GFX11-NEXT:  .LBB32_1: ; %atomicrmw.start
10940; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
10941; GFX11-NEXT:    s_waitcnt vmcnt(0)
10942; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
10943; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
10944; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
10945; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10946; GFX11-NEXT:    v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2
10947; GFX11-NEXT:    v_bfe_u32 v7, v5, 16, 1
10948; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
10949; GFX11-NEXT:    v_bfe_u32 v6, v0, 16, 1
10950; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v0
10951; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v5
10952; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
10953; GFX11-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
10954; GFX11-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
10955; GFX11-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
10956; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
10957; GFX11-NEXT:    v_cndmask_b32_e32 v5, v7, v9, vcc_lo
10958; GFX11-NEXT:    v_cndmask_b32_e64 v0, v6, v8, s4
10959; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10960; GFX11-NEXT:    v_perm_b32 v0, v5, v0, 0x7060302
10961; GFX11-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
10962; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
10963; GFX11-NEXT:    s_waitcnt vmcnt(0)
10964; GFX11-NEXT:    buffer_gl1_inv
10965; GFX11-NEXT:    buffer_gl0_inv
10966; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v1
10967; GFX11-NEXT:    v_mov_b32_e32 v1, v5
10968; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
10969; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
10970; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
10971; GFX11-NEXT:    s_cbranch_execnz .LBB32_1
10972; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
10973; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
10974; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s5
10975; GFX11-NEXT:    s_setpc_b64 s[30:31]
10976;
10977; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory:
10978; GFX10:       ; %bb.0:
10979; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10980; GFX10-NEXT:    v_mov_b32_e32 v1, s20
10981; GFX10-NEXT:    s_add_i32 s4, s20, 0x400
10982; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
10983; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
10984; GFX10-NEXT:    v_mov_b32_e32 v4, s4
10985; GFX10-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
10986; GFX10-NEXT:    s_mov_b32 s5, 0
10987; GFX10-NEXT:  .LBB32_1: ; %atomicrmw.start
10988; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
10989; GFX10-NEXT:    s_waitcnt vmcnt(0)
10990; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
10991; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
10992; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
10993; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
10994; GFX10-NEXT:    v_add_f32_e32 v5, v5, v3
10995; GFX10-NEXT:    v_bfe_u32 v6, v0, 16, 1
10996; GFX10-NEXT:    v_bfe_u32 v7, v5, 16, 1
10997; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
10998; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v5
10999; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
11000; GFX10-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
11001; GFX10-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
11002; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
11003; GFX10-NEXT:    v_cndmask_b32_e32 v5, v7, v9, vcc_lo
11004; GFX10-NEXT:    v_cndmask_b32_e64 v0, v6, v8, s4
11005; GFX10-NEXT:    v_perm_b32 v0, v5, v0, 0x7060302
11006; GFX10-NEXT:    v_mov_b32_e32 v6, v1
11007; GFX10-NEXT:    v_mov_b32_e32 v5, v0
11008; GFX10-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
11009; GFX10-NEXT:    s_waitcnt vmcnt(0)
11010; GFX10-NEXT:    buffer_gl1_inv
11011; GFX10-NEXT:    buffer_gl0_inv
11012; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v1
11013; GFX10-NEXT:    v_mov_b32_e32 v1, v5
11014; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
11015; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
11016; GFX10-NEXT:    s_cbranch_execnz .LBB32_1
11017; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
11018; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
11019; GFX10-NEXT:    s_setpc_b64 s[30:31]
11020;
11021; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory:
11022; GFX90A:       ; %bb.0:
11023; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11024; GFX90A-NEXT:    v_mov_b32_e32 v1, s20
11025; GFX90A-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
11026; GFX90A-NEXT:    s_add_i32 s4, s20, 0x400
11027; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
11028; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
11029; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
11030; GFX90A-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
11031; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
11032; GFX90A-NEXT:    v_mov_b32_e32 v4, s4
11033; GFX90A-NEXT:  .LBB32_1: ; %atomicrmw.start
11034; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
11035; GFX90A-NEXT:    s_waitcnt vmcnt(0)
11036; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
11037; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
11038; GFX90A-NEXT:    v_add_f32_e32 v0, v0, v2
11039; GFX90A-NEXT:    v_add_f32_e32 v5, v5, v3
11040; GFX90A-NEXT:    v_bfe_u32 v6, v0, 16, 1
11041; GFX90A-NEXT:    v_bfe_u32 v8, v5, 16, 1
11042; GFX90A-NEXT:    v_or_b32_e32 v7, 0x400000, v0
11043; GFX90A-NEXT:    v_or_b32_e32 v9, 0x400000, v5
11044; GFX90A-NEXT:    v_add3_u32 v6, v6, v0, s8
11045; GFX90A-NEXT:    v_add3_u32 v8, v8, v5, s8
11046; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
11047; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
11048; GFX90A-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
11049; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
11050; GFX90A-NEXT:    v_perm_b32 v0, v5, v0, s9
11051; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
11052; GFX90A-NEXT:    buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc
11053; GFX90A-NEXT:    s_waitcnt vmcnt(0)
11054; GFX90A-NEXT:    buffer_wbinvl1
11055; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v1
11056; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
11057; GFX90A-NEXT:    v_mov_b32_e32 v1, v6
11058; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
11059; GFX90A-NEXT:    s_cbranch_execnz .LBB32_1
11060; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
11061; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
11062; GFX90A-NEXT:    s_setpc_b64 s[30:31]
11063;
11064; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory:
11065; GFX908:       ; %bb.0:
11066; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11067; GFX908-NEXT:    v_mov_b32_e32 v1, s20
11068; GFX908-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
11069; GFX908-NEXT:    s_add_i32 s4, s20, 0x400
11070; GFX908-NEXT:    s_mov_b64 s[6:7], 0
11071; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
11072; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
11073; GFX908-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
11074; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
11075; GFX908-NEXT:    v_mov_b32_e32 v4, s4
11076; GFX908-NEXT:  .LBB32_1: ; %atomicrmw.start
11077; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
11078; GFX908-NEXT:    s_waitcnt vmcnt(0)
11079; GFX908-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
11080; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
11081; GFX908-NEXT:    v_add_f32_e32 v0, v0, v2
11082; GFX908-NEXT:    v_add_f32_e32 v5, v5, v3
11083; GFX908-NEXT:    v_bfe_u32 v6, v0, 16, 1
11084; GFX908-NEXT:    v_bfe_u32 v8, v5, 16, 1
11085; GFX908-NEXT:    v_or_b32_e32 v7, 0x400000, v0
11086; GFX908-NEXT:    v_or_b32_e32 v9, 0x400000, v5
11087; GFX908-NEXT:    v_add3_u32 v6, v6, v0, s8
11088; GFX908-NEXT:    v_add3_u32 v8, v8, v5, s8
11089; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
11090; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
11091; GFX908-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
11092; GFX908-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
11093; GFX908-NEXT:    v_perm_b32 v0, v5, v0, s9
11094; GFX908-NEXT:    v_mov_b32_e32 v6, v1
11095; GFX908-NEXT:    v_mov_b32_e32 v5, v0
11096; GFX908-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
11097; GFX908-NEXT:    s_waitcnt vmcnt(0)
11098; GFX908-NEXT:    buffer_wbinvl1
11099; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v1
11100; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
11101; GFX908-NEXT:    v_mov_b32_e32 v1, v5
11102; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
11103; GFX908-NEXT:    s_cbranch_execnz .LBB32_1
11104; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
11105; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
11106; GFX908-NEXT:    s_setpc_b64 s[30:31]
11107;
11108; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory:
11109; GFX8:       ; %bb.0:
11110; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11111; GFX8-NEXT:    v_mov_b32_e32 v1, s20
11112; GFX8-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
11113; GFX8-NEXT:    s_add_i32 s4, s20, 0x400
11114; GFX8-NEXT:    s_mov_b64 s[6:7], 0
11115; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
11116; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
11117; GFX8-NEXT:    v_mov_b32_e32 v4, s4
11118; GFX8-NEXT:  .LBB32_1: ; %atomicrmw.start
11119; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
11120; GFX8-NEXT:    s_waitcnt vmcnt(0)
11121; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
11122; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
11123; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
11124; GFX8-NEXT:    v_add_f32_e32 v5, v5, v3
11125; GFX8-NEXT:    v_bfe_u32 v6, v0, 16, 1
11126; GFX8-NEXT:    v_bfe_u32 v8, v5, 16, 1
11127; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v0
11128; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v5
11129; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x7fff, v6
11130; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x7fff, v8
11131; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v5
11132; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
11133; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v0
11134; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
11135; GFX8-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
11136; GFX8-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
11137; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
11138; GFX8-NEXT:    v_alignbit_b32 v0, v5, v0, 16
11139; GFX8-NEXT:    v_mov_b32_e32 v6, v1
11140; GFX8-NEXT:    v_mov_b32_e32 v5, v0
11141; GFX8-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
11142; GFX8-NEXT:    s_waitcnt vmcnt(0)
11143; GFX8-NEXT:    buffer_wbinvl1
11144; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v1
11145; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
11146; GFX8-NEXT:    v_mov_b32_e32 v1, v5
11147; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
11148; GFX8-NEXT:    s_cbranch_execnz .LBB32_1
11149; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
11150; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
11151; GFX8-NEXT:    s_setpc_b64 s[30:31]
11152;
11153; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory:
11154; GFX7:       ; %bb.0:
11155; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11156; GFX7-NEXT:    v_mov_b32_e32 v2, s20
11157; GFX7-NEXT:    buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
11158; GFX7-NEXT:    s_add_i32 s6, s20, 0x400
11159; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
11160; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
11161; GFX7-NEXT:    s_mov_b64 s[4:5], 0
11162; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
11163; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
11164; GFX7-NEXT:    s_waitcnt vmcnt(0)
11165; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
11166; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
11167; GFX7-NEXT:    v_mov_b32_e32 v2, s6
11168; GFX7-NEXT:  .LBB32_1: ; %atomicrmw.start
11169; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
11170; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
11171; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
11172; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
11173; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
11174; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
11175; GFX7-NEXT:    v_add_f32_e32 v6, v6, v1
11176; GFX7-NEXT:    v_add_f32_e32 v5, v5, v0
11177; GFX7-NEXT:    v_alignbit_b32 v4, v3, v4, 16
11178; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v6
11179; GFX7-NEXT:    v_alignbit_b32 v3, v3, v5, 16
11180; GFX7-NEXT:    v_mov_b32_e32 v6, v4
11181; GFX7-NEXT:    v_mov_b32_e32 v5, v3
11182; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
11183; GFX7-NEXT:    s_waitcnt vmcnt(0)
11184; GFX7-NEXT:    buffer_wbinvl1
11185; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
11186; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
11187; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11188; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
11189; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11190; GFX7-NEXT:    s_cbranch_execnz .LBB32_1
11191; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
11192; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
11193; GFX7-NEXT:    s_setpc_b64 s[30:31]
11194;
11195; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory:
11196; GFX6:       ; %bb.0:
11197; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11198; GFX6-NEXT:    v_mov_b32_e32 v2, s20
11199; GFX6-NEXT:    buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
11200; GFX6-NEXT:    s_add_i32 s6, s20, 0x400
11201; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v0
11202; GFX6-NEXT:    v_mul_f32_e32 v1, 1.0, v1
11203; GFX6-NEXT:    s_mov_b64 s[4:5], 0
11204; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
11205; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
11206; GFX6-NEXT:    s_waitcnt vmcnt(0)
11207; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
11208; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
11209; GFX6-NEXT:    v_mov_b32_e32 v2, s6
11210; GFX6-NEXT:  .LBB32_1: ; %atomicrmw.start
11211; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
11212; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
11213; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v4
11214; GFX6-NEXT:    s_waitcnt expcnt(0)
11215; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
11216; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
11217; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
11218; GFX6-NEXT:    v_add_f32_e32 v6, v6, v1
11219; GFX6-NEXT:    v_add_f32_e32 v5, v5, v0
11220; GFX6-NEXT:    v_alignbit_b32 v4, v3, v4, 16
11221; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v6
11222; GFX6-NEXT:    v_alignbit_b32 v3, v3, v5, 16
11223; GFX6-NEXT:    v_mov_b32_e32 v6, v4
11224; GFX6-NEXT:    v_mov_b32_e32 v5, v3
11225; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
11226; GFX6-NEXT:    s_waitcnt vmcnt(0)
11227; GFX6-NEXT:    buffer_wbinvl1
11228; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
11229; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
11230; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11231; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
11232; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11233; GFX6-NEXT:    s_cbranch_execnz .LBB32_1
11234; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
11235; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
11236; GFX6-NEXT:    s_waitcnt expcnt(0)
11237; GFX6-NEXT:    s_setpc_b64 s[30:31]
11238  %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256
11239  %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
11240  ret void
11241}
11242
; Test: agent-scope, seq_cst `atomicrmw fadd` of a <2 x bfloat> through a buffer
; fat pointer (ptr addrspace(7)), with the result unused ("noret") and both
; !amdgpu.no.fine.grained.memory and !amdgpu.no.remote.memory attached.
;   %ptr - buffer fat pointer, passed inreg.
;   %val - packed bf16 pair to add.
; Expected lowering per the checks below: gfx1200 emits a single
; buffer_atomic_pk_add_bf16 at offset:1024; every other -mcpu exercised by this
; file falls back to a buffer_atomic_cmpswap retry loop (.LBB33_1) that widens
; each half to f32, adds, and repacks before the compare-and-swap.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing them by hand.
11243define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 {
11244; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
11245; GFX12:       ; %bb.0:
11246; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
11247; GFX12-NEXT:    s_wait_expcnt 0x0
11248; GFX12-NEXT:    s_wait_samplecnt 0x0
11249; GFX12-NEXT:    s_wait_bvhcnt 0x0
11250; GFX12-NEXT:    s_wait_kmcnt 0x0
11251; GFX12-NEXT:    v_mov_b32_e32 v1, s16
11252; GFX12-NEXT:    s_wait_storecnt 0x0
11253; GFX12-NEXT:    buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024
11254; GFX12-NEXT:    s_wait_storecnt 0x0
11255; GFX12-NEXT:    global_inv scope:SCOPE_DEV
11256; GFX12-NEXT:    s_setpc_b64 s[30:31]
11257;
11258; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
11259; GFX940:       ; %bb.0:
11260; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11261; GFX940-NEXT:    v_mov_b32_e32 v1, s16
11262; GFX940-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
11263; GFX940-NEXT:    s_add_i32 s4, s16, 0x400
11264; GFX940-NEXT:    s_mov_b64 s[6:7], 0
11265; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
11266; GFX940-NEXT:    s_movk_i32 s8, 0x7fff
11267; GFX940-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
11268; GFX940-NEXT:    s_mov_b32 s9, 0x7060302
11269; GFX940-NEXT:    v_mov_b32_e32 v4, s4
11270; GFX940-NEXT:  .LBB33_1: ; %atomicrmw.start
11271; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
11272; GFX940-NEXT:    s_waitcnt vmcnt(0)
11273; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
11274; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
11275; GFX940-NEXT:    v_add_f32_e32 v0, v0, v2
11276; GFX940-NEXT:    v_add_f32_e32 v5, v5, v3
11277; GFX940-NEXT:    v_bfe_u32 v6, v0, 16, 1
11278; GFX940-NEXT:    v_bfe_u32 v8, v5, 16, 1
11279; GFX940-NEXT:    v_or_b32_e32 v7, 0x400000, v0
11280; GFX940-NEXT:    v_or_b32_e32 v9, 0x400000, v5
11281; GFX940-NEXT:    v_add3_u32 v6, v6, v0, s8
11282; GFX940-NEXT:    v_add3_u32 v8, v8, v5, s8
11283; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
11284; GFX940-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
11285; GFX940-NEXT:    buffer_wbl2 sc1
11286; GFX940-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
11287; GFX940-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
11288; GFX940-NEXT:    v_perm_b32 v0, v5, v0, s9
11289; GFX940-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
11290; GFX940-NEXT:    buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0
11291; GFX940-NEXT:    s_waitcnt vmcnt(0)
11292; GFX940-NEXT:    buffer_inv sc1
11293; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v1
11294; GFX940-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
11295; GFX940-NEXT:    v_mov_b32_e32 v1, v6
11296; GFX940-NEXT:    s_andn2_b64 exec, exec, s[6:7]
11297; GFX940-NEXT:    s_cbranch_execnz .LBB33_1
11298; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
11299; GFX940-NEXT:    s_or_b64 exec, exec, s[6:7]
11300; GFX940-NEXT:    s_setpc_b64 s[30:31]
11301;
11302; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
11303; GFX11:       ; %bb.0:
11304; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11305; GFX11-NEXT:    v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
11306; GFX11-NEXT:    s_add_i32 s4, s16, 0x400
11307; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
11308; GFX11-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
11309; GFX11-NEXT:    buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
11310; GFX11-NEXT:    s_mov_b32 s5, 0
11311; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
11312; GFX11-NEXT:    .p2align 6
11313; GFX11-NEXT:  .LBB33_1: ; %atomicrmw.start
11314; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
11315; GFX11-NEXT:    s_waitcnt vmcnt(0)
11316; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
11317; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
11318; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
11319; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11320; GFX11-NEXT:    v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2
11321; GFX11-NEXT:    v_bfe_u32 v7, v5, 16, 1
11322; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
11323; GFX11-NEXT:    v_bfe_u32 v6, v0, 16, 1
11324; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v0
11325; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v5
11326; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
11327; GFX11-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
11328; GFX11-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
11329; GFX11-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
11330; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
11331; GFX11-NEXT:    v_cndmask_b32_e32 v5, v7, v9, vcc_lo
11332; GFX11-NEXT:    v_cndmask_b32_e64 v0, v6, v8, s4
11333; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11334; GFX11-NEXT:    v_perm_b32 v0, v5, v0, 0x7060302
11335; GFX11-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
11336; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
11337; GFX11-NEXT:    s_waitcnt vmcnt(0)
11338; GFX11-NEXT:    buffer_gl1_inv
11339; GFX11-NEXT:    buffer_gl0_inv
11340; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v1
11341; GFX11-NEXT:    v_mov_b32_e32 v1, v5
11342; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
11343; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
11344; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
11345; GFX11-NEXT:    s_cbranch_execnz .LBB33_1
11346; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
11347; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
11348; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s5
11349; GFX11-NEXT:    s_setpc_b64 s[30:31]
11350;
11351; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
11352; GFX10:       ; %bb.0:
11353; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11354; GFX10-NEXT:    v_mov_b32_e32 v1, s20
11355; GFX10-NEXT:    s_add_i32 s4, s20, 0x400
11356; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
11357; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
11358; GFX10-NEXT:    v_mov_b32_e32 v4, s4
11359; GFX10-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
11360; GFX10-NEXT:    s_mov_b32 s5, 0
11361; GFX10-NEXT:  .LBB33_1: ; %atomicrmw.start
11362; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
11363; GFX10-NEXT:    s_waitcnt vmcnt(0)
11364; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
11365; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
11366; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
11367; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
11368; GFX10-NEXT:    v_add_f32_e32 v5, v5, v3
11369; GFX10-NEXT:    v_bfe_u32 v6, v0, 16, 1
11370; GFX10-NEXT:    v_bfe_u32 v7, v5, 16, 1
11371; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
11372; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v5
11373; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
11374; GFX10-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
11375; GFX10-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
11376; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
11377; GFX10-NEXT:    v_cndmask_b32_e32 v5, v7, v9, vcc_lo
11378; GFX10-NEXT:    v_cndmask_b32_e64 v0, v6, v8, s4
11379; GFX10-NEXT:    v_perm_b32 v0, v5, v0, 0x7060302
11380; GFX10-NEXT:    v_mov_b32_e32 v6, v1
11381; GFX10-NEXT:    v_mov_b32_e32 v5, v0
11382; GFX10-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
11383; GFX10-NEXT:    s_waitcnt vmcnt(0)
11384; GFX10-NEXT:    buffer_gl1_inv
11385; GFX10-NEXT:    buffer_gl0_inv
11386; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v1
11387; GFX10-NEXT:    v_mov_b32_e32 v1, v5
11388; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
11389; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
11390; GFX10-NEXT:    s_cbranch_execnz .LBB33_1
11391; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
11392; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
11393; GFX10-NEXT:    s_setpc_b64 s[30:31]
11394;
11395; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
11396; GFX90A:       ; %bb.0:
11397; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11398; GFX90A-NEXT:    v_mov_b32_e32 v1, s20
11399; GFX90A-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
11400; GFX90A-NEXT:    s_add_i32 s4, s20, 0x400
11401; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
11402; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
11403; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
11404; GFX90A-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
11405; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
11406; GFX90A-NEXT:    v_mov_b32_e32 v4, s4
11407; GFX90A-NEXT:  .LBB33_1: ; %atomicrmw.start
11408; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
11409; GFX90A-NEXT:    s_waitcnt vmcnt(0)
11410; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
11411; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
11412; GFX90A-NEXT:    v_add_f32_e32 v0, v0, v2
11413; GFX90A-NEXT:    v_add_f32_e32 v5, v5, v3
11414; GFX90A-NEXT:    v_bfe_u32 v6, v0, 16, 1
11415; GFX90A-NEXT:    v_bfe_u32 v8, v5, 16, 1
11416; GFX90A-NEXT:    v_or_b32_e32 v7, 0x400000, v0
11417; GFX90A-NEXT:    v_or_b32_e32 v9, 0x400000, v5
11418; GFX90A-NEXT:    v_add3_u32 v6, v6, v0, s8
11419; GFX90A-NEXT:    v_add3_u32 v8, v8, v5, s8
11420; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
11421; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
11422; GFX90A-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
11423; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
11424; GFX90A-NEXT:    v_perm_b32 v0, v5, v0, s9
11425; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
11426; GFX90A-NEXT:    buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc
11427; GFX90A-NEXT:    s_waitcnt vmcnt(0)
11428; GFX90A-NEXT:    buffer_wbinvl1
11429; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v1
11430; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
11431; GFX90A-NEXT:    v_mov_b32_e32 v1, v6
11432; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
11433; GFX90A-NEXT:    s_cbranch_execnz .LBB33_1
11434; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
11435; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
11436; GFX90A-NEXT:    s_setpc_b64 s[30:31]
11437;
11438; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
11439; GFX908:       ; %bb.0:
11440; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11441; GFX908-NEXT:    v_mov_b32_e32 v1, s20
11442; GFX908-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
11443; GFX908-NEXT:    s_add_i32 s4, s20, 0x400
11444; GFX908-NEXT:    s_mov_b64 s[6:7], 0
11445; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
11446; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
11447; GFX908-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
11448; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
11449; GFX908-NEXT:    v_mov_b32_e32 v4, s4
11450; GFX908-NEXT:  .LBB33_1: ; %atomicrmw.start
11451; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
11452; GFX908-NEXT:    s_waitcnt vmcnt(0)
11453; GFX908-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
11454; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
11455; GFX908-NEXT:    v_add_f32_e32 v0, v0, v2
11456; GFX908-NEXT:    v_add_f32_e32 v5, v5, v3
11457; GFX908-NEXT:    v_bfe_u32 v6, v0, 16, 1
11458; GFX908-NEXT:    v_bfe_u32 v8, v5, 16, 1
11459; GFX908-NEXT:    v_or_b32_e32 v7, 0x400000, v0
11460; GFX908-NEXT:    v_or_b32_e32 v9, 0x400000, v5
11461; GFX908-NEXT:    v_add3_u32 v6, v6, v0, s8
11462; GFX908-NEXT:    v_add3_u32 v8, v8, v5, s8
11463; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
11464; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
11465; GFX908-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
11466; GFX908-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
11467; GFX908-NEXT:    v_perm_b32 v0, v5, v0, s9
11468; GFX908-NEXT:    v_mov_b32_e32 v6, v1
11469; GFX908-NEXT:    v_mov_b32_e32 v5, v0
11470; GFX908-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
11471; GFX908-NEXT:    s_waitcnt vmcnt(0)
11472; GFX908-NEXT:    buffer_wbinvl1
11473; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v1
11474; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
11475; GFX908-NEXT:    v_mov_b32_e32 v1, v5
11476; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
11477; GFX908-NEXT:    s_cbranch_execnz .LBB33_1
11478; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
11479; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
11480; GFX908-NEXT:    s_setpc_b64 s[30:31]
11481;
11482; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
11483; GFX8:       ; %bb.0:
11484; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11485; GFX8-NEXT:    v_mov_b32_e32 v1, s20
11486; GFX8-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
11487; GFX8-NEXT:    s_add_i32 s4, s20, 0x400
11488; GFX8-NEXT:    s_mov_b64 s[6:7], 0
11489; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
11490; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
11491; GFX8-NEXT:    v_mov_b32_e32 v4, s4
11492; GFX8-NEXT:  .LBB33_1: ; %atomicrmw.start
11493; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
11494; GFX8-NEXT:    s_waitcnt vmcnt(0)
11495; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
11496; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
11497; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
11498; GFX8-NEXT:    v_add_f32_e32 v5, v5, v3
11499; GFX8-NEXT:    v_bfe_u32 v6, v0, 16, 1
11500; GFX8-NEXT:    v_bfe_u32 v8, v5, 16, 1
11501; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v0
11502; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v5
11503; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x7fff, v6
11504; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x7fff, v8
11505; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v5
11506; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
11507; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v0
11508; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
11509; GFX8-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
11510; GFX8-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
11511; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
11512; GFX8-NEXT:    v_alignbit_b32 v0, v5, v0, 16
11513; GFX8-NEXT:    v_mov_b32_e32 v6, v1
11514; GFX8-NEXT:    v_mov_b32_e32 v5, v0
11515; GFX8-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
11516; GFX8-NEXT:    s_waitcnt vmcnt(0)
11517; GFX8-NEXT:    buffer_wbinvl1
11518; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v1
11519; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
11520; GFX8-NEXT:    v_mov_b32_e32 v1, v5
11521; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
11522; GFX8-NEXT:    s_cbranch_execnz .LBB33_1
11523; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
11524; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
11525; GFX8-NEXT:    s_setpc_b64 s[30:31]
11526;
11527; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
11528; GFX7:       ; %bb.0:
11529; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11530; GFX7-NEXT:    v_mov_b32_e32 v2, s20
11531; GFX7-NEXT:    buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
11532; GFX7-NEXT:    s_add_i32 s6, s20, 0x400
11533; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
11534; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
11535; GFX7-NEXT:    s_mov_b64 s[4:5], 0
11536; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
11537; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
11538; GFX7-NEXT:    s_waitcnt vmcnt(0)
11539; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
11540; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
11541; GFX7-NEXT:    v_mov_b32_e32 v2, s6
11542; GFX7-NEXT:  .LBB33_1: ; %atomicrmw.start
11543; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
11544; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
11545; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
11546; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
11547; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
11548; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
11549; GFX7-NEXT:    v_add_f32_e32 v6, v6, v1
11550; GFX7-NEXT:    v_add_f32_e32 v5, v5, v0
11551; GFX7-NEXT:    v_alignbit_b32 v4, v3, v4, 16
11552; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v6
11553; GFX7-NEXT:    v_alignbit_b32 v3, v3, v5, 16
11554; GFX7-NEXT:    v_mov_b32_e32 v6, v4
11555; GFX7-NEXT:    v_mov_b32_e32 v5, v3
11556; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
11557; GFX7-NEXT:    s_waitcnt vmcnt(0)
11558; GFX7-NEXT:    buffer_wbinvl1
11559; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
11560; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
11561; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11562; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
11563; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11564; GFX7-NEXT:    s_cbranch_execnz .LBB33_1
11565; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
11566; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
11567; GFX7-NEXT:    s_setpc_b64 s[30:31]
11568;
11569; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
11570; GFX6:       ; %bb.0:
11571; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11572; GFX6-NEXT:    v_mov_b32_e32 v2, s20
11573; GFX6-NEXT:    buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
11574; GFX6-NEXT:    s_add_i32 s6, s20, 0x400
11575; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v0
11576; GFX6-NEXT:    v_mul_f32_e32 v1, 1.0, v1
11577; GFX6-NEXT:    s_mov_b64 s[4:5], 0
11578; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
11579; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
11580; GFX6-NEXT:    s_waitcnt vmcnt(0)
11581; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
11582; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
11583; GFX6-NEXT:    v_mov_b32_e32 v2, s6
11584; GFX6-NEXT:  .LBB33_1: ; %atomicrmw.start
11585; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
11586; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
11587; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v4
11588; GFX6-NEXT:    s_waitcnt expcnt(0)
11589; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
11590; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
11591; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
11592; GFX6-NEXT:    v_add_f32_e32 v6, v6, v1
11593; GFX6-NEXT:    v_add_f32_e32 v5, v5, v0
11594; GFX6-NEXT:    v_alignbit_b32 v4, v3, v4, 16
11595; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v6
11596; GFX6-NEXT:    v_alignbit_b32 v3, v3, v5, 16
11597; GFX6-NEXT:    v_mov_b32_e32 v6, v4
11598; GFX6-NEXT:    v_mov_b32_e32 v5, v3
11599; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
11600; GFX6-NEXT:    s_waitcnt vmcnt(0)
11601; GFX6-NEXT:    buffer_wbinvl1
11602; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
11603; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
11604; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11605; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
11606; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11607; GFX6-NEXT:    s_cbranch_execnz .LBB33_1
11608; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
11609; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
11610; GFX6-NEXT:    s_waitcnt expcnt(0)
11611; GFX6-NEXT:    s_setpc_b64 s[30:31]
; 256 elements of <2 x bfloat> (4 bytes each) give a byte offset of 1024, which
; is the offset:1024 / 0x400 that appears in the expected output above.
11612  %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256
; The result is deliberately left unused so the no-return form of the atomic is
; exercised.
11613  %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
11614  ret void
11615}
11616
11617; --------------------------------------------------------------------
11618; misc
11619; --------------------------------------------------------------------
11620
; Returning f32 `atomicrmw fadd` through a buffer fat pointer
; (ptr addrspace(7)), seq_cst with no syncscope (the "system" case named in
; the function), annotated only with !amdgpu.no.fine.grained.memory.
; The address is %ptr advanced by 256 floats, which shows up as the
; 1024-byte (0x400) offset in the expected code below.
;
; Expected lowering, as recorded by the check lines that follow:
;   - gfx1200, gfx940 and gfx1100 emit a single returning
;     buffer_atomic_add_f32, surrounded by that target's write-back, wait
;     and invalidate instructions.
;   - gfx1010, gfx90a, gfx908, tonga, hawaii and tahiti emit an initial
;     buffer_load_dword followed by a v_add_f32 + buffer_atomic_cmpswap
;     retry loop (.LBB34_1).
;
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing
; them by hand.
11621define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 {
11622; GFX12-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
11623; GFX12:       ; %bb.0:
11624; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
11625; GFX12-NEXT:    s_wait_expcnt 0x0
11626; GFX12-NEXT:    s_wait_samplecnt 0x0
11627; GFX12-NEXT:    s_wait_bvhcnt 0x0
11628; GFX12-NEXT:    s_wait_kmcnt 0x0
11629; GFX12-NEXT:    v_mov_b32_e32 v1, s16
11630; GFX12-NEXT:    global_wb scope:SCOPE_SYS
11631; GFX12-NEXT:    s_wait_storecnt 0x0
11632; GFX12-NEXT:    buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
11633; GFX12-NEXT:    s_wait_loadcnt 0x0
11634; GFX12-NEXT:    global_inv scope:SCOPE_SYS
11635; GFX12-NEXT:    s_setpc_b64 s[30:31]
11636;
11637; GFX940-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
11638; GFX940:       ; %bb.0:
11639; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11640; GFX940-NEXT:    v_mov_b32_e32 v1, s16
11641; GFX940-NEXT:    buffer_wbl2 sc0 sc1
11642; GFX940-NEXT:    buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0
11643; GFX940-NEXT:    s_waitcnt vmcnt(0)
11644; GFX940-NEXT:    buffer_inv sc0 sc1
11645; GFX940-NEXT:    s_setpc_b64 s[30:31]
11646;
11647; GFX11-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
11648; GFX11:       ; %bb.0:
11649; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11650; GFX11-NEXT:    v_mov_b32_e32 v1, s16
11651; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
11652; GFX11-NEXT:    buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 glc
11653; GFX11-NEXT:    s_waitcnt vmcnt(0)
11654; GFX11-NEXT:    buffer_gl1_inv
11655; GFX11-NEXT:    buffer_gl0_inv
11656; GFX11-NEXT:    s_setpc_b64 s[30:31]
11657;
11658; GFX10-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
11659; GFX10:       ; %bb.0:
11660; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11661; GFX10-NEXT:    v_mov_b32_e32 v2, v0
11662; GFX10-NEXT:    v_mov_b32_e32 v0, s20
11663; GFX10-NEXT:    s_add_i32 s4, s20, 0x400
11664; GFX10-NEXT:    v_mov_b32_e32 v3, s4
11665; GFX10-NEXT:    s_mov_b32 s4, 0
11666; GFX10-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
11667; GFX10-NEXT:  .LBB34_1: ; %atomicrmw.start
11668; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
11669; GFX10-NEXT:    s_waitcnt vmcnt(0)
11670; GFX10-NEXT:    v_mov_b32_e32 v5, v0
11671; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
11672; GFX10-NEXT:    v_add_f32_e32 v4, v5, v2
11673; GFX10-NEXT:    v_mov_b32_e32 v0, v4
11674; GFX10-NEXT:    v_mov_b32_e32 v1, v5
11675; GFX10-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
11676; GFX10-NEXT:    s_waitcnt vmcnt(0)
11677; GFX10-NEXT:    buffer_gl1_inv
11678; GFX10-NEXT:    buffer_gl0_inv
11679; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
11680; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
11681; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
11682; GFX10-NEXT:    s_cbranch_execnz .LBB34_1
11683; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
11684; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
11685; GFX10-NEXT:    s_setpc_b64 s[30:31]
11686;
11687; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
11688; GFX90A:       ; %bb.0:
11689; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11690; GFX90A-NEXT:    v_mov_b32_e32 v2, v0
11691; GFX90A-NEXT:    v_mov_b32_e32 v0, s20
11692; GFX90A-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
11693; GFX90A-NEXT:    s_add_i32 s6, s20, 0x400
11694; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
11695; GFX90A-NEXT:    v_mov_b32_e32 v3, s6
11696; GFX90A-NEXT:  .LBB34_1: ; %atomicrmw.start
11697; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
11698; GFX90A-NEXT:    s_waitcnt vmcnt(0)
11699; GFX90A-NEXT:    v_mov_b32_e32 v5, v0
11700; GFX90A-NEXT:    v_add_f32_e32 v4, v5, v2
11701; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
11702; GFX90A-NEXT:    buffer_wbl2
11703; GFX90A-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
11704; GFX90A-NEXT:    s_waitcnt vmcnt(0)
11705; GFX90A-NEXT:    buffer_invl2
11706; GFX90A-NEXT:    buffer_wbinvl1
11707; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
11708; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11709; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11710; GFX90A-NEXT:    s_cbranch_execnz .LBB34_1
11711; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
11712; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
11713; GFX90A-NEXT:    s_setpc_b64 s[30:31]
11714;
11715; GFX908-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
11716; GFX908:       ; %bb.0:
11717; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11718; GFX908-NEXT:    v_mov_b32_e32 v2, v0
11719; GFX908-NEXT:    v_mov_b32_e32 v0, s20
11720; GFX908-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
11721; GFX908-NEXT:    s_add_i32 s6, s20, 0x400
11722; GFX908-NEXT:    s_mov_b64 s[4:5], 0
11723; GFX908-NEXT:    v_mov_b32_e32 v3, s6
11724; GFX908-NEXT:  .LBB34_1: ; %atomicrmw.start
11725; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
11726; GFX908-NEXT:    s_waitcnt vmcnt(0)
11727; GFX908-NEXT:    v_mov_b32_e32 v5, v0
11728; GFX908-NEXT:    v_add_f32_e32 v4, v5, v2
11729; GFX908-NEXT:    v_mov_b32_e32 v0, v4
11730; GFX908-NEXT:    v_mov_b32_e32 v1, v5
11731; GFX908-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
11732; GFX908-NEXT:    s_waitcnt vmcnt(0)
11733; GFX908-NEXT:    buffer_wbinvl1
11734; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
11735; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11736; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11737; GFX908-NEXT:    s_cbranch_execnz .LBB34_1
11738; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
11739; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
11740; GFX908-NEXT:    s_setpc_b64 s[30:31]
11741;
11742; GFX8-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
11743; GFX8:       ; %bb.0:
11744; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11745; GFX8-NEXT:    v_mov_b32_e32 v2, v0
11746; GFX8-NEXT:    v_mov_b32_e32 v0, s20
11747; GFX8-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
11748; GFX8-NEXT:    s_add_i32 s6, s20, 0x400
11749; GFX8-NEXT:    s_mov_b64 s[4:5], 0
11750; GFX8-NEXT:    v_mov_b32_e32 v3, s6
11751; GFX8-NEXT:  .LBB34_1: ; %atomicrmw.start
11752; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
11753; GFX8-NEXT:    s_waitcnt vmcnt(0)
11754; GFX8-NEXT:    v_mov_b32_e32 v5, v0
11755; GFX8-NEXT:    v_add_f32_e32 v4, v5, v2
11756; GFX8-NEXT:    v_mov_b32_e32 v0, v4
11757; GFX8-NEXT:    v_mov_b32_e32 v1, v5
11758; GFX8-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
11759; GFX8-NEXT:    s_waitcnt vmcnt(0)
11760; GFX8-NEXT:    buffer_wbinvl1
11761; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
11762; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11763; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11764; GFX8-NEXT:    s_cbranch_execnz .LBB34_1
11765; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
11766; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
11767; GFX8-NEXT:    s_setpc_b64 s[30:31]
11768;
11769; GFX7-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
11770; GFX7:       ; %bb.0:
11771; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11772; GFX7-NEXT:    v_mov_b32_e32 v2, v0
11773; GFX7-NEXT:    v_mov_b32_e32 v0, s20
11774; GFX7-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
11775; GFX7-NEXT:    s_add_i32 s6, s20, 0x400
11776; GFX7-NEXT:    s_mov_b64 s[4:5], 0
11777; GFX7-NEXT:    v_mov_b32_e32 v3, s6
11778; GFX7-NEXT:  .LBB34_1: ; %atomicrmw.start
11779; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
11780; GFX7-NEXT:    s_waitcnt vmcnt(0)
11781; GFX7-NEXT:    v_mov_b32_e32 v5, v0
11782; GFX7-NEXT:    v_add_f32_e32 v4, v5, v2
11783; GFX7-NEXT:    v_mov_b32_e32 v0, v4
11784; GFX7-NEXT:    v_mov_b32_e32 v1, v5
11785; GFX7-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
11786; GFX7-NEXT:    s_waitcnt vmcnt(0)
11787; GFX7-NEXT:    buffer_wbinvl1
11788; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
11789; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11790; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11791; GFX7-NEXT:    s_cbranch_execnz .LBB34_1
11792; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
11793; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
11794; GFX7-NEXT:    s_setpc_b64 s[30:31]
11795;
11796; GFX6-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
11797; GFX6:       ; %bb.0:
11798; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11799; GFX6-NEXT:    v_mov_b32_e32 v2, v0
11800; GFX6-NEXT:    v_mov_b32_e32 v0, s20
11801; GFX6-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
11802; GFX6-NEXT:    s_add_i32 s6, s20, 0x400
11803; GFX6-NEXT:    s_mov_b64 s[4:5], 0
11804; GFX6-NEXT:    v_mov_b32_e32 v3, s6
11805; GFX6-NEXT:  .LBB34_1: ; %atomicrmw.start
11806; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
11807; GFX6-NEXT:    s_waitcnt vmcnt(0)
11808; GFX6-NEXT:    v_mov_b32_e32 v5, v0
11809; GFX6-NEXT:    v_add_f32_e32 v4, v5, v2
11810; GFX6-NEXT:    s_waitcnt expcnt(0)
11811; GFX6-NEXT:    v_mov_b32_e32 v0, v4
11812; GFX6-NEXT:    v_mov_b32_e32 v1, v5
11813; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
11814; GFX6-NEXT:    s_waitcnt vmcnt(0)
11815; GFX6-NEXT:    buffer_wbinvl1
11816; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
11817; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11818; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11819; GFX6-NEXT:    s_cbranch_execnz .LBB34_1
11820; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
11821; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
11822; GFX6-NEXT:    s_waitcnt expcnt(0)
11823; GFX6-NEXT:    s_setpc_b64 s[30:31]
11824  %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
11825  %result = atomicrmw fadd ptr addrspace(7) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
11826  ret float %result
11827}
11828
; Attribute group referenced as #0 by the test functions: nounwind only.
11829attributes #0 = { nounwind }
11830
; Empty metadata node used as the operand of the !amdgpu.* annotations
; attached to the atomicrmw instructions.
11831!0 = !{}
11832