xref: /llvm-project/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll (revision eeac0ffaf46cf9f9b0f680b9940cc4b68a0286d8)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
3; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx940 < %s | FileCheck -check-prefix=GFX940 %s
4; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
5; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
6; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
7; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
8; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s
9; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s
10; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s
11
12; --------------------------------------------------------------------
13; float
14; --------------------------------------------------------------------
15
; Returning form: agent-scope, seq_cst `atomicrmw fmin` on a float through an
; inreg ptr addrspace(7) pointer, tagged with !amdgpu.no.fine.grained.memory.
; The result of the atomic is the function's return value.
; The GEP of 256 floats is a 1024-byte offset, which appears in the checks as
; `offset:1024` (or as `s_add_i32 ..., 0x400` for the cmpswap address).
; Per the checks below, GFX12/GFX11/GFX10/GFX7/GFX6 emit a single returning
; buffer float-min atomic, while GFX940/GFX90A/GFX908/GFX8 emit a
; buffer_load_dword followed by a buffer_atomic_cmpswap loop (%atomicrmw.start).
; The check lines are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
; NOTE(review): attribute group #0 and metadata node !0 are defined outside
; this view - confirm their contents there.
16define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 {
17; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
18; GFX12:       ; %bb.0:
19; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
20; GFX12-NEXT:    s_wait_expcnt 0x0
21; GFX12-NEXT:    s_wait_samplecnt 0x0
22; GFX12-NEXT:    s_wait_bvhcnt 0x0
23; GFX12-NEXT:    s_wait_kmcnt 0x0
24; GFX12-NEXT:    v_mov_b32_e32 v1, s16
25; GFX12-NEXT:    s_wait_storecnt 0x0
26; GFX12-NEXT:    buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
27; GFX12-NEXT:    s_wait_loadcnt 0x0
28; GFX12-NEXT:    global_inv scope:SCOPE_DEV
29; GFX12-NEXT:    s_setpc_b64 s[30:31]
30;
31; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
32; GFX940:       ; %bb.0:
33; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34; GFX940-NEXT:    v_mov_b32_e32 v1, v0
35; GFX940-NEXT:    v_mov_b32_e32 v0, s16
36; GFX940-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
37; GFX940-NEXT:    s_add_i32 s6, s16, 0x400
38; GFX940-NEXT:    s_mov_b64 s[4:5], 0
39; GFX940-NEXT:    v_max_f32_e32 v2, v1, v1
40; GFX940-NEXT:    v_mov_b32_e32 v3, s6
41; GFX940-NEXT:  .LBB0_1: ; %atomicrmw.start
42; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
43; GFX940-NEXT:    s_waitcnt vmcnt(0)
44; GFX940-NEXT:    v_mov_b32_e32 v5, v0
45; GFX940-NEXT:    v_max_f32_e32 v0, v5, v5
46; GFX940-NEXT:    v_min_f32_e32 v4, v0, v2
47; GFX940-NEXT:    v_mov_b64_e32 v[0:1], v[4:5]
48; GFX940-NEXT:    buffer_wbl2 sc1
49; GFX940-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
50; GFX940-NEXT:    s_waitcnt vmcnt(0)
51; GFX940-NEXT:    buffer_inv sc1
52; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
53; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
54; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
55; GFX940-NEXT:    s_cbranch_execnz .LBB0_1
56; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
57; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
58; GFX940-NEXT:    s_setpc_b64 s[30:31]
59;
60; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
61; GFX11:       ; %bb.0:
62; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
63; GFX11-NEXT:    v_mov_b32_e32 v1, s16
64; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
65; GFX11-NEXT:    buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen offset:1024 glc
66; GFX11-NEXT:    s_waitcnt vmcnt(0)
67; GFX11-NEXT:    buffer_gl1_inv
68; GFX11-NEXT:    buffer_gl0_inv
69; GFX11-NEXT:    s_setpc_b64 s[30:31]
70;
71; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
72; GFX10:       ; %bb.0:
73; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
74; GFX10-NEXT:    v_mov_b32_e32 v1, s20
75; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
76; GFX10-NEXT:    buffer_atomic_fmin v0, v1, s[16:19], 0 offen offset:1024 glc
77; GFX10-NEXT:    s_waitcnt vmcnt(0)
78; GFX10-NEXT:    buffer_gl1_inv
79; GFX10-NEXT:    buffer_gl0_inv
80; GFX10-NEXT:    s_setpc_b64 s[30:31]
81;
82; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
83; GFX90A:       ; %bb.0:
84; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
85; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
86; GFX90A-NEXT:    v_mov_b32_e32 v0, s20
87; GFX90A-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
88; GFX90A-NEXT:    s_add_i32 s6, s20, 0x400
89; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
90; GFX90A-NEXT:    v_max_f32_e32 v2, v1, v1
91; GFX90A-NEXT:    v_mov_b32_e32 v3, s6
92; GFX90A-NEXT:  .LBB0_1: ; %atomicrmw.start
93; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
94; GFX90A-NEXT:    s_waitcnt vmcnt(0)
95; GFX90A-NEXT:    v_mov_b32_e32 v5, v0
96; GFX90A-NEXT:    v_max_f32_e32 v0, v5, v5
97; GFX90A-NEXT:    v_min_f32_e32 v4, v0, v2
98; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
99; GFX90A-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
100; GFX90A-NEXT:    s_waitcnt vmcnt(0)
101; GFX90A-NEXT:    buffer_wbinvl1
102; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
103; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
104; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
105; GFX90A-NEXT:    s_cbranch_execnz .LBB0_1
106; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
107; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
108; GFX90A-NEXT:    s_setpc_b64 s[30:31]
109;
110; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
111; GFX908:       ; %bb.0:
112; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
113; GFX908-NEXT:    v_mov_b32_e32 v1, v0
114; GFX908-NEXT:    v_mov_b32_e32 v0, s20
115; GFX908-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
116; GFX908-NEXT:    s_add_i32 s6, s20, 0x400
117; GFX908-NEXT:    s_mov_b64 s[4:5], 0
118; GFX908-NEXT:    v_max_f32_e32 v2, v1, v1
119; GFX908-NEXT:    v_mov_b32_e32 v3, s6
120; GFX908-NEXT:  .LBB0_1: ; %atomicrmw.start
121; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
122; GFX908-NEXT:    s_waitcnt vmcnt(0)
123; GFX908-NEXT:    v_mov_b32_e32 v5, v0
124; GFX908-NEXT:    v_max_f32_e32 v0, v5, v5
125; GFX908-NEXT:    v_min_f32_e32 v4, v0, v2
126; GFX908-NEXT:    v_mov_b32_e32 v0, v4
127; GFX908-NEXT:    v_mov_b32_e32 v1, v5
128; GFX908-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
129; GFX908-NEXT:    s_waitcnt vmcnt(0)
130; GFX908-NEXT:    buffer_wbinvl1
131; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
132; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
133; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
134; GFX908-NEXT:    s_cbranch_execnz .LBB0_1
135; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
136; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
137; GFX908-NEXT:    s_setpc_b64 s[30:31]
138;
139; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
140; GFX8:       ; %bb.0:
141; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
142; GFX8-NEXT:    v_mov_b32_e32 v1, v0
143; GFX8-NEXT:    v_mov_b32_e32 v0, s20
144; GFX8-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
145; GFX8-NEXT:    s_add_i32 s6, s20, 0x400
146; GFX8-NEXT:    s_mov_b64 s[4:5], 0
147; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v1
148; GFX8-NEXT:    v_mov_b32_e32 v3, s6
149; GFX8-NEXT:  .LBB0_1: ; %atomicrmw.start
150; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
151; GFX8-NEXT:    s_waitcnt vmcnt(0)
152; GFX8-NEXT:    v_mov_b32_e32 v5, v0
153; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v5
154; GFX8-NEXT:    v_min_f32_e32 v4, v0, v2
155; GFX8-NEXT:    v_mov_b32_e32 v0, v4
156; GFX8-NEXT:    v_mov_b32_e32 v1, v5
157; GFX8-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
158; GFX8-NEXT:    s_waitcnt vmcnt(0)
159; GFX8-NEXT:    buffer_wbinvl1
160; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
161; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
162; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
163; GFX8-NEXT:    s_cbranch_execnz .LBB0_1
164; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
165; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
166; GFX8-NEXT:    s_setpc_b64 s[30:31]
167;
168; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
169; GFX7:       ; %bb.0:
170; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
171; GFX7-NEXT:    v_mov_b32_e32 v1, s20
172; GFX7-NEXT:    buffer_atomic_fmin v0, v1, s[16:19], 0 offen offset:1024 glc
173; GFX7-NEXT:    s_waitcnt vmcnt(0)
174; GFX7-NEXT:    buffer_wbinvl1
175; GFX7-NEXT:    s_setpc_b64 s[30:31]
176;
177; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
178; GFX6:       ; %bb.0:
179; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
180; GFX6-NEXT:    v_mov_b32_e32 v1, s20
181; GFX6-NEXT:    buffer_atomic_fmin v0, v1, s[16:19], 0 offen offset:1024 glc
182; GFX6-NEXT:    s_waitcnt vmcnt(0)
183; GFX6-NEXT:    buffer_wbinvl1
184; GFX6-NEXT:    s_waitcnt expcnt(0)
185; GFX6-NEXT:    s_setpc_b64 s[30:31]
186  %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
187  %result = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
188  ret float %result
189}
190
; No-return form: the same agent-scope, seq_cst `atomicrmw fmin` on a float
; through an inreg ptr addrspace(7) pointer at a 256-float (1024-byte) offset,
; tagged with !amdgpu.no.fine.grained.memory, but the result (%unused) is
; discarded and the function returns void.
; Per the checks below, GFX12/GFX11/GFX10/GFX7/GFX6 emit a single buffer
; float-min atomic without the returning modifier (no `glc` and no
; `th:TH_ATOMIC_RETURN`), while GFX940/GFX90A/GFX908/GFX8 still emit a
; buffer_load_dword followed by a buffer_atomic_cmpswap loop (%atomicrmw.start).
; The check lines are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
; NOTE(review): attribute group #0 and metadata node !0 are defined outside
; this view - confirm their contents there.
191define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 {
192; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory:
193; GFX12:       ; %bb.0:
194; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
195; GFX12-NEXT:    s_wait_expcnt 0x0
196; GFX12-NEXT:    s_wait_samplecnt 0x0
197; GFX12-NEXT:    s_wait_bvhcnt 0x0
198; GFX12-NEXT:    s_wait_kmcnt 0x0
199; GFX12-NEXT:    v_mov_b32_e32 v1, s16
200; GFX12-NEXT:    s_wait_storecnt 0x0
201; GFX12-NEXT:    buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024
202; GFX12-NEXT:    s_wait_storecnt 0x0
203; GFX12-NEXT:    global_inv scope:SCOPE_DEV
204; GFX12-NEXT:    s_setpc_b64 s[30:31]
205;
206; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory:
207; GFX940:       ; %bb.0:
208; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
209; GFX940-NEXT:    v_mov_b32_e32 v1, s16
210; GFX940-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
211; GFX940-NEXT:    s_add_i32 s6, s16, 0x400
212; GFX940-NEXT:    s_mov_b64 s[4:5], 0
213; GFX940-NEXT:    v_max_f32_e32 v2, v0, v0
214; GFX940-NEXT:    v_mov_b32_e32 v3, s6
215; GFX940-NEXT:  .LBB1_1: ; %atomicrmw.start
216; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
217; GFX940-NEXT:    s_waitcnt vmcnt(0)
218; GFX940-NEXT:    v_max_f32_e32 v0, v1, v1
219; GFX940-NEXT:    v_min_f32_e32 v0, v0, v2
220; GFX940-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
221; GFX940-NEXT:    buffer_wbl2 sc1
222; GFX940-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0
223; GFX940-NEXT:    s_waitcnt vmcnt(0)
224; GFX940-NEXT:    buffer_inv sc1
225; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
226; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
227; GFX940-NEXT:    v_mov_b32_e32 v1, v4
228; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
229; GFX940-NEXT:    s_cbranch_execnz .LBB1_1
230; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
231; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
232; GFX940-NEXT:    s_setpc_b64 s[30:31]
233;
234; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory:
235; GFX11:       ; %bb.0:
236; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
237; GFX11-NEXT:    v_mov_b32_e32 v1, s16
238; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
239; GFX11-NEXT:    buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen offset:1024
240; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
241; GFX11-NEXT:    buffer_gl1_inv
242; GFX11-NEXT:    buffer_gl0_inv
243; GFX11-NEXT:    s_setpc_b64 s[30:31]
244;
245; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory:
246; GFX10:       ; %bb.0:
247; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
248; GFX10-NEXT:    v_mov_b32_e32 v1, s20
249; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
250; GFX10-NEXT:    buffer_atomic_fmin v0, v1, s[16:19], 0 offen offset:1024
251; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
252; GFX10-NEXT:    buffer_gl1_inv
253; GFX10-NEXT:    buffer_gl0_inv
254; GFX10-NEXT:    s_setpc_b64 s[30:31]
255;
256; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory:
257; GFX90A:       ; %bb.0:
258; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
259; GFX90A-NEXT:    v_mov_b32_e32 v1, s20
260; GFX90A-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
261; GFX90A-NEXT:    s_add_i32 s6, s20, 0x400
262; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
263; GFX90A-NEXT:    v_max_f32_e32 v2, v0, v0
264; GFX90A-NEXT:    v_mov_b32_e32 v3, s6
265; GFX90A-NEXT:  .LBB1_1: ; %atomicrmw.start
266; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
267; GFX90A-NEXT:    s_waitcnt vmcnt(0)
268; GFX90A-NEXT:    v_max_f32_e32 v0, v1, v1
269; GFX90A-NEXT:    v_min_f32_e32 v0, v0, v2
270; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
271; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
272; GFX90A-NEXT:    s_waitcnt vmcnt(0)
273; GFX90A-NEXT:    buffer_wbinvl1
274; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
275; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
276; GFX90A-NEXT:    v_mov_b32_e32 v1, v4
277; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
278; GFX90A-NEXT:    s_cbranch_execnz .LBB1_1
279; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
280; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
281; GFX90A-NEXT:    s_setpc_b64 s[30:31]
282;
283; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory:
284; GFX908:       ; %bb.0:
285; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
286; GFX908-NEXT:    v_mov_b32_e32 v1, s20
287; GFX908-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
288; GFX908-NEXT:    s_add_i32 s6, s20, 0x400
289; GFX908-NEXT:    s_mov_b64 s[4:5], 0
290; GFX908-NEXT:    v_max_f32_e32 v2, v0, v0
291; GFX908-NEXT:    v_mov_b32_e32 v3, s6
292; GFX908-NEXT:  .LBB1_1: ; %atomicrmw.start
293; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
294; GFX908-NEXT:    s_waitcnt vmcnt(0)
295; GFX908-NEXT:    v_max_f32_e32 v0, v1, v1
296; GFX908-NEXT:    v_min_f32_e32 v0, v0, v2
297; GFX908-NEXT:    v_mov_b32_e32 v5, v1
298; GFX908-NEXT:    v_mov_b32_e32 v4, v0
299; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
300; GFX908-NEXT:    s_waitcnt vmcnt(0)
301; GFX908-NEXT:    buffer_wbinvl1
302; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
303; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
304; GFX908-NEXT:    v_mov_b32_e32 v1, v4
305; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
306; GFX908-NEXT:    s_cbranch_execnz .LBB1_1
307; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
308; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
309; GFX908-NEXT:    s_setpc_b64 s[30:31]
310;
311; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory:
312; GFX8:       ; %bb.0:
313; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
314; GFX8-NEXT:    v_mov_b32_e32 v1, s20
315; GFX8-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
316; GFX8-NEXT:    s_add_i32 s6, s20, 0x400
317; GFX8-NEXT:    s_mov_b64 s[4:5], 0
318; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v0
319; GFX8-NEXT:    v_mov_b32_e32 v3, s6
320; GFX8-NEXT:  .LBB1_1: ; %atomicrmw.start
321; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
322; GFX8-NEXT:    s_waitcnt vmcnt(0)
323; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v1
324; GFX8-NEXT:    v_min_f32_e32 v0, v0, v2
325; GFX8-NEXT:    v_mov_b32_e32 v5, v1
326; GFX8-NEXT:    v_mov_b32_e32 v4, v0
327; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
328; GFX8-NEXT:    s_waitcnt vmcnt(0)
329; GFX8-NEXT:    buffer_wbinvl1
330; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
331; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
332; GFX8-NEXT:    v_mov_b32_e32 v1, v4
333; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
334; GFX8-NEXT:    s_cbranch_execnz .LBB1_1
335; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
336; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
337; GFX8-NEXT:    s_setpc_b64 s[30:31]
338;
339; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory:
340; GFX7:       ; %bb.0:
341; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
342; GFX7-NEXT:    v_mov_b32_e32 v1, s20
343; GFX7-NEXT:    buffer_atomic_fmin v0, v1, s[16:19], 0 offen offset:1024
344; GFX7-NEXT:    s_waitcnt vmcnt(0)
345; GFX7-NEXT:    buffer_wbinvl1
346; GFX7-NEXT:    s_setpc_b64 s[30:31]
347;
348; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory:
349; GFX6:       ; %bb.0:
350; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
351; GFX6-NEXT:    v_mov_b32_e32 v1, s20
352; GFX6-NEXT:    buffer_atomic_fmin v0, v1, s[16:19], 0 offen offset:1024
353; GFX6-NEXT:    s_waitcnt vmcnt(0)
354; GFX6-NEXT:    buffer_wbinvl1
355; GFX6-NEXT:    s_waitcnt expcnt(0)
356; GFX6-NEXT:    s_setpc_b64 s[30:31]
357  %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
358  %unused = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
359  ret void
360}
361
362define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, float %val) #0 {
363; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
364; GFX12:       ; %bb.0:
365; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
366; GFX12-NEXT:    s_wait_expcnt 0x0
367; GFX12-NEXT:    s_wait_samplecnt 0x0
368; GFX12-NEXT:    s_wait_bvhcnt 0x0
369; GFX12-NEXT:    s_wait_kmcnt 0x0
370; GFX12-NEXT:    s_mov_b32 s1, exec_lo
371; GFX12-NEXT:    s_wait_storecnt 0x0
372; GFX12-NEXT:  .LBB2_1: ; =>This Inner Loop Header: Depth=1
373; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
374; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
375; GFX12-NEXT:    v_readfirstlane_b32 s6, v2
376; GFX12-NEXT:    v_readfirstlane_b32 s7, v3
377; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
378; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
379; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
380; GFX12-NEXT:    s_wait_alu 0xfffe
381; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
382; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
383; GFX12-NEXT:    s_wait_alu 0xfffe
384; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
385; GFX12-NEXT:    s_wait_loadcnt 0x0
386; GFX12-NEXT:    buffer_atomic_min_num_f32 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN
387; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
388; GFX12-NEXT:    ; implicit-def: $vgpr4
389; GFX12-NEXT:    s_wait_alu 0xfffe
390; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
391; GFX12-NEXT:    s_cbranch_execnz .LBB2_1
392; GFX12-NEXT:  ; %bb.2:
393; GFX12-NEXT:    s_mov_b32 exec_lo, s1
394; GFX12-NEXT:    s_wait_loadcnt 0x0
395; GFX12-NEXT:    v_mov_b32_e32 v0, v5
396; GFX12-NEXT:    global_inv scope:SCOPE_DEV
397; GFX12-NEXT:    s_wait_alu 0xfffe
398; GFX12-NEXT:    s_setpc_b64 s[30:31]
399;
400; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
401; GFX940:       ; %bb.0:
402; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
403; GFX940-NEXT:    v_add_u32_e32 v8, 0x400, v4
404; GFX940-NEXT:    s_mov_b64 s[2:3], exec
405; GFX940-NEXT:  .LBB2_1: ; =>This Inner Loop Header: Depth=1
406; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
407; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
408; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
409; GFX940-NEXT:    v_readfirstlane_b32 s7, v3
410; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
411; GFX940-NEXT:    s_nop 0
412; GFX940-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
413; GFX940-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
414; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
415; GFX940-NEXT:    buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024
416; GFX940-NEXT:    ; implicit-def: $vgpr4
417; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
418; GFX940-NEXT:    s_cbranch_execnz .LBB2_1
419; GFX940-NEXT:  ; %bb.2:
420; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
421; GFX940-NEXT:    s_mov_b64 s[2:3], 0
422; GFX940-NEXT:    v_max_f32_e32 v9, v5, v5
423; GFX940-NEXT:  .LBB2_3: ; %atomicrmw.start
424; GFX940-NEXT:    ; =>This Loop Header: Depth=1
425; GFX940-NEXT:    ; Child Loop BB2_4 Depth 2
426; GFX940-NEXT:    s_waitcnt vmcnt(0)
427; GFX940-NEXT:    v_max_f32_e32 v4, v7, v7
428; GFX940-NEXT:    v_min_f32_e32 v6, v4, v9
429; GFX940-NEXT:    s_mov_b64 s[8:9], exec
430; GFX940-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
431; GFX940-NEXT:    buffer_wbl2 sc1
432; GFX940-NEXT:  .LBB2_4: ; Parent Loop BB2_3 Depth=1
433; GFX940-NEXT:    ; => This Inner Loop Header: Depth=2
434; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
435; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
436; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
437; GFX940-NEXT:    v_readfirstlane_b32 s7, v3
438; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
439; GFX940-NEXT:    s_nop 0
440; GFX940-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
441; GFX940-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
442; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
443; GFX940-NEXT:    s_waitcnt vmcnt(0)
444; GFX940-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
445; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
446; GFX940-NEXT:    s_cbranch_execnz .LBB2_4
447; GFX940-NEXT:  ; %bb.5: ; in Loop: Header=BB2_3 Depth=1
448; GFX940-NEXT:    s_mov_b64 exec, s[8:9]
449; GFX940-NEXT:    s_waitcnt vmcnt(0)
450; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
451; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
452; GFX940-NEXT:    v_mov_b32_e32 v7, v4
453; GFX940-NEXT:    buffer_inv sc1
454; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
455; GFX940-NEXT:    s_cbranch_execnz .LBB2_3
456; GFX940-NEXT:  ; %bb.6: ; %atomicrmw.end
457; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
458; GFX940-NEXT:    v_mov_b32_e32 v0, v4
459; GFX940-NEXT:    s_setpc_b64 s[30:31]
460;
461; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
462; GFX11:       ; %bb.0:
463; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
464; GFX11-NEXT:    s_mov_b32 s1, exec_lo
465; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
466; GFX11-NEXT:  .LBB2_1: ; =>This Inner Loop Header: Depth=1
467; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
468; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
469; GFX11-NEXT:    v_readfirstlane_b32 s6, v2
470; GFX11-NEXT:    v_readfirstlane_b32 s7, v3
471; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
472; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
473; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
474; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
475; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
476; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
477; GFX11-NEXT:    s_waitcnt vmcnt(0)
478; GFX11-NEXT:    buffer_atomic_min_f32 v5, v4, s[4:7], 0 offen offset:1024 glc
479; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
480; GFX11-NEXT:    ; implicit-def: $vgpr4
481; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
482; GFX11-NEXT:    s_cbranch_execnz .LBB2_1
483; GFX11-NEXT:  ; %bb.2:
484; GFX11-NEXT:    s_mov_b32 exec_lo, s1
485; GFX11-NEXT:    s_waitcnt vmcnt(0)
486; GFX11-NEXT:    v_mov_b32_e32 v0, v5
487; GFX11-NEXT:    buffer_gl1_inv
488; GFX11-NEXT:    buffer_gl0_inv
489; GFX11-NEXT:    s_setpc_b64 s[30:31]
490;
491; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
492; GFX10:       ; %bb.0:
493; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
494; GFX10-NEXT:    s_mov_b32 s5, exec_lo
495; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
496; GFX10-NEXT:  .LBB2_1: ; =>This Inner Loop Header: Depth=1
497; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
498; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
499; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
500; GFX10-NEXT:    v_readfirstlane_b32 s11, v3
501; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
502; GFX10-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
503; GFX10-NEXT:    s_and_b32 s4, vcc_lo, s4
504; GFX10-NEXT:    s_and_saveexec_b32 s4, s4
505; GFX10-NEXT:    s_waitcnt vmcnt(0)
506; GFX10-NEXT:    buffer_atomic_fmin v5, v4, s[8:11], 0 offen offset:1024 glc
507; GFX10-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
508; GFX10-NEXT:    ; implicit-def: $vgpr4
509; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
510; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
511; GFX10-NEXT:    s_cbranch_execnz .LBB2_1
512; GFX10-NEXT:  ; %bb.2:
513; GFX10-NEXT:    s_mov_b32 exec_lo, s5
514; GFX10-NEXT:    s_waitcnt vmcnt(0)
515; GFX10-NEXT:    v_mov_b32_e32 v0, v5
516; GFX10-NEXT:    buffer_gl1_inv
517; GFX10-NEXT:    buffer_gl0_inv
518; GFX10-NEXT:    s_setpc_b64 s[30:31]
519;
520; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
521; GFX90A:       ; %bb.0:
522; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
523; GFX90A-NEXT:    v_add_u32_e32 v8, 0x400, v4
524; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
525; GFX90A-NEXT:  .LBB2_1: ; =>This Inner Loop Header: Depth=1
526; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
527; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
528; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
529; GFX90A-NEXT:    v_readfirstlane_b32 s11, v3
530; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
531; GFX90A-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
532; GFX90A-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
533; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
534; GFX90A-NEXT:    s_nop 0
535; GFX90A-NEXT:    buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
536; GFX90A-NEXT:    ; implicit-def: $vgpr4
537; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
538; GFX90A-NEXT:    s_cbranch_execnz .LBB2_1
539; GFX90A-NEXT:  ; %bb.2:
540; GFX90A-NEXT:    s_mov_b64 exec, s[6:7]
541; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
542; GFX90A-NEXT:    v_max_f32_e32 v9, v5, v5
543; GFX90A-NEXT:  .LBB2_3: ; %atomicrmw.start
544; GFX90A-NEXT:    ; =>This Loop Header: Depth=1
545; GFX90A-NEXT:    ; Child Loop BB2_4 Depth 2
546; GFX90A-NEXT:    s_waitcnt vmcnt(0)
547; GFX90A-NEXT:    v_max_f32_e32 v4, v7, v7
548; GFX90A-NEXT:    v_min_f32_e32 v6, v4, v9
549; GFX90A-NEXT:    s_mov_b64 s[12:13], exec
550; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
551; GFX90A-NEXT:  .LBB2_4: ; Parent Loop BB2_3 Depth=1
552; GFX90A-NEXT:    ; => This Inner Loop Header: Depth=2
553; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
554; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
555; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
556; GFX90A-NEXT:    v_readfirstlane_b32 s11, v3
557; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
558; GFX90A-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
559; GFX90A-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
560; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
561; GFX90A-NEXT:    s_waitcnt vmcnt(0)
562; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
563; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
564; GFX90A-NEXT:    s_cbranch_execnz .LBB2_4
565; GFX90A-NEXT:  ; %bb.5: ; in Loop: Header=BB2_3 Depth=1
566; GFX90A-NEXT:    s_mov_b64 exec, s[12:13]
567; GFX90A-NEXT:    s_waitcnt vmcnt(0)
568; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
569; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
570; GFX90A-NEXT:    v_mov_b32_e32 v7, v4
571; GFX90A-NEXT:    buffer_wbinvl1
572; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
573; GFX90A-NEXT:    s_cbranch_execnz .LBB2_3
574; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.end
575; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
576; GFX90A-NEXT:    v_mov_b32_e32 v0, v4
577; GFX90A-NEXT:    s_setpc_b64 s[30:31]
578;
579; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
580; GFX908:       ; %bb.0:
581; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
582; GFX908-NEXT:    v_add_u32_e32 v7, 0x400, v4
583; GFX908-NEXT:    s_mov_b64 s[6:7], exec
584; GFX908-NEXT:  .LBB2_1: ; =>This Inner Loop Header: Depth=1
585; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
586; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
587; GFX908-NEXT:    v_readfirstlane_b32 s10, v2
588; GFX908-NEXT:    v_readfirstlane_b32 s11, v3
589; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
590; GFX908-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
591; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
592; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
593; GFX908-NEXT:    s_nop 0
594; GFX908-NEXT:    buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
595; GFX908-NEXT:    ; implicit-def: $vgpr4
596; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
597; GFX908-NEXT:    s_cbranch_execnz .LBB2_1
598; GFX908-NEXT:  ; %bb.2:
599; GFX908-NEXT:    s_mov_b64 exec, s[6:7]
600; GFX908-NEXT:    s_mov_b64 s[6:7], 0
601; GFX908-NEXT:    v_max_f32_e32 v8, v5, v5
602; GFX908-NEXT:  .LBB2_3: ; %atomicrmw.start
603; GFX908-NEXT:    ; =>This Loop Header: Depth=1
604; GFX908-NEXT:    ; Child Loop BB2_4 Depth 2
605; GFX908-NEXT:    s_waitcnt vmcnt(0)
606; GFX908-NEXT:    v_max_f32_e32 v4, v6, v6
607; GFX908-NEXT:    v_min_f32_e32 v5, v4, v8
608; GFX908-NEXT:    v_mov_b32_e32 v4, v5
609; GFX908-NEXT:    s_mov_b64 s[12:13], exec
610; GFX908-NEXT:    v_mov_b32_e32 v5, v6
611; GFX908-NEXT:  .LBB2_4: ; Parent Loop BB2_3 Depth=1
612; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
613; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
614; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
615; GFX908-NEXT:    v_readfirstlane_b32 s10, v2
616; GFX908-NEXT:    v_readfirstlane_b32 s11, v3
617; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
618; GFX908-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
619; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
620; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
621; GFX908-NEXT:    s_waitcnt vmcnt(0)
622; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
623; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
624; GFX908-NEXT:    s_cbranch_execnz .LBB2_4
625; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB2_3 Depth=1
626; GFX908-NEXT:    s_mov_b64 exec, s[12:13]
627; GFX908-NEXT:    s_waitcnt vmcnt(0)
628; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
629; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
630; GFX908-NEXT:    v_mov_b32_e32 v6, v4
631; GFX908-NEXT:    buffer_wbinvl1
632; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
633; GFX908-NEXT:    s_cbranch_execnz .LBB2_3
634; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.end
635; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
636; GFX908-NEXT:    v_mov_b32_e32 v0, v4
637; GFX908-NEXT:    s_setpc_b64 s[30:31]
638;
639; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
640; GFX8:       ; %bb.0:
641; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
642; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x400, v4
643; GFX8-NEXT:    s_mov_b64 s[6:7], exec
644; GFX8-NEXT:  .LBB2_1: ; =>This Inner Loop Header: Depth=1
645; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
646; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
647; GFX8-NEXT:    v_readfirstlane_b32 s10, v2
648; GFX8-NEXT:    v_readfirstlane_b32 s11, v3
649; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
650; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
651; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
652; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
653; GFX8-NEXT:    s_nop 0
654; GFX8-NEXT:    buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
655; GFX8-NEXT:    ; implicit-def: $vgpr4
656; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
657; GFX8-NEXT:    s_cbranch_execnz .LBB2_1
658; GFX8-NEXT:  ; %bb.2:
659; GFX8-NEXT:    s_mov_b64 exec, s[6:7]
660; GFX8-NEXT:    s_mov_b64 s[6:7], 0
661; GFX8-NEXT:    v_mul_f32_e32 v8, 1.0, v5
662; GFX8-NEXT:  .LBB2_3: ; %atomicrmw.start
663; GFX8-NEXT:    ; =>This Loop Header: Depth=1
664; GFX8-NEXT:    ; Child Loop BB2_4 Depth 2
665; GFX8-NEXT:    s_waitcnt vmcnt(0)
666; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v6
667; GFX8-NEXT:    v_min_f32_e32 v5, v4, v8
668; GFX8-NEXT:    v_mov_b32_e32 v4, v5
669; GFX8-NEXT:    s_mov_b64 s[12:13], exec
670; GFX8-NEXT:    v_mov_b32_e32 v5, v6
671; GFX8-NEXT:  .LBB2_4: ; Parent Loop BB2_3 Depth=1
672; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
673; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
674; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
675; GFX8-NEXT:    v_readfirstlane_b32 s10, v2
676; GFX8-NEXT:    v_readfirstlane_b32 s11, v3
677; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
678; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
679; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
680; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
681; GFX8-NEXT:    s_waitcnt vmcnt(0)
682; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
683; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
684; GFX8-NEXT:    s_cbranch_execnz .LBB2_4
685; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB2_3 Depth=1
686; GFX8-NEXT:    s_mov_b64 exec, s[12:13]
687; GFX8-NEXT:    s_waitcnt vmcnt(0)
688; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
689; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
690; GFX8-NEXT:    v_mov_b32_e32 v6, v4
691; GFX8-NEXT:    buffer_wbinvl1
692; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
693; GFX8-NEXT:    s_cbranch_execnz .LBB2_3
694; GFX8-NEXT:  ; %bb.6: ; %atomicrmw.end
695; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
696; GFX8-NEXT:    v_mov_b32_e32 v0, v4
697; GFX8-NEXT:    s_setpc_b64 s[30:31]
698;
699; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
700; GFX7:       ; %bb.0:
701; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
702; GFX7-NEXT:    s_mov_b64 s[6:7], exec
703; GFX7-NEXT:  .LBB2_1: ; =>This Inner Loop Header: Depth=1
704; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
705; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
706; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
707; GFX7-NEXT:    v_readfirstlane_b32 s11, v3
708; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
709; GFX7-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
710; GFX7-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
711; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
712; GFX7-NEXT:    s_waitcnt vmcnt(0)
713; GFX7-NEXT:    buffer_atomic_fmin v5, v4, s[8:11], 0 offen offset:1024 glc
714; GFX7-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
715; GFX7-NEXT:    ; implicit-def: $vgpr4
716; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
717; GFX7-NEXT:    s_cbranch_execnz .LBB2_1
718; GFX7-NEXT:  ; %bb.2:
719; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
720; GFX7-NEXT:    s_waitcnt vmcnt(0)
721; GFX7-NEXT:    v_mov_b32_e32 v0, v5
722; GFX7-NEXT:    buffer_wbinvl1
723; GFX7-NEXT:    s_setpc_b64 s[30:31]
724;
725; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
726; GFX6:       ; %bb.0:
727; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
728; GFX6-NEXT:    s_mov_b64 s[6:7], exec
729; GFX6-NEXT:  .LBB2_1: ; =>This Inner Loop Header: Depth=1
730; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
731; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
732; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
733; GFX6-NEXT:    v_readfirstlane_b32 s11, v3
734; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
735; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
736; GFX6-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
737; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
738; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
739; GFX6-NEXT:    buffer_atomic_fmin v5, v4, s[8:11], 0 offen offset:1024 glc
740; GFX6-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
741; GFX6-NEXT:    ; implicit-def: $vgpr4
742; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
743; GFX6-NEXT:    s_cbranch_execnz .LBB2_1
744; GFX6-NEXT:  ; %bb.2:
745; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
746; GFX6-NEXT:    s_waitcnt vmcnt(0)
747; GFX6-NEXT:    v_mov_b32_e32 v0, v5
748; GFX6-NEXT:    buffer_wbinvl1
749; GFX6-NEXT:    s_waitcnt expcnt(0)
750; GFX6-NEXT:    s_setpc_b64 s[30:31]
751  %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
752  %result = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
753  ret float %result
754}
755
; Value-returning f32 `atomicrmw fmin` (syncscope "agent", seq_cst) through a
; buffer fat pointer (ptr addrspace(7)) at element index 256, annotated only
; with !amdgpu.no.remote.memory.
;
; What the checks below pin down:
;   * the GFX12 prefix expects a single buffer_atomic_min_num_f32;
;   * every other prefix (GFX940, GFX11, GFX10, GFX90A, GFX908, GFX8, GFX7,
;     GFX6) expects an initial buffer load followed by a
;     buffer_atomic_cmpswap retry loop (.LBB3_1).
; The getelementptr of 256 floats appears in the output as the 1024-byte
; (0x400) offset.
;
; These assertions are autogenerated by utils/update_llc_test_checks.py (see
; the first line of this file); regenerate them instead of editing by hand.
756define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, float %val) #0 {
757; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory:
758; GFX12:       ; %bb.0:
759; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
760; GFX12-NEXT:    s_wait_expcnt 0x0
761; GFX12-NEXT:    s_wait_samplecnt 0x0
762; GFX12-NEXT:    s_wait_bvhcnt 0x0
763; GFX12-NEXT:    s_wait_kmcnt 0x0
764; GFX12-NEXT:    v_mov_b32_e32 v1, s16
765; GFX12-NEXT:    s_wait_storecnt 0x0
766; GFX12-NEXT:    buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
767; GFX12-NEXT:    s_wait_loadcnt 0x0
768; GFX12-NEXT:    global_inv scope:SCOPE_DEV
769; GFX12-NEXT:    s_setpc_b64 s[30:31]
770;
771; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory:
772; GFX940:       ; %bb.0:
773; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
774; GFX940-NEXT:    v_mov_b32_e32 v1, v0
775; GFX940-NEXT:    v_mov_b32_e32 v0, s16
776; GFX940-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
777; GFX940-NEXT:    s_add_i32 s6, s16, 0x400
778; GFX940-NEXT:    s_mov_b64 s[4:5], 0
779; GFX940-NEXT:    v_max_f32_e32 v2, v1, v1
780; GFX940-NEXT:    v_mov_b32_e32 v3, s6
781; GFX940-NEXT:  .LBB3_1: ; %atomicrmw.start
782; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
783; GFX940-NEXT:    s_waitcnt vmcnt(0)
784; GFX940-NEXT:    v_mov_b32_e32 v5, v0
785; GFX940-NEXT:    v_max_f32_e32 v0, v5, v5
786; GFX940-NEXT:    v_min_f32_e32 v4, v0, v2
787; GFX940-NEXT:    v_mov_b64_e32 v[0:1], v[4:5]
788; GFX940-NEXT:    buffer_wbl2 sc1
789; GFX940-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
790; GFX940-NEXT:    s_waitcnt vmcnt(0)
791; GFX940-NEXT:    buffer_inv sc1
792; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
793; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
794; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
795; GFX940-NEXT:    s_cbranch_execnz .LBB3_1
796; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
797; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
798; GFX940-NEXT:    s_setpc_b64 s[30:31]
799;
800; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory:
801; GFX11:       ; %bb.0:
802; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
803; GFX11-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
804; GFX11-NEXT:    s_add_i32 s4, s16, 0x400
805; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
806; GFX11-NEXT:    v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1
807; GFX11-NEXT:    buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
808; GFX11-NEXT:    s_mov_b32 s4, 0
809; GFX11-NEXT:  .LBB3_1: ; %atomicrmw.start
810; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
811; GFX11-NEXT:    s_waitcnt vmcnt(0)
812; GFX11-NEXT:    v_mov_b32_e32 v5, v0
813; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
814; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
815; GFX11-NEXT:    v_max_f32_e32 v0, v5, v5
816; GFX11-NEXT:    v_min_f32_e32 v4, v0, v2
817; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
818; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
819; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
820; GFX11-NEXT:    s_waitcnt vmcnt(0)
821; GFX11-NEXT:    buffer_gl1_inv
822; GFX11-NEXT:    buffer_gl0_inv
823; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
824; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
825; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
826; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
827; GFX11-NEXT:    s_cbranch_execnz .LBB3_1
828; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
829; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
830; GFX11-NEXT:    s_setpc_b64 s[30:31]
831;
832; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory:
833; GFX10:       ; %bb.0:
834; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
835; GFX10-NEXT:    v_mov_b32_e32 v1, v0
836; GFX10-NEXT:    v_mov_b32_e32 v0, s20
837; GFX10-NEXT:    s_add_i32 s4, s20, 0x400
838; GFX10-NEXT:    v_mov_b32_e32 v3, s4
839; GFX10-NEXT:    v_max_f32_e32 v2, v1, v1
840; GFX10-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
841; GFX10-NEXT:    s_mov_b32 s4, 0
842; GFX10-NEXT:  .LBB3_1: ; %atomicrmw.start
843; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
844; GFX10-NEXT:    s_waitcnt vmcnt(0)
845; GFX10-NEXT:    v_mov_b32_e32 v5, v0
846; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
847; GFX10-NEXT:    v_max_f32_e32 v0, v5, v5
848; GFX10-NEXT:    v_min_f32_e32 v4, v0, v2
849; GFX10-NEXT:    v_mov_b32_e32 v0, v4
850; GFX10-NEXT:    v_mov_b32_e32 v1, v5
851; GFX10-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
852; GFX10-NEXT:    s_waitcnt vmcnt(0)
853; GFX10-NEXT:    buffer_gl1_inv
854; GFX10-NEXT:    buffer_gl0_inv
855; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
856; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
857; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
858; GFX10-NEXT:    s_cbranch_execnz .LBB3_1
859; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
860; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
861; GFX10-NEXT:    s_setpc_b64 s[30:31]
862;
863; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory:
864; GFX90A:       ; %bb.0:
865; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
866; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
867; GFX90A-NEXT:    v_mov_b32_e32 v0, s20
868; GFX90A-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
869; GFX90A-NEXT:    s_add_i32 s6, s20, 0x400
870; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
871; GFX90A-NEXT:    v_max_f32_e32 v2, v1, v1
872; GFX90A-NEXT:    v_mov_b32_e32 v3, s6
873; GFX90A-NEXT:  .LBB3_1: ; %atomicrmw.start
874; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
875; GFX90A-NEXT:    s_waitcnt vmcnt(0)
876; GFX90A-NEXT:    v_mov_b32_e32 v5, v0
877; GFX90A-NEXT:    v_max_f32_e32 v0, v5, v5
878; GFX90A-NEXT:    v_min_f32_e32 v4, v0, v2
879; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
880; GFX90A-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
881; GFX90A-NEXT:    s_waitcnt vmcnt(0)
882; GFX90A-NEXT:    buffer_wbinvl1
883; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
884; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
885; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
886; GFX90A-NEXT:    s_cbranch_execnz .LBB3_1
887; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
888; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
889; GFX90A-NEXT:    s_setpc_b64 s[30:31]
890;
891; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory:
892; GFX908:       ; %bb.0:
893; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
894; GFX908-NEXT:    v_mov_b32_e32 v1, v0
895; GFX908-NEXT:    v_mov_b32_e32 v0, s20
896; GFX908-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
897; GFX908-NEXT:    s_add_i32 s6, s20, 0x400
898; GFX908-NEXT:    s_mov_b64 s[4:5], 0
899; GFX908-NEXT:    v_max_f32_e32 v2, v1, v1
900; GFX908-NEXT:    v_mov_b32_e32 v3, s6
901; GFX908-NEXT:  .LBB3_1: ; %atomicrmw.start
902; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
903; GFX908-NEXT:    s_waitcnt vmcnt(0)
904; GFX908-NEXT:    v_mov_b32_e32 v5, v0
905; GFX908-NEXT:    v_max_f32_e32 v0, v5, v5
906; GFX908-NEXT:    v_min_f32_e32 v4, v0, v2
907; GFX908-NEXT:    v_mov_b32_e32 v0, v4
908; GFX908-NEXT:    v_mov_b32_e32 v1, v5
909; GFX908-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
910; GFX908-NEXT:    s_waitcnt vmcnt(0)
911; GFX908-NEXT:    buffer_wbinvl1
912; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
913; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
914; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
915; GFX908-NEXT:    s_cbranch_execnz .LBB3_1
916; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
917; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
918; GFX908-NEXT:    s_setpc_b64 s[30:31]
919;
920; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory:
921; GFX8:       ; %bb.0:
922; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
923; GFX8-NEXT:    v_mov_b32_e32 v1, v0
924; GFX8-NEXT:    v_mov_b32_e32 v0, s20
925; GFX8-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
926; GFX8-NEXT:    s_add_i32 s6, s20, 0x400
927; GFX8-NEXT:    s_mov_b64 s[4:5], 0
928; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v1
929; GFX8-NEXT:    v_mov_b32_e32 v3, s6
930; GFX8-NEXT:  .LBB3_1: ; %atomicrmw.start
931; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
932; GFX8-NEXT:    s_waitcnt vmcnt(0)
933; GFX8-NEXT:    v_mov_b32_e32 v5, v0
934; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v5
935; GFX8-NEXT:    v_min_f32_e32 v4, v0, v2
936; GFX8-NEXT:    v_mov_b32_e32 v0, v4
937; GFX8-NEXT:    v_mov_b32_e32 v1, v5
938; GFX8-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
939; GFX8-NEXT:    s_waitcnt vmcnt(0)
940; GFX8-NEXT:    buffer_wbinvl1
941; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
942; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
943; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
944; GFX8-NEXT:    s_cbranch_execnz .LBB3_1
945; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
946; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
947; GFX8-NEXT:    s_setpc_b64 s[30:31]
948;
949; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory:
950; GFX7:       ; %bb.0:
951; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
952; GFX7-NEXT:    v_mov_b32_e32 v1, v0
953; GFX7-NEXT:    v_mov_b32_e32 v0, s20
954; GFX7-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
955; GFX7-NEXT:    s_add_i32 s6, s20, 0x400
956; GFX7-NEXT:    s_mov_b64 s[4:5], 0
957; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v1
958; GFX7-NEXT:    v_mov_b32_e32 v3, s6
959; GFX7-NEXT:  .LBB3_1: ; %atomicrmw.start
960; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
961; GFX7-NEXT:    s_waitcnt vmcnt(0)
962; GFX7-NEXT:    v_mov_b32_e32 v5, v0
963; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v5
964; GFX7-NEXT:    v_min_f32_e32 v4, v0, v2
965; GFX7-NEXT:    v_mov_b32_e32 v0, v4
966; GFX7-NEXT:    v_mov_b32_e32 v1, v5
967; GFX7-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
968; GFX7-NEXT:    s_waitcnt vmcnt(0)
969; GFX7-NEXT:    buffer_wbinvl1
970; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
971; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
972; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
973; GFX7-NEXT:    s_cbranch_execnz .LBB3_1
974; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
975; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
976; GFX7-NEXT:    s_setpc_b64 s[30:31]
977;
978; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory:
979; GFX6:       ; %bb.0:
980; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
981; GFX6-NEXT:    v_mov_b32_e32 v1, v0
982; GFX6-NEXT:    v_mov_b32_e32 v0, s20
983; GFX6-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
984; GFX6-NEXT:    s_add_i32 s6, s20, 0x400
985; GFX6-NEXT:    s_mov_b64 s[4:5], 0
986; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v1
987; GFX6-NEXT:    v_mov_b32_e32 v3, s6
988; GFX6-NEXT:  .LBB3_1: ; %atomicrmw.start
989; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
990; GFX6-NEXT:    s_waitcnt vmcnt(0)
991; GFX6-NEXT:    v_mov_b32_e32 v5, v0
992; GFX6-NEXT:    s_waitcnt expcnt(0)
993; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v5
994; GFX6-NEXT:    v_min_f32_e32 v4, v0, v2
995; GFX6-NEXT:    v_mov_b32_e32 v0, v4
996; GFX6-NEXT:    v_mov_b32_e32 v1, v5
997; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
998; GFX6-NEXT:    s_waitcnt vmcnt(0)
999; GFX6-NEXT:    buffer_wbinvl1
1000; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
1001; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1002; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1003; GFX6-NEXT:    s_cbranch_execnz .LBB3_1
1004; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
1005; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
1006; GFX6-NEXT:    s_waitcnt expcnt(0)
1007; GFX6-NEXT:    s_setpc_b64 s[30:31]
1008  %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
1009  %result = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
1010  ret float %result
1011}
1012
; Value-returning f32 `atomicrmw fmin` (syncscope "agent", seq_cst) through a
; buffer fat pointer (ptr addrspace(7)) at element index 256, annotated with
; both !amdgpu.no.fine.grained.memory and !amdgpu.no.remote.memory.
;
; What the checks below pin down:
;   * a single native float-min buffer atomic for the GFX12 prefix
;     (buffer_atomic_min_num_f32), the GFX11 prefix (buffer_atomic_min_f32)
;     and the GFX10, GFX7 and GFX6 prefixes (buffer_atomic_fmin);
;   * an initial buffer load followed by a buffer_atomic_cmpswap retry loop
;     (.LBB4_1) for the GFX940, GFX90A, GFX908 and GFX8 prefixes.
; The getelementptr of 256 floats appears in the output as the 1024-byte
; (0x400) offset.
;
; These assertions are autogenerated by utils/update_llc_test_checks.py (see
; the first line of this file); regenerate them instead of editing by hand.
1013define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, float %val) #0 {
1014; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
1015; GFX12:       ; %bb.0:
1016; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1017; GFX12-NEXT:    s_wait_expcnt 0x0
1018; GFX12-NEXT:    s_wait_samplecnt 0x0
1019; GFX12-NEXT:    s_wait_bvhcnt 0x0
1020; GFX12-NEXT:    s_wait_kmcnt 0x0
1021; GFX12-NEXT:    v_mov_b32_e32 v1, s16
1022; GFX12-NEXT:    s_wait_storecnt 0x0
1023; GFX12-NEXT:    buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
1024; GFX12-NEXT:    s_wait_loadcnt 0x0
1025; GFX12-NEXT:    global_inv scope:SCOPE_DEV
1026; GFX12-NEXT:    s_setpc_b64 s[30:31]
1027;
1028; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
1029; GFX940:       ; %bb.0:
1030; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1031; GFX940-NEXT:    v_mov_b32_e32 v1, v0
1032; GFX940-NEXT:    v_mov_b32_e32 v0, s16
1033; GFX940-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
1034; GFX940-NEXT:    s_add_i32 s6, s16, 0x400
1035; GFX940-NEXT:    s_mov_b64 s[4:5], 0
1036; GFX940-NEXT:    v_max_f32_e32 v2, v1, v1
1037; GFX940-NEXT:    v_mov_b32_e32 v3, s6
1038; GFX940-NEXT:  .LBB4_1: ; %atomicrmw.start
1039; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
1040; GFX940-NEXT:    s_waitcnt vmcnt(0)
1041; GFX940-NEXT:    v_mov_b32_e32 v5, v0
1042; GFX940-NEXT:    v_max_f32_e32 v0, v5, v5
1043; GFX940-NEXT:    v_min_f32_e32 v4, v0, v2
1044; GFX940-NEXT:    v_mov_b64_e32 v[0:1], v[4:5]
1045; GFX940-NEXT:    buffer_wbl2 sc1
1046; GFX940-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
1047; GFX940-NEXT:    s_waitcnt vmcnt(0)
1048; GFX940-NEXT:    buffer_inv sc1
1049; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
1050; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1051; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1052; GFX940-NEXT:    s_cbranch_execnz .LBB4_1
1053; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
1054; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
1055; GFX940-NEXT:    s_setpc_b64 s[30:31]
1056;
1057; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
1058; GFX11:       ; %bb.0:
1059; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1060; GFX11-NEXT:    v_mov_b32_e32 v1, s16
1061; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1062; GFX11-NEXT:    buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen offset:1024 glc
1063; GFX11-NEXT:    s_waitcnt vmcnt(0)
1064; GFX11-NEXT:    buffer_gl1_inv
1065; GFX11-NEXT:    buffer_gl0_inv
1066; GFX11-NEXT:    s_setpc_b64 s[30:31]
1067;
1068; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
1069; GFX10:       ; %bb.0:
1070; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1071; GFX10-NEXT:    v_mov_b32_e32 v1, s20
1072; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1073; GFX10-NEXT:    buffer_atomic_fmin v0, v1, s[16:19], 0 offen offset:1024 glc
1074; GFX10-NEXT:    s_waitcnt vmcnt(0)
1075; GFX10-NEXT:    buffer_gl1_inv
1076; GFX10-NEXT:    buffer_gl0_inv
1077; GFX10-NEXT:    s_setpc_b64 s[30:31]
1078;
1079; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
1080; GFX90A:       ; %bb.0:
1081; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1082; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
1083; GFX90A-NEXT:    v_mov_b32_e32 v0, s20
1084; GFX90A-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
1085; GFX90A-NEXT:    s_add_i32 s6, s20, 0x400
1086; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
1087; GFX90A-NEXT:    v_max_f32_e32 v2, v1, v1
1088; GFX90A-NEXT:    v_mov_b32_e32 v3, s6
1089; GFX90A-NEXT:  .LBB4_1: ; %atomicrmw.start
1090; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
1091; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1092; GFX90A-NEXT:    v_mov_b32_e32 v5, v0
1093; GFX90A-NEXT:    v_max_f32_e32 v0, v5, v5
1094; GFX90A-NEXT:    v_min_f32_e32 v4, v0, v2
1095; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
1096; GFX90A-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
1097; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1098; GFX90A-NEXT:    buffer_wbinvl1
1099; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
1100; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1101; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1102; GFX90A-NEXT:    s_cbranch_execnz .LBB4_1
1103; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
1104; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
1105; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1106;
1107; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
1108; GFX908:       ; %bb.0:
1109; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1110; GFX908-NEXT:    v_mov_b32_e32 v1, v0
1111; GFX908-NEXT:    v_mov_b32_e32 v0, s20
1112; GFX908-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
1113; GFX908-NEXT:    s_add_i32 s6, s20, 0x400
1114; GFX908-NEXT:    s_mov_b64 s[4:5], 0
1115; GFX908-NEXT:    v_max_f32_e32 v2, v1, v1
1116; GFX908-NEXT:    v_mov_b32_e32 v3, s6
1117; GFX908-NEXT:  .LBB4_1: ; %atomicrmw.start
1118; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
1119; GFX908-NEXT:    s_waitcnt vmcnt(0)
1120; GFX908-NEXT:    v_mov_b32_e32 v5, v0
1121; GFX908-NEXT:    v_max_f32_e32 v0, v5, v5
1122; GFX908-NEXT:    v_min_f32_e32 v4, v0, v2
1123; GFX908-NEXT:    v_mov_b32_e32 v0, v4
1124; GFX908-NEXT:    v_mov_b32_e32 v1, v5
1125; GFX908-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
1126; GFX908-NEXT:    s_waitcnt vmcnt(0)
1127; GFX908-NEXT:    buffer_wbinvl1
1128; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
1129; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1130; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1131; GFX908-NEXT:    s_cbranch_execnz .LBB4_1
1132; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
1133; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
1134; GFX908-NEXT:    s_setpc_b64 s[30:31]
1135;
1136; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
1137; GFX8:       ; %bb.0:
1138; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1139; GFX8-NEXT:    v_mov_b32_e32 v1, v0
1140; GFX8-NEXT:    v_mov_b32_e32 v0, s20
1141; GFX8-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
1142; GFX8-NEXT:    s_add_i32 s6, s20, 0x400
1143; GFX8-NEXT:    s_mov_b64 s[4:5], 0
1144; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v1
1145; GFX8-NEXT:    v_mov_b32_e32 v3, s6
1146; GFX8-NEXT:  .LBB4_1: ; %atomicrmw.start
1147; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
1148; GFX8-NEXT:    s_waitcnt vmcnt(0)
1149; GFX8-NEXT:    v_mov_b32_e32 v5, v0
1150; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v5
1151; GFX8-NEXT:    v_min_f32_e32 v4, v0, v2
1152; GFX8-NEXT:    v_mov_b32_e32 v0, v4
1153; GFX8-NEXT:    v_mov_b32_e32 v1, v5
1154; GFX8-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
1155; GFX8-NEXT:    s_waitcnt vmcnt(0)
1156; GFX8-NEXT:    buffer_wbinvl1
1157; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
1158; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1159; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1160; GFX8-NEXT:    s_cbranch_execnz .LBB4_1
1161; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
1162; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1163; GFX8-NEXT:    s_setpc_b64 s[30:31]
1164;
1165; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
1166; GFX7:       ; %bb.0:
1167; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1168; GFX7-NEXT:    v_mov_b32_e32 v1, s20
1169; GFX7-NEXT:    buffer_atomic_fmin v0, v1, s[16:19], 0 offen offset:1024 glc
1170; GFX7-NEXT:    s_waitcnt vmcnt(0)
1171; GFX7-NEXT:    buffer_wbinvl1
1172; GFX7-NEXT:    s_setpc_b64 s[30:31]
1173;
1174; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
1175; GFX6:       ; %bb.0:
1176; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1177; GFX6-NEXT:    v_mov_b32_e32 v1, s20
1178; GFX6-NEXT:    buffer_atomic_fmin v0, v1, s[16:19], 0 offen offset:1024 glc
1179; GFX6-NEXT:    s_waitcnt vmcnt(0)
1180; GFX6-NEXT:    buffer_wbinvl1
1181; GFX6-NEXT:    s_waitcnt expcnt(0)
1182; GFX6-NEXT:    s_setpc_b64 s[30:31]
1183  %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
1184  %result = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
1185  ret float %result
1186}
1187
1188; --------------------------------------------------------------------
1189; double
1190; --------------------------------------------------------------------
1191
1192define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, double %val) #0 {
1193; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory:
1194; GFX12:       ; %bb.0:
1195; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1196; GFX12-NEXT:    s_wait_expcnt 0x0
1197; GFX12-NEXT:    s_wait_samplecnt 0x0
1198; GFX12-NEXT:    s_wait_bvhcnt 0x0
1199; GFX12-NEXT:    s_wait_kmcnt 0x0
1200; GFX12-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
1201; GFX12-NEXT:    v_mov_b32_e32 v0, s16
1202; GFX12-NEXT:    s_add_co_i32 s4, s16, 0x800
1203; GFX12-NEXT:    s_wait_alu 0xfffe
1204; GFX12-NEXT:    v_mov_b32_e32 v6, s4
1205; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
1206; GFX12-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
1207; GFX12-NEXT:    s_mov_b32 s4, 0
1208; GFX12-NEXT:  .LBB5_1: ; %atomicrmw.start
1209; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
1210; GFX12-NEXT:    s_wait_loadcnt 0x0
1211; GFX12-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
1212; GFX12-NEXT:    s_wait_storecnt 0x0
1213; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1214; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
1215; GFX12-NEXT:    v_min_num_f64_e32 v[7:8], v[0:1], v[4:5]
1216; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1217; GFX12-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
1218; GFX12-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
1219; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
1220; GFX12-NEXT:    s_wait_loadcnt 0x0
1221; GFX12-NEXT:    global_inv scope:SCOPE_DEV
1222; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
1223; GFX12-NEXT:    s_wait_alu 0xfffe
1224; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
1225; GFX12-NEXT:    s_wait_alu 0xfffe
1226; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
1227; GFX12-NEXT:    s_cbranch_execnz .LBB5_1
1228; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
1229; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1230; GFX12-NEXT:    s_wait_alu 0xfffe
1231; GFX12-NEXT:    s_setpc_b64 s[30:31]
1232;
1233; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory:
1234; GFX940:       ; %bb.0:
1235; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1236; GFX940-NEXT:    v_mov_b32_e32 v2, s16
1237; GFX940-NEXT:    buffer_wbl2 sc1
1238; GFX940-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0
1239; GFX940-NEXT:    s_waitcnt vmcnt(0)
1240; GFX940-NEXT:    buffer_inv sc1
1241; GFX940-NEXT:    s_setpc_b64 s[30:31]
1242;
1243; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory:
1244; GFX11:       ; %bb.0:
1245; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1246; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
1247; GFX11-NEXT:    v_mov_b32_e32 v0, s16
1248; GFX11-NEXT:    s_add_i32 s4, s16, 0x800
1249; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
1250; GFX11-NEXT:    v_mov_b32_e32 v6, s4
1251; GFX11-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
1252; GFX11-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
1253; GFX11-NEXT:    s_mov_b32 s4, 0
1254; GFX11-NEXT:  .LBB5_1: ; %atomicrmw.start
1255; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
1256; GFX11-NEXT:    s_waitcnt vmcnt(0)
1257; GFX11-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
1258; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1259; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1260; GFX11-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
1261; GFX11-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
1262; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1263; GFX11-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
1264; GFX11-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
1265; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
1266; GFX11-NEXT:    s_waitcnt vmcnt(0)
1267; GFX11-NEXT:    buffer_gl1_inv
1268; GFX11-NEXT:    buffer_gl0_inv
1269; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
1270; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
1271; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1272; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
1273; GFX11-NEXT:    s_cbranch_execnz .LBB5_1
1274; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
1275; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1276; GFX11-NEXT:    s_setpc_b64 s[30:31]
1277;
1278; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory:
1279; GFX10:       ; %bb.0:
1280; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1281; GFX10-NEXT:    v_mov_b32_e32 v2, s20
1282; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1283; GFX10-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[16:19], 0 offen offset:2048 glc
1284; GFX10-NEXT:    s_waitcnt vmcnt(0)
1285; GFX10-NEXT:    buffer_gl1_inv
1286; GFX10-NEXT:    buffer_gl0_inv
1287; GFX10-NEXT:    s_setpc_b64 s[30:31]
1288;
1289; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory:
1290; GFX90A:       ; %bb.0:
1291; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1292; GFX90A-NEXT:    v_mov_b32_e32 v2, s20
1293; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[16:19], 0 offen offset:2048 glc
1294; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1295; GFX90A-NEXT:    buffer_wbinvl1
1296; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1297;
1298; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory:
1299; GFX908:       ; %bb.0:
1300; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1301; GFX908-NEXT:    v_mov_b32_e32 v2, v0
1302; GFX908-NEXT:    v_mov_b32_e32 v0, s20
1303; GFX908-NEXT:    v_mov_b32_e32 v3, v1
1304; GFX908-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
1305; GFX908-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
1306; GFX908-NEXT:    s_add_i32 s6, s20, 0x800
1307; GFX908-NEXT:    s_mov_b64 s[4:5], 0
1308; GFX908-NEXT:    v_mov_b32_e32 v6, s6
1309; GFX908-NEXT:  .LBB5_1: ; %atomicrmw.start
1310; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
1311; GFX908-NEXT:    s_waitcnt vmcnt(0)
1312; GFX908-NEXT:    v_mov_b32_e32 v10, v1
1313; GFX908-NEXT:    v_mov_b32_e32 v9, v0
1314; GFX908-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
1315; GFX908-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
1316; GFX908-NEXT:    v_mov_b32_e32 v0, v7
1317; GFX908-NEXT:    v_mov_b32_e32 v1, v8
1318; GFX908-NEXT:    v_mov_b32_e32 v2, v9
1319; GFX908-NEXT:    v_mov_b32_e32 v3, v10
1320; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
1321; GFX908-NEXT:    s_waitcnt vmcnt(0)
1322; GFX908-NEXT:    buffer_wbinvl1
1323; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
1324; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1325; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1326; GFX908-NEXT:    s_cbranch_execnz .LBB5_1
1327; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
1328; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
1329; GFX908-NEXT:    s_setpc_b64 s[30:31]
1330;
1331; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory:
1332; GFX8:       ; %bb.0:
1333; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1334; GFX8-NEXT:    v_mov_b32_e32 v2, v0
1335; GFX8-NEXT:    v_mov_b32_e32 v0, s20
1336; GFX8-NEXT:    v_mov_b32_e32 v3, v1
1337; GFX8-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
1338; GFX8-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
1339; GFX8-NEXT:    s_add_i32 s6, s20, 0x800
1340; GFX8-NEXT:    s_mov_b64 s[4:5], 0
1341; GFX8-NEXT:    v_mov_b32_e32 v6, s6
1342; GFX8-NEXT:  .LBB5_1: ; %atomicrmw.start
1343; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
1344; GFX8-NEXT:    s_waitcnt vmcnt(0)
1345; GFX8-NEXT:    v_mov_b32_e32 v10, v1
1346; GFX8-NEXT:    v_mov_b32_e32 v9, v0
1347; GFX8-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
1348; GFX8-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
1349; GFX8-NEXT:    v_mov_b32_e32 v0, v7
1350; GFX8-NEXT:    v_mov_b32_e32 v1, v8
1351; GFX8-NEXT:    v_mov_b32_e32 v2, v9
1352; GFX8-NEXT:    v_mov_b32_e32 v3, v10
1353; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
1354; GFX8-NEXT:    s_waitcnt vmcnt(0)
1355; GFX8-NEXT:    buffer_wbinvl1
1356; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
1357; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1358; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1359; GFX8-NEXT:    s_cbranch_execnz .LBB5_1
1360; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
1361; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1362; GFX8-NEXT:    s_setpc_b64 s[30:31]
1363;
1364; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory:
1365; GFX7:       ; %bb.0:
1366; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1367; GFX7-NEXT:    v_mov_b32_e32 v2, s20
1368; GFX7-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[16:19], 0 offen offset:2048 glc
1369; GFX7-NEXT:    s_waitcnt vmcnt(0)
1370; GFX7-NEXT:    buffer_wbinvl1
1371; GFX7-NEXT:    s_setpc_b64 s[30:31]
1372;
1373; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory:
1374; GFX6:       ; %bb.0:
1375; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1376; GFX6-NEXT:    v_mov_b32_e32 v2, s20
1377; GFX6-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[16:19], 0 offen offset:2048 glc
1378; GFX6-NEXT:    s_waitcnt vmcnt(0)
1379; GFX6-NEXT:    buffer_wbinvl1
1380; GFX6-NEXT:    s_waitcnt expcnt(0)
1381; GFX6-NEXT:    s_setpc_b64 s[30:31]
1382  %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
1383  %result = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
1384  ret double %result
1385}
1386
; Agent-scope, seq_cst atomicrmw fmin on a double through a buffer fat pointer
; (ptr addrspace(7)) passed inreg. The old value is not used and the access is
; tagged !amdgpu.no.fine.grained.memory. Per the checks below, gfx940, gfx1010,
; gfx90a, hawaii and tahiti select a single buffer f64 min atomic, while
; gfx1200, gfx1100, gfx908 and tonga expand to a buffer load followed by a
; 64-bit buffer cmpswap retry loop (%atomicrmw.start / %atomicrmw.end).
1387define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, double %val) #0 {
1388; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory:
1389; GFX12:       ; %bb.0:
1390; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1391; GFX12-NEXT:    s_wait_expcnt 0x0
1392; GFX12-NEXT:    s_wait_samplecnt 0x0
1393; GFX12-NEXT:    s_wait_bvhcnt 0x0
1394; GFX12-NEXT:    s_wait_kmcnt 0x0
1395; GFX12-NEXT:    v_mov_b32_e32 v2, s16
1396; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
1397; GFX12-NEXT:    s_add_co_i32 s4, s16, 0x800
1398; GFX12-NEXT:    s_wait_alu 0xfffe
1399; GFX12-NEXT:    v_mov_b32_e32 v6, s4
1400; GFX12-NEXT:    buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048
1401; GFX12-NEXT:    s_mov_b32 s4, 0
1402; GFX12-NEXT:  .LBB6_1: ; %atomicrmw.start
1403; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
1404; GFX12-NEXT:    s_wait_loadcnt 0x0
1405; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
1406; GFX12-NEXT:    s_wait_storecnt 0x0
1407; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1408; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[4:5]
1409; GFX12-NEXT:    v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
1410; GFX12-NEXT:    v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
1411; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
1412; GFX12-NEXT:    s_wait_loadcnt 0x0
1413; GFX12-NEXT:    global_inv scope:SCOPE_DEV
1414; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
1415; GFX12-NEXT:    v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
1416; GFX12-NEXT:    s_wait_alu 0xfffe
1417; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
1418; GFX12-NEXT:    s_wait_alu 0xfffe
1419; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
1420; GFX12-NEXT:    s_cbranch_execnz .LBB6_1
1421; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
1422; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1423; GFX12-NEXT:    s_wait_alu 0xfffe
1424; GFX12-NEXT:    s_setpc_b64 s[30:31]
1425;
1426; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory:
1427; GFX940:       ; %bb.0:
1428; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1429; GFX940-NEXT:    v_mov_b32_e32 v2, s16
1430; GFX940-NEXT:    buffer_wbl2 sc1
1431; GFX940-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048
1432; GFX940-NEXT:    s_waitcnt vmcnt(0)
1433; GFX940-NEXT:    buffer_inv sc1
1434; GFX940-NEXT:    s_setpc_b64 s[30:31]
1435;
1436; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory:
1437; GFX11:       ; %bb.0:
1438; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1439; GFX11-NEXT:    v_mov_b32_e32 v2, s16
1440; GFX11-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
1441; GFX11-NEXT:    s_add_i32 s4, s16, 0x800
1442; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1443; GFX11-NEXT:    v_mov_b32_e32 v6, s4
1444; GFX11-NEXT:    buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048
1445; GFX11-NEXT:    s_mov_b32 s4, 0
1446; GFX11-NEXT:  .LBB6_1: ; %atomicrmw.start
1447; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
1448; GFX11-NEXT:    s_waitcnt vmcnt(0)
1449; GFX11-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
1450; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1451; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1452; GFX11-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
1453; GFX11-NEXT:    v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
1454; GFX11-NEXT:    v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
1455; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc
1456; GFX11-NEXT:    s_waitcnt vmcnt(0)
1457; GFX11-NEXT:    buffer_gl1_inv
1458; GFX11-NEXT:    buffer_gl0_inv
1459; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
1460; GFX11-NEXT:    v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
1461; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
1462; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1463; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
1464; GFX11-NEXT:    s_cbranch_execnz .LBB6_1
1465; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
1466; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1467; GFX11-NEXT:    s_setpc_b64 s[30:31]
1468;
1469; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory:
1470; GFX10:       ; %bb.0:
1471; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1472; GFX10-NEXT:    v_mov_b32_e32 v2, s20
1473; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1474; GFX10-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[16:19], 0 offen offset:2048
1475; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1476; GFX10-NEXT:    buffer_gl1_inv
1477; GFX10-NEXT:    buffer_gl0_inv
1478; GFX10-NEXT:    s_setpc_b64 s[30:31]
1479;
1480; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory:
1481; GFX90A:       ; %bb.0:
1482; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1483; GFX90A-NEXT:    v_mov_b32_e32 v2, s20
1484; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[16:19], 0 offen offset:2048
1485; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1486; GFX90A-NEXT:    buffer_wbinvl1
1487; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1488;
1489; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory:
1490; GFX908:       ; %bb.0:
1491; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1492; GFX908-NEXT:    v_mov_b32_e32 v2, s20
1493; GFX908-NEXT:    buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048
1494; GFX908-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
1495; GFX908-NEXT:    s_add_i32 s6, s20, 0x800
1496; GFX908-NEXT:    s_mov_b64 s[4:5], 0
1497; GFX908-NEXT:    v_mov_b32_e32 v6, s6
1498; GFX908-NEXT:  .LBB6_1: ; %atomicrmw.start
1499; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
1500; GFX908-NEXT:    s_waitcnt vmcnt(0)
1501; GFX908-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
1502; GFX908-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
1503; GFX908-NEXT:    v_mov_b32_e32 v10, v3
1504; GFX908-NEXT:    v_mov_b32_e32 v9, v2
1505; GFX908-NEXT:    v_mov_b32_e32 v8, v1
1506; GFX908-NEXT:    v_mov_b32_e32 v7, v0
1507; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
1508; GFX908-NEXT:    s_waitcnt vmcnt(0)
1509; GFX908-NEXT:    buffer_wbinvl1
1510; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
1511; GFX908-NEXT:    v_mov_b32_e32 v2, v7
1512; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1513; GFX908-NEXT:    v_mov_b32_e32 v3, v8
1514; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1515; GFX908-NEXT:    s_cbranch_execnz .LBB6_1
1516; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
1517; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
1518; GFX908-NEXT:    s_setpc_b64 s[30:31]
1519;
1520; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory:
1521; GFX8:       ; %bb.0:
1522; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1523; GFX8-NEXT:    v_mov_b32_e32 v2, s20
1524; GFX8-NEXT:    buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048
1525; GFX8-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
1526; GFX8-NEXT:    s_add_i32 s6, s20, 0x800
1527; GFX8-NEXT:    s_mov_b64 s[4:5], 0
1528; GFX8-NEXT:    v_mov_b32_e32 v6, s6
1529; GFX8-NEXT:  .LBB6_1: ; %atomicrmw.start
1530; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
1531; GFX8-NEXT:    s_waitcnt vmcnt(0)
1532; GFX8-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
1533; GFX8-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
1534; GFX8-NEXT:    v_mov_b32_e32 v10, v3
1535; GFX8-NEXT:    v_mov_b32_e32 v9, v2
1536; GFX8-NEXT:    v_mov_b32_e32 v8, v1
1537; GFX8-NEXT:    v_mov_b32_e32 v7, v0
1538; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
1539; GFX8-NEXT:    s_waitcnt vmcnt(0)
1540; GFX8-NEXT:    buffer_wbinvl1
1541; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
1542; GFX8-NEXT:    v_mov_b32_e32 v2, v7
1543; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1544; GFX8-NEXT:    v_mov_b32_e32 v3, v8
1545; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1546; GFX8-NEXT:    s_cbranch_execnz .LBB6_1
1547; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
1548; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1549; GFX8-NEXT:    s_setpc_b64 s[30:31]
1550;
1551; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory:
1552; GFX7:       ; %bb.0:
1553; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1554; GFX7-NEXT:    v_mov_b32_e32 v2, s20
1555; GFX7-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[16:19], 0 offen offset:2048
1556; GFX7-NEXT:    s_waitcnt vmcnt(0)
1557; GFX7-NEXT:    buffer_wbinvl1
1558; GFX7-NEXT:    s_setpc_b64 s[30:31]
1559;
1560; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory:
1561; GFX6:       ; %bb.0:
1562; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1563; GFX6-NEXT:    v_mov_b32_e32 v2, s20
1564; GFX6-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[16:19], 0 offen offset:2048
1565; GFX6-NEXT:    s_waitcnt vmcnt(0)
1566; GFX6-NEXT:    buffer_wbinvl1
1567; GFX6-NEXT:    s_waitcnt expcnt(0)
1568; GFX6-NEXT:    s_setpc_b64 s[30:31]
  ; Element 256 of a double array is byte offset 256 * 8 = 2048 (0x800); this
  ; is the offset that appears in the expected instructions above.
1569  %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
  ; The value produced by the atomicrmw is never read and the function returns
  ; void, so only the memory side effect is observable here.
1570  %unused = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
1571  ret void
1572}
1573
1574define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, double %val) #0 {
1575; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
1576; GFX12:       ; %bb.0:
1577; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1578; GFX12-NEXT:    s_wait_expcnt 0x0
1579; GFX12-NEXT:    s_wait_samplecnt 0x0
1580; GFX12-NEXT:    s_wait_bvhcnt 0x0
1581; GFX12-NEXT:    s_wait_kmcnt 0x0
1582; GFX12-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
1583; GFX12-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
1584; GFX12-NEXT:    v_add_nc_u32_e32 v15, 0x800, v4
1585; GFX12-NEXT:    s_mov_b32 s1, exec_lo
1586; GFX12-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
1587; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
1588; GFX12-NEXT:    v_readfirstlane_b32 s4, v9
1589; GFX12-NEXT:    v_readfirstlane_b32 s5, v10
1590; GFX12-NEXT:    v_readfirstlane_b32 s6, v7
1591; GFX12-NEXT:    v_readfirstlane_b32 s7, v8
1592; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
1593; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
1594; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
1595; GFX12-NEXT:    s_wait_alu 0xfffe
1596; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1597; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
1598; GFX12-NEXT:    s_wait_alu 0xfffe
1599; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
1600; GFX12-NEXT:    s_wait_loadcnt 0x0
1601; GFX12-NEXT:    buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048
1602; GFX12-NEXT:    ; implicit-def: $vgpr4
1603; GFX12-NEXT:    s_wait_alu 0xfffe
1604; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
1605; GFX12-NEXT:    s_cbranch_execnz .LBB7_1
1606; GFX12-NEXT:  ; %bb.2:
1607; GFX12-NEXT:    s_mov_b32 exec_lo, s1
1608; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[5:6], v[5:6]
1609; GFX12-NEXT:    s_mov_b32 s1, 0
1610; GFX12-NEXT:  .LBB7_3: ; %atomicrmw.start
1611; GFX12-NEXT:    ; =>This Loop Header: Depth=1
1612; GFX12-NEXT:    ; Child Loop BB7_4 Depth 2
1613; GFX12-NEXT:    s_wait_loadcnt 0x0
1614; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[13:14], v[13:14]
1615; GFX12-NEXT:    s_mov_b32 s2, exec_lo
1616; GFX12-NEXT:    s_wait_storecnt 0x0
1617; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1618; GFX12-NEXT:    v_min_num_f64_e32 v[11:12], v[0:1], v[4:5]
1619; GFX12-NEXT:    v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
1620; GFX12-NEXT:    v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
1621; GFX12-NEXT:  .LBB7_4: ; Parent Loop BB7_3 Depth=1
1622; GFX12-NEXT:    ; => This Inner Loop Header: Depth=2
1623; GFX12-NEXT:    v_readfirstlane_b32 s4, v9
1624; GFX12-NEXT:    v_readfirstlane_b32 s5, v10
1625; GFX12-NEXT:    v_readfirstlane_b32 s6, v7
1626; GFX12-NEXT:    v_readfirstlane_b32 s7, v8
1627; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
1628; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
1629; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
1630; GFX12-NEXT:    s_wait_alu 0xfffe
1631; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1632; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
1633; GFX12-NEXT:    s_wait_alu 0xfffe
1634; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
1635; GFX12-NEXT:    s_wait_loadcnt 0x0
1636; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN
1637; GFX12-NEXT:    s_wait_alu 0xfffe
1638; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
1639; GFX12-NEXT:    s_cbranch_execnz .LBB7_4
1640; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB7_3 Depth=1
1641; GFX12-NEXT:    s_mov_b32 exec_lo, s2
1642; GFX12-NEXT:    s_wait_loadcnt 0x0
1643; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14]
1644; GFX12-NEXT:    v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0
1645; GFX12-NEXT:    global_inv scope:SCOPE_DEV
1646; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
1647; GFX12-NEXT:    s_wait_alu 0xfffe
1648; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
1649; GFX12-NEXT:    s_cbranch_execnz .LBB7_3
1650; GFX12-NEXT:  ; %bb.6: ; %atomicrmw.end
1651; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
1652; GFX12-NEXT:    s_wait_alu 0xfffe
1653; GFX12-NEXT:    s_setpc_b64 s[30:31]
1654;
1655; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
1656; GFX940:       ; %bb.0:
1657; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1658; GFX940-NEXT:    v_mov_b32_e32 v7, v6
1659; GFX940-NEXT:    v_mov_b32_e32 v6, v5
1660; GFX940-NEXT:    s_mov_b64 s[2:3], exec
1661; GFX940-NEXT:    buffer_wbl2 sc1
1662; GFX940-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
1663; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
1664; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
1665; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
1666; GFX940-NEXT:    v_readfirstlane_b32 s7, v3
1667; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
1668; GFX940-NEXT:    s_nop 0
1669; GFX940-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
1670; GFX940-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
1671; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
1672; GFX940-NEXT:    s_waitcnt vmcnt(0)
1673; GFX940-NEXT:    buffer_atomic_min_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0
1674; GFX940-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
1675; GFX940-NEXT:    ; implicit-def: $vgpr4
1676; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
1677; GFX940-NEXT:    s_cbranch_execnz .LBB7_1
1678; GFX940-NEXT:  ; %bb.2:
1679; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
1680; GFX940-NEXT:    s_waitcnt vmcnt(0)
1681; GFX940-NEXT:    v_mov_b32_e32 v0, v6
1682; GFX940-NEXT:    v_mov_b32_e32 v1, v7
1683; GFX940-NEXT:    buffer_inv sc1
1684; GFX940-NEXT:    s_setpc_b64 s[30:31]
1685;
1686; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
1687; GFX11:       ; %bb.0:
1688; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1689; GFX11-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
1690; GFX11-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
1691; GFX11-NEXT:    v_add_nc_u32_e32 v15, 0x800, v4
1692; GFX11-NEXT:    s_mov_b32 s1, 0
1693; GFX11-NEXT:    s_mov_b32 s2, exec_lo
1694; GFX11-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
1695; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
1696; GFX11-NEXT:    v_readfirstlane_b32 s4, v9
1697; GFX11-NEXT:    v_readfirstlane_b32 s5, v10
1698; GFX11-NEXT:    v_readfirstlane_b32 s6, v7
1699; GFX11-NEXT:    v_readfirstlane_b32 s7, v8
1700; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
1701; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
1702; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
1703; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
1704; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1705; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
1706; GFX11-NEXT:    buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048
1707; GFX11-NEXT:    ; implicit-def: $vgpr4
1708; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
1709; GFX11-NEXT:    s_cbranch_execnz .LBB7_1
1710; GFX11-NEXT:  ; %bb.2:
1711; GFX11-NEXT:    s_mov_b32 exec_lo, s2
1712; GFX11-NEXT:    v_max_f64 v[4:5], v[5:6], v[5:6]
1713; GFX11-NEXT:    .p2align 6
1714; GFX11-NEXT:  .LBB7_3: ; %atomicrmw.start
1715; GFX11-NEXT:    ; =>This Loop Header: Depth=1
1716; GFX11-NEXT:    ; Child Loop BB7_4 Depth 2
1717; GFX11-NEXT:    s_waitcnt vmcnt(0)
1718; GFX11-NEXT:    v_max_f64 v[0:1], v[13:14], v[13:14]
1719; GFX11-NEXT:    s_mov_b32 s2, exec_lo
1720; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1721; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1722; GFX11-NEXT:    v_min_f64 v[11:12], v[0:1], v[4:5]
1723; GFX11-NEXT:    v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
1724; GFX11-NEXT:    v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
1725; GFX11-NEXT:  .LBB7_4: ; Parent Loop BB7_3 Depth=1
1726; GFX11-NEXT:    ; => This Inner Loop Header: Depth=2
1727; GFX11-NEXT:    v_readfirstlane_b32 s4, v9
1728; GFX11-NEXT:    v_readfirstlane_b32 s5, v10
1729; GFX11-NEXT:    v_readfirstlane_b32 s6, v7
1730; GFX11-NEXT:    v_readfirstlane_b32 s7, v8
1731; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
1732; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
1733; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
1734; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1735; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
1736; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
1737; GFX11-NEXT:    s_waitcnt vmcnt(0)
1738; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc
1739; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
1740; GFX11-NEXT:    s_cbranch_execnz .LBB7_4
1741; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB7_3 Depth=1
1742; GFX11-NEXT:    s_mov_b32 exec_lo, s2
1743; GFX11-NEXT:    s_waitcnt vmcnt(0)
1744; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14]
1745; GFX11-NEXT:    v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0
1746; GFX11-NEXT:    buffer_gl1_inv
1747; GFX11-NEXT:    buffer_gl0_inv
1748; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
1749; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1750; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
1751; GFX11-NEXT:    s_cbranch_execnz .LBB7_3
1752; GFX11-NEXT:  ; %bb.6: ; %atomicrmw.end
1753; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
1754; GFX11-NEXT:    s_setpc_b64 s[30:31]
1755;
1756; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
1757; GFX10:       ; %bb.0:
1758; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1759; GFX10-NEXT:    s_mov_b32 s5, exec_lo
1760; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1761; GFX10-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
1762; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
1763; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
1764; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
1765; GFX10-NEXT:    v_readfirstlane_b32 s11, v3
1766; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
1767; GFX10-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
1768; GFX10-NEXT:    s_and_b32 s4, vcc_lo, s4
1769; GFX10-NEXT:    s_and_saveexec_b32 s4, s4
1770; GFX10-NEXT:    s_waitcnt vmcnt(0)
1771; GFX10-NEXT:    buffer_atomic_fmin_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc
1772; GFX10-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
1773; GFX10-NEXT:    ; implicit-def: $vgpr4
1774; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
1775; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
1776; GFX10-NEXT:    s_cbranch_execnz .LBB7_1
1777; GFX10-NEXT:  ; %bb.2:
1778; GFX10-NEXT:    s_mov_b32 exec_lo, s5
1779; GFX10-NEXT:    s_waitcnt vmcnt(0)
1780; GFX10-NEXT:    v_mov_b32_e32 v0, v5
1781; GFX10-NEXT:    v_mov_b32_e32 v1, v6
1782; GFX10-NEXT:    buffer_gl1_inv
1783; GFX10-NEXT:    buffer_gl0_inv
1784; GFX10-NEXT:    s_setpc_b64 s[30:31]
1785;
1786; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
1787; GFX90A:       ; %bb.0:
1788; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1789; GFX90A-NEXT:    v_mov_b32_e32 v7, v6
1790; GFX90A-NEXT:    v_mov_b32_e32 v6, v5
1791; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
1792; GFX90A-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
1793; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
1794; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
1795; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
1796; GFX90A-NEXT:    v_readfirstlane_b32 s11, v3
1797; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
1798; GFX90A-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
1799; GFX90A-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
1800; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
1801; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1802; GFX90A-NEXT:    buffer_atomic_min_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc
1803; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
1804; GFX90A-NEXT:    ; implicit-def: $vgpr4
1805; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
1806; GFX90A-NEXT:    s_cbranch_execnz .LBB7_1
1807; GFX90A-NEXT:  ; %bb.2:
1808; GFX90A-NEXT:    s_mov_b64 exec, s[6:7]
1809; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1810; GFX90A-NEXT:    v_mov_b32_e32 v0, v6
1811; GFX90A-NEXT:    v_mov_b32_e32 v1, v7
1812; GFX90A-NEXT:    buffer_wbinvl1
1813; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1814;
1815; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
1816; GFX908:       ; %bb.0:
1817; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1818; GFX908-NEXT:    v_mov_b32_e32 v8, v3
1819; GFX908-NEXT:    v_mov_b32_e32 v7, v2
1820; GFX908-NEXT:    v_mov_b32_e32 v10, v1
1821; GFX908-NEXT:    v_mov_b32_e32 v9, v0
1822; GFX908-NEXT:    v_add_u32_e32 v15, 0x800, v4
1823; GFX908-NEXT:    s_mov_b64 s[6:7], exec
1824; GFX908-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
1825; GFX908-NEXT:    v_readfirstlane_b32 s8, v9
1826; GFX908-NEXT:    v_readfirstlane_b32 s9, v10
1827; GFX908-NEXT:    v_readfirstlane_b32 s10, v7
1828; GFX908-NEXT:    v_readfirstlane_b32 s11, v8
1829; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
1830; GFX908-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
1831; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
1832; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
1833; GFX908-NEXT:    s_nop 0
1834; GFX908-NEXT:    buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
1835; GFX908-NEXT:    ; implicit-def: $vgpr4
1836; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
1837; GFX908-NEXT:    s_cbranch_execnz .LBB7_1
1838; GFX908-NEXT:  ; %bb.2:
1839; GFX908-NEXT:    s_mov_b64 exec, s[6:7]
1840; GFX908-NEXT:    v_max_f64 v[4:5], v[5:6], v[5:6]
1841; GFX908-NEXT:    s_mov_b64 s[6:7], 0
1842; GFX908-NEXT:  .LBB7_3: ; %atomicrmw.start
1843; GFX908-NEXT:    ; =>This Loop Header: Depth=1
1844; GFX908-NEXT:    ; Child Loop BB7_4 Depth 2
1845; GFX908-NEXT:    s_waitcnt vmcnt(0)
1846; GFX908-NEXT:    v_max_f64 v[0:1], v[13:14], v[13:14]
1847; GFX908-NEXT:    s_mov_b64 s[12:13], exec
1848; GFX908-NEXT:    v_min_f64 v[11:12], v[0:1], v[4:5]
1849; GFX908-NEXT:    v_mov_b32_e32 v0, v11
1850; GFX908-NEXT:    v_mov_b32_e32 v1, v12
1851; GFX908-NEXT:    v_mov_b32_e32 v2, v13
1852; GFX908-NEXT:    v_mov_b32_e32 v3, v14
1853; GFX908-NEXT:  .LBB7_4: ; Parent Loop BB7_3 Depth=1
1854; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
1855; GFX908-NEXT:    v_readfirstlane_b32 s8, v9
1856; GFX908-NEXT:    v_readfirstlane_b32 s9, v10
1857; GFX908-NEXT:    v_readfirstlane_b32 s10, v7
1858; GFX908-NEXT:    v_readfirstlane_b32 s11, v8
1859; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
1860; GFX908-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
1861; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
1862; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
1863; GFX908-NEXT:    s_waitcnt vmcnt(0)
1864; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
1865; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
1866; GFX908-NEXT:    s_cbranch_execnz .LBB7_4
1867; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB7_3 Depth=1
1868; GFX908-NEXT:    s_mov_b64 exec, s[12:13]
1869; GFX908-NEXT:    s_waitcnt vmcnt(0)
1870; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14]
1871; GFX908-NEXT:    v_mov_b32_e32 v14, v1
1872; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
1873; GFX908-NEXT:    v_mov_b32_e32 v13, v0
1874; GFX908-NEXT:    buffer_wbinvl1
1875; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
1876; GFX908-NEXT:    s_cbranch_execnz .LBB7_3
1877; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.end
1878; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
1879; GFX908-NEXT:    s_setpc_b64 s[30:31]
1880;
1881; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
1882; GFX8:       ; %bb.0:
1883; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1884; GFX8-NEXT:    v_mov_b32_e32 v8, v3
1885; GFX8-NEXT:    v_mov_b32_e32 v7, v2
1886; GFX8-NEXT:    v_mov_b32_e32 v10, v1
1887; GFX8-NEXT:    v_mov_b32_e32 v9, v0
1888; GFX8-NEXT:    v_add_u32_e32 v15, vcc, 0x800, v4
1889; GFX8-NEXT:    s_mov_b64 s[6:7], exec
1890; GFX8-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
1891; GFX8-NEXT:    v_readfirstlane_b32 s8, v9
1892; GFX8-NEXT:    v_readfirstlane_b32 s9, v10
1893; GFX8-NEXT:    v_readfirstlane_b32 s10, v7
1894; GFX8-NEXT:    v_readfirstlane_b32 s11, v8
1895; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
1896; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
1897; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
1898; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
1899; GFX8-NEXT:    s_nop 0
1900; GFX8-NEXT:    buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
1901; GFX8-NEXT:    ; implicit-def: $vgpr4
1902; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
1903; GFX8-NEXT:    s_cbranch_execnz .LBB7_1
1904; GFX8-NEXT:  ; %bb.2:
1905; GFX8-NEXT:    s_mov_b64 exec, s[6:7]
1906; GFX8-NEXT:    v_max_f64 v[4:5], v[5:6], v[5:6]
1907; GFX8-NEXT:    s_mov_b64 s[6:7], 0
1908; GFX8-NEXT:  .LBB7_3: ; %atomicrmw.start
1909; GFX8-NEXT:    ; =>This Loop Header: Depth=1
1910; GFX8-NEXT:    ; Child Loop BB7_4 Depth 2
1911; GFX8-NEXT:    s_waitcnt vmcnt(0)
1912; GFX8-NEXT:    v_max_f64 v[0:1], v[13:14], v[13:14]
1913; GFX8-NEXT:    s_mov_b64 s[12:13], exec
1914; GFX8-NEXT:    v_min_f64 v[11:12], v[0:1], v[4:5]
1915; GFX8-NEXT:    v_mov_b32_e32 v0, v11
1916; GFX8-NEXT:    v_mov_b32_e32 v1, v12
1917; GFX8-NEXT:    v_mov_b32_e32 v2, v13
1918; GFX8-NEXT:    v_mov_b32_e32 v3, v14
1919; GFX8-NEXT:  .LBB7_4: ; Parent Loop BB7_3 Depth=1
1920; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
1921; GFX8-NEXT:    v_readfirstlane_b32 s8, v9
1922; GFX8-NEXT:    v_readfirstlane_b32 s9, v10
1923; GFX8-NEXT:    v_readfirstlane_b32 s10, v7
1924; GFX8-NEXT:    v_readfirstlane_b32 s11, v8
1925; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
1926; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
1927; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
1928; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
1929; GFX8-NEXT:    s_waitcnt vmcnt(0)
1930; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
1931; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
1932; GFX8-NEXT:    s_cbranch_execnz .LBB7_4
1933; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB7_3 Depth=1
1934; GFX8-NEXT:    s_mov_b64 exec, s[12:13]
1935; GFX8-NEXT:    s_waitcnt vmcnt(0)
1936; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14]
1937; GFX8-NEXT:    v_mov_b32_e32 v14, v1
1938; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
1939; GFX8-NEXT:    v_mov_b32_e32 v13, v0
1940; GFX8-NEXT:    buffer_wbinvl1
1941; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
1942; GFX8-NEXT:    s_cbranch_execnz .LBB7_3
1943; GFX8-NEXT:  ; %bb.6: ; %atomicrmw.end
1944; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
1945; GFX8-NEXT:    s_setpc_b64 s[30:31]
1946;
1947; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
1948; GFX7:       ; %bb.0:
1949; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1950; GFX7-NEXT:    s_mov_b64 s[6:7], exec
1951; GFX7-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
1952; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
1953; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
1954; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
1955; GFX7-NEXT:    v_readfirstlane_b32 s11, v3
1956; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
1957; GFX7-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
1958; GFX7-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
1959; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
1960; GFX7-NEXT:    s_waitcnt vmcnt(0)
1961; GFX7-NEXT:    buffer_atomic_fmin_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc
1962; GFX7-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
1963; GFX7-NEXT:    ; implicit-def: $vgpr4
1964; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
1965; GFX7-NEXT:    s_cbranch_execnz .LBB7_1
1966; GFX7-NEXT:  ; %bb.2:
1967; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
1968; GFX7-NEXT:    s_waitcnt vmcnt(0)
1969; GFX7-NEXT:    v_mov_b32_e32 v0, v5
1970; GFX7-NEXT:    v_mov_b32_e32 v1, v6
1971; GFX7-NEXT:    buffer_wbinvl1
1972; GFX7-NEXT:    s_setpc_b64 s[30:31]
1973;
1974; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
1975; GFX6:       ; %bb.0:
1976; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1977; GFX6-NEXT:    s_mov_b64 s[6:7], exec
1978; GFX6-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
1979; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
1980; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
1981; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
1982; GFX6-NEXT:    v_readfirstlane_b32 s11, v3
1983; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
1984; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
1985; GFX6-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
1986; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
1987; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
1988; GFX6-NEXT:    buffer_atomic_fmin_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc
1989; GFX6-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
1990; GFX6-NEXT:    ; implicit-def: $vgpr4
1991; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
1992; GFX6-NEXT:    s_cbranch_execnz .LBB7_1
1993; GFX6-NEXT:  ; %bb.2:
1994; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
1995; GFX6-NEXT:    s_waitcnt vmcnt(0)
1996; GFX6-NEXT:    v_mov_b32_e32 v0, v5
1997; GFX6-NEXT:    v_mov_b32_e32 v1, v6
1998; GFX6-NEXT:    buffer_wbinvl1
1999; GFX6-NEXT:    s_waitcnt expcnt(0)
2000; GFX6-NEXT:    s_setpc_b64 s[30:31]
2001  %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
2002  %result = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
2003  ret double %result
2004}
2005
; Returned-value f64 fmin at agent scope (seq_cst) on a buffer fat pointer, tagged
; with !amdgpu.no.remote.memory only. Per the checks below, only the GFX940 run
; selects a native buffer_atomic_min_f64, while every other run expands the
; operation into a buffer load followed by a buffer_atomic_cmpswap retry loop
; (.LBB8_1).
2006define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, double %val) #0 {
2007; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory:
2008; GFX12:       ; %bb.0:
2009; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2010; GFX12-NEXT:    s_wait_expcnt 0x0
2011; GFX12-NEXT:    s_wait_samplecnt 0x0
2012; GFX12-NEXT:    s_wait_bvhcnt 0x0
2013; GFX12-NEXT:    s_wait_kmcnt 0x0
2014; GFX12-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
2015; GFX12-NEXT:    v_mov_b32_e32 v0, s16
2016; GFX12-NEXT:    s_add_co_i32 s4, s16, 0x800
2017; GFX12-NEXT:    s_wait_alu 0xfffe
2018; GFX12-NEXT:    v_mov_b32_e32 v6, s4
2019; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
2020; GFX12-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
2021; GFX12-NEXT:    s_mov_b32 s4, 0
2022; GFX12-NEXT:  .LBB8_1: ; %atomicrmw.start
2023; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
2024; GFX12-NEXT:    s_wait_loadcnt 0x0
2025; GFX12-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
2026; GFX12-NEXT:    s_wait_storecnt 0x0
2027; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2028; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
2029; GFX12-NEXT:    v_min_num_f64_e32 v[7:8], v[0:1], v[4:5]
2030; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2031; GFX12-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
2032; GFX12-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
2033; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
2034; GFX12-NEXT:    s_wait_loadcnt 0x0
2035; GFX12-NEXT:    global_inv scope:SCOPE_DEV
2036; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
2037; GFX12-NEXT:    s_wait_alu 0xfffe
2038; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
2039; GFX12-NEXT:    s_wait_alu 0xfffe
2040; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
2041; GFX12-NEXT:    s_cbranch_execnz .LBB8_1
2042; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
2043; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2044; GFX12-NEXT:    s_wait_alu 0xfffe
2045; GFX12-NEXT:    s_setpc_b64 s[30:31]
2046;
2047; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory:
2048; GFX940:       ; %bb.0:
2049; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2050; GFX940-NEXT:    v_mov_b32_e32 v2, s16
2051; GFX940-NEXT:    buffer_wbl2 sc1
2052; GFX940-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0
2053; GFX940-NEXT:    s_waitcnt vmcnt(0)
2054; GFX940-NEXT:    buffer_inv sc1
2055; GFX940-NEXT:    s_setpc_b64 s[30:31]
2056;
2057; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory:
2058; GFX11:       ; %bb.0:
2059; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2060; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
2061; GFX11-NEXT:    v_mov_b32_e32 v0, s16
2062; GFX11-NEXT:    s_add_i32 s4, s16, 0x800
2063; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
2064; GFX11-NEXT:    v_mov_b32_e32 v6, s4
2065; GFX11-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
2066; GFX11-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
2067; GFX11-NEXT:    s_mov_b32 s4, 0
2068; GFX11-NEXT:  .LBB8_1: ; %atomicrmw.start
2069; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
2070; GFX11-NEXT:    s_waitcnt vmcnt(0)
2071; GFX11-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
2072; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2073; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2074; GFX11-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
2075; GFX11-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
2076; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2077; GFX11-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
2078; GFX11-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
2079; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
2080; GFX11-NEXT:    s_waitcnt vmcnt(0)
2081; GFX11-NEXT:    buffer_gl1_inv
2082; GFX11-NEXT:    buffer_gl0_inv
2083; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
2084; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
2085; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2086; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
2087; GFX11-NEXT:    s_cbranch_execnz .LBB8_1
2088; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
2089; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2090; GFX11-NEXT:    s_setpc_b64 s[30:31]
2091;
2092; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory:
2093; GFX10:       ; %bb.0:
2094; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2095; GFX10-NEXT:    v_mov_b32_e32 v2, v0
2096; GFX10-NEXT:    v_mov_b32_e32 v0, s20
2097; GFX10-NEXT:    v_mov_b32_e32 v3, v1
2098; GFX10-NEXT:    s_add_i32 s4, s20, 0x800
2099; GFX10-NEXT:    v_mov_b32_e32 v6, s4
2100; GFX10-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
2101; GFX10-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
2102; GFX10-NEXT:    s_mov_b32 s4, 0
2103; GFX10-NEXT:  .LBB8_1: ; %atomicrmw.start
2104; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
2105; GFX10-NEXT:    s_waitcnt vmcnt(0)
2106; GFX10-NEXT:    v_mov_b32_e32 v10, v1
2107; GFX10-NEXT:    v_mov_b32_e32 v9, v0
2108; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2109; GFX10-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
2110; GFX10-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
2111; GFX10-NEXT:    v_mov_b32_e32 v0, v7
2112; GFX10-NEXT:    v_mov_b32_e32 v1, v8
2113; GFX10-NEXT:    v_mov_b32_e32 v2, v9
2114; GFX10-NEXT:    v_mov_b32_e32 v3, v10
2115; GFX10-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
2116; GFX10-NEXT:    s_waitcnt vmcnt(0)
2117; GFX10-NEXT:    buffer_gl1_inv
2118; GFX10-NEXT:    buffer_gl0_inv
2119; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
2120; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
2121; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
2122; GFX10-NEXT:    s_cbranch_execnz .LBB8_1
2123; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
2124; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2125; GFX10-NEXT:    s_setpc_b64 s[30:31]
2126;
2127; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory:
2128; GFX90A:       ; %bb.0:
2129; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2130; GFX90A-NEXT:    v_mov_b32_e32 v2, v0
2131; GFX90A-NEXT:    v_mov_b32_e32 v0, s20
2132; GFX90A-NEXT:    v_mov_b32_e32 v3, v1
2133; GFX90A-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
2134; GFX90A-NEXT:    s_add_i32 s6, s20, 0x800
2135; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
2136; GFX90A-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
2137; GFX90A-NEXT:    v_mov_b32_e32 v6, s6
2138; GFX90A-NEXT:  .LBB8_1: ; %atomicrmw.start
2139; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
2140; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2141; GFX90A-NEXT:    v_pk_mov_b32 v[10:11], v[0:1], v[0:1] op_sel:[0,1]
2142; GFX90A-NEXT:    v_max_f64 v[0:1], v[10:11], v[10:11]
2143; GFX90A-NEXT:    v_min_f64 v[8:9], v[0:1], v[4:5]
2144; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1]
2145; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1]
2146; GFX90A-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
2147; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2148; GFX90A-NEXT:    buffer_wbinvl1
2149; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
2150; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2151; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2152; GFX90A-NEXT:    s_cbranch_execnz .LBB8_1
2153; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
2154; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
2155; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2156;
2157; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory:
2158; GFX908:       ; %bb.0:
2159; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2160; GFX908-NEXT:    v_mov_b32_e32 v2, v0
2161; GFX908-NEXT:    v_mov_b32_e32 v0, s20
2162; GFX908-NEXT:    v_mov_b32_e32 v3, v1
2163; GFX908-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
2164; GFX908-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
2165; GFX908-NEXT:    s_add_i32 s6, s20, 0x800
2166; GFX908-NEXT:    s_mov_b64 s[4:5], 0
2167; GFX908-NEXT:    v_mov_b32_e32 v6, s6
2168; GFX908-NEXT:  .LBB8_1: ; %atomicrmw.start
2169; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
2170; GFX908-NEXT:    s_waitcnt vmcnt(0)
2171; GFX908-NEXT:    v_mov_b32_e32 v10, v1
2172; GFX908-NEXT:    v_mov_b32_e32 v9, v0
2173; GFX908-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
2174; GFX908-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
2175; GFX908-NEXT:    v_mov_b32_e32 v0, v7
2176; GFX908-NEXT:    v_mov_b32_e32 v1, v8
2177; GFX908-NEXT:    v_mov_b32_e32 v2, v9
2178; GFX908-NEXT:    v_mov_b32_e32 v3, v10
2179; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
2180; GFX908-NEXT:    s_waitcnt vmcnt(0)
2181; GFX908-NEXT:    buffer_wbinvl1
2182; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
2183; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2184; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2185; GFX908-NEXT:    s_cbranch_execnz .LBB8_1
2186; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
2187; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
2188; GFX908-NEXT:    s_setpc_b64 s[30:31]
2189;
2190; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory:
2191; GFX8:       ; %bb.0:
2192; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2193; GFX8-NEXT:    v_mov_b32_e32 v2, v0
2194; GFX8-NEXT:    v_mov_b32_e32 v0, s20
2195; GFX8-NEXT:    v_mov_b32_e32 v3, v1
2196; GFX8-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
2197; GFX8-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
2198; GFX8-NEXT:    s_add_i32 s6, s20, 0x800
2199; GFX8-NEXT:    s_mov_b64 s[4:5], 0
2200; GFX8-NEXT:    v_mov_b32_e32 v6, s6
2201; GFX8-NEXT:  .LBB8_1: ; %atomicrmw.start
2202; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
2203; GFX8-NEXT:    s_waitcnt vmcnt(0)
2204; GFX8-NEXT:    v_mov_b32_e32 v10, v1
2205; GFX8-NEXT:    v_mov_b32_e32 v9, v0
2206; GFX8-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
2207; GFX8-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
2208; GFX8-NEXT:    v_mov_b32_e32 v0, v7
2209; GFX8-NEXT:    v_mov_b32_e32 v1, v8
2210; GFX8-NEXT:    v_mov_b32_e32 v2, v9
2211; GFX8-NEXT:    v_mov_b32_e32 v3, v10
2212; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
2213; GFX8-NEXT:    s_waitcnt vmcnt(0)
2214; GFX8-NEXT:    buffer_wbinvl1
2215; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
2216; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2217; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2218; GFX8-NEXT:    s_cbranch_execnz .LBB8_1
2219; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
2220; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2221; GFX8-NEXT:    s_setpc_b64 s[30:31]
2222;
2223; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory:
2224; GFX7:       ; %bb.0:
2225; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2226; GFX7-NEXT:    v_mov_b32_e32 v2, v0
2227; GFX7-NEXT:    v_mov_b32_e32 v0, s20
2228; GFX7-NEXT:    v_mov_b32_e32 v3, v1
2229; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
2230; GFX7-NEXT:    s_add_i32 s6, s20, 0x800
2231; GFX7-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
2232; GFX7-NEXT:    s_mov_b64 s[4:5], 0
2233; GFX7-NEXT:    v_mov_b32_e32 v6, s6
2234; GFX7-NEXT:  .LBB8_1: ; %atomicrmw.start
2235; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
2236; GFX7-NEXT:    s_waitcnt vmcnt(0)
2237; GFX7-NEXT:    v_mov_b32_e32 v10, v1
2238; GFX7-NEXT:    v_mov_b32_e32 v9, v0
2239; GFX7-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
2240; GFX7-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
2241; GFX7-NEXT:    v_mov_b32_e32 v0, v7
2242; GFX7-NEXT:    v_mov_b32_e32 v1, v8
2243; GFX7-NEXT:    v_mov_b32_e32 v2, v9
2244; GFX7-NEXT:    v_mov_b32_e32 v3, v10
2245; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
2246; GFX7-NEXT:    s_waitcnt vmcnt(0)
2247; GFX7-NEXT:    buffer_wbinvl1
2248; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
2249; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2250; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2251; GFX7-NEXT:    s_cbranch_execnz .LBB8_1
2252; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
2253; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
2254; GFX7-NEXT:    s_setpc_b64 s[30:31]
2255;
2256; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory:
2257; GFX6:       ; %bb.0:
2258; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2259; GFX6-NEXT:    v_mov_b32_e32 v2, v0
2260; GFX6-NEXT:    v_mov_b32_e32 v0, s20
2261; GFX6-NEXT:    v_mov_b32_e32 v3, v1
2262; GFX6-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
2263; GFX6-NEXT:    s_add_i32 s6, s20, 0x800
2264; GFX6-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
2265; GFX6-NEXT:    s_mov_b64 s[4:5], 0
2266; GFX6-NEXT:    v_mov_b32_e32 v6, s6
2267; GFX6-NEXT:  .LBB8_1: ; %atomicrmw.start
2268; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
2269; GFX6-NEXT:    s_waitcnt vmcnt(0)
2270; GFX6-NEXT:    v_mov_b32_e32 v10, v1
2271; GFX6-NEXT:    v_mov_b32_e32 v9, v0
2272; GFX6-NEXT:    s_waitcnt expcnt(0)
2273; GFX6-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
2274; GFX6-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
2275; GFX6-NEXT:    v_mov_b32_e32 v0, v7
2276; GFX6-NEXT:    v_mov_b32_e32 v1, v8
2277; GFX6-NEXT:    v_mov_b32_e32 v2, v9
2278; GFX6-NEXT:    v_mov_b32_e32 v3, v10
2279; GFX6-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
2280; GFX6-NEXT:    s_waitcnt vmcnt(0)
2281; GFX6-NEXT:    buffer_wbinvl1
2282; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
2283; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2284; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2285; GFX6-NEXT:    s_cbranch_execnz .LBB8_1
2286; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
2287; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
2288; GFX6-NEXT:    s_waitcnt expcnt(0)
2289; GFX6-NEXT:    s_setpc_b64 s[30:31]
  ; Element index 256 of double is a 2048-byte displacement, which the checks
  ; above show either as a 2048 immediate offset or as a 0x800 scalar add.
2290  %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
2291  %result = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
2292  ret double %result
2293}
2294
; Returned-value f64 fmin at agent scope (seq_cst) on a buffer fat pointer, tagged
; with both !amdgpu.no.fine.grained.memory and !amdgpu.no.remote.memory. Per the
; checks below, the GFX940 and GFX90A runs select buffer_atomic_min_f64 and the
; GFX10, GFX7 and GFX6 runs select buffer_atomic_fmin_x2, while the GFX12, GFX11,
; GFX908 and GFX8 runs still expand to a buffer load followed by a
; buffer_atomic_cmpswap retry loop (.LBB9_1).
2295define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, double %val) #0 {
2296; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
2297; GFX12:       ; %bb.0:
2298; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2299; GFX12-NEXT:    s_wait_expcnt 0x0
2300; GFX12-NEXT:    s_wait_samplecnt 0x0
2301; GFX12-NEXT:    s_wait_bvhcnt 0x0
2302; GFX12-NEXT:    s_wait_kmcnt 0x0
2303; GFX12-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
2304; GFX12-NEXT:    v_mov_b32_e32 v0, s16
2305; GFX12-NEXT:    s_add_co_i32 s4, s16, 0x800
2306; GFX12-NEXT:    s_wait_alu 0xfffe
2307; GFX12-NEXT:    v_mov_b32_e32 v6, s4
2308; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
2309; GFX12-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
2310; GFX12-NEXT:    s_mov_b32 s4, 0
2311; GFX12-NEXT:  .LBB9_1: ; %atomicrmw.start
2312; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
2313; GFX12-NEXT:    s_wait_loadcnt 0x0
2314; GFX12-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
2315; GFX12-NEXT:    s_wait_storecnt 0x0
2316; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2317; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
2318; GFX12-NEXT:    v_min_num_f64_e32 v[7:8], v[0:1], v[4:5]
2319; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2320; GFX12-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
2321; GFX12-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
2322; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
2323; GFX12-NEXT:    s_wait_loadcnt 0x0
2324; GFX12-NEXT:    global_inv scope:SCOPE_DEV
2325; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
2326; GFX12-NEXT:    s_wait_alu 0xfffe
2327; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
2328; GFX12-NEXT:    s_wait_alu 0xfffe
2329; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
2330; GFX12-NEXT:    s_cbranch_execnz .LBB9_1
2331; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
2332; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2333; GFX12-NEXT:    s_wait_alu 0xfffe
2334; GFX12-NEXT:    s_setpc_b64 s[30:31]
2335;
2336; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
2337; GFX940:       ; %bb.0:
2338; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2339; GFX940-NEXT:    v_mov_b32_e32 v2, s16
2340; GFX940-NEXT:    buffer_wbl2 sc1
2341; GFX940-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0
2342; GFX940-NEXT:    s_waitcnt vmcnt(0)
2343; GFX940-NEXT:    buffer_inv sc1
2344; GFX940-NEXT:    s_setpc_b64 s[30:31]
2345;
2346; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
2347; GFX11:       ; %bb.0:
2348; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2349; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
2350; GFX11-NEXT:    v_mov_b32_e32 v0, s16
2351; GFX11-NEXT:    s_add_i32 s4, s16, 0x800
2352; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
2353; GFX11-NEXT:    v_mov_b32_e32 v6, s4
2354; GFX11-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
2355; GFX11-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
2356; GFX11-NEXT:    s_mov_b32 s4, 0
2357; GFX11-NEXT:  .LBB9_1: ; %atomicrmw.start
2358; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
2359; GFX11-NEXT:    s_waitcnt vmcnt(0)
2360; GFX11-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
2361; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2362; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2363; GFX11-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
2364; GFX11-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
2365; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2366; GFX11-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
2367; GFX11-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
2368; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
2369; GFX11-NEXT:    s_waitcnt vmcnt(0)
2370; GFX11-NEXT:    buffer_gl1_inv
2371; GFX11-NEXT:    buffer_gl0_inv
2372; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
2373; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
2374; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2375; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
2376; GFX11-NEXT:    s_cbranch_execnz .LBB9_1
2377; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
2378; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2379; GFX11-NEXT:    s_setpc_b64 s[30:31]
2380;
2381; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
2382; GFX10:       ; %bb.0:
2383; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2384; GFX10-NEXT:    v_mov_b32_e32 v2, s20
2385; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2386; GFX10-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[16:19], 0 offen offset:2048 glc
2387; GFX10-NEXT:    s_waitcnt vmcnt(0)
2388; GFX10-NEXT:    buffer_gl1_inv
2389; GFX10-NEXT:    buffer_gl0_inv
2390; GFX10-NEXT:    s_setpc_b64 s[30:31]
2391;
2392; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
2393; GFX90A:       ; %bb.0:
2394; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2395; GFX90A-NEXT:    v_mov_b32_e32 v2, s20
2396; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[16:19], 0 offen offset:2048 glc
2397; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2398; GFX90A-NEXT:    buffer_wbinvl1
2399; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2400;
2401; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
2402; GFX908:       ; %bb.0:
2403; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2404; GFX908-NEXT:    v_mov_b32_e32 v2, v0
2405; GFX908-NEXT:    v_mov_b32_e32 v0, s20
2406; GFX908-NEXT:    v_mov_b32_e32 v3, v1
2407; GFX908-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
2408; GFX908-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
2409; GFX908-NEXT:    s_add_i32 s6, s20, 0x800
2410; GFX908-NEXT:    s_mov_b64 s[4:5], 0
2411; GFX908-NEXT:    v_mov_b32_e32 v6, s6
2412; GFX908-NEXT:  .LBB9_1: ; %atomicrmw.start
2413; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
2414; GFX908-NEXT:    s_waitcnt vmcnt(0)
2415; GFX908-NEXT:    v_mov_b32_e32 v10, v1
2416; GFX908-NEXT:    v_mov_b32_e32 v9, v0
2417; GFX908-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
2418; GFX908-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
2419; GFX908-NEXT:    v_mov_b32_e32 v0, v7
2420; GFX908-NEXT:    v_mov_b32_e32 v1, v8
2421; GFX908-NEXT:    v_mov_b32_e32 v2, v9
2422; GFX908-NEXT:    v_mov_b32_e32 v3, v10
2423; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
2424; GFX908-NEXT:    s_waitcnt vmcnt(0)
2425; GFX908-NEXT:    buffer_wbinvl1
2426; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
2427; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2428; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2429; GFX908-NEXT:    s_cbranch_execnz .LBB9_1
2430; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
2431; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
2432; GFX908-NEXT:    s_setpc_b64 s[30:31]
2433;
2434; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
2435; GFX8:       ; %bb.0:
2436; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2437; GFX8-NEXT:    v_mov_b32_e32 v2, v0
2438; GFX8-NEXT:    v_mov_b32_e32 v0, s20
2439; GFX8-NEXT:    v_mov_b32_e32 v3, v1
2440; GFX8-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
2441; GFX8-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
2442; GFX8-NEXT:    s_add_i32 s6, s20, 0x800
2443; GFX8-NEXT:    s_mov_b64 s[4:5], 0
2444; GFX8-NEXT:    v_mov_b32_e32 v6, s6
2445; GFX8-NEXT:  .LBB9_1: ; %atomicrmw.start
2446; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
2447; GFX8-NEXT:    s_waitcnt vmcnt(0)
2448; GFX8-NEXT:    v_mov_b32_e32 v10, v1
2449; GFX8-NEXT:    v_mov_b32_e32 v9, v0
2450; GFX8-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
2451; GFX8-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
2452; GFX8-NEXT:    v_mov_b32_e32 v0, v7
2453; GFX8-NEXT:    v_mov_b32_e32 v1, v8
2454; GFX8-NEXT:    v_mov_b32_e32 v2, v9
2455; GFX8-NEXT:    v_mov_b32_e32 v3, v10
2456; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
2457; GFX8-NEXT:    s_waitcnt vmcnt(0)
2458; GFX8-NEXT:    buffer_wbinvl1
2459; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
2460; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2461; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2462; GFX8-NEXT:    s_cbranch_execnz .LBB9_1
2463; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
2464; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2465; GFX8-NEXT:    s_setpc_b64 s[30:31]
2466;
2467; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
2468; GFX7:       ; %bb.0:
2469; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2470; GFX7-NEXT:    v_mov_b32_e32 v2, s20
2471; GFX7-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[16:19], 0 offen offset:2048 glc
2472; GFX7-NEXT:    s_waitcnt vmcnt(0)
2473; GFX7-NEXT:    buffer_wbinvl1
2474; GFX7-NEXT:    s_setpc_b64 s[30:31]
2475;
2476; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
2477; GFX6:       ; %bb.0:
2478; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2479; GFX6-NEXT:    v_mov_b32_e32 v2, s20
2480; GFX6-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[16:19], 0 offen offset:2048 glc
2481; GFX6-NEXT:    s_waitcnt vmcnt(0)
2482; GFX6-NEXT:    buffer_wbinvl1
2483; GFX6-NEXT:    s_waitcnt expcnt(0)
2484; GFX6-NEXT:    s_setpc_b64 s[30:31]
  ; Element index 256 of double is a 2048-byte displacement, which the checks
  ; above show either as a 2048 immediate offset or as a 0x800 scalar add.
2485  %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
2486  %result = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
2487  ret double %result
2488}
2489
2490; --------------------------------------------------------------------
2491; half
2492; --------------------------------------------------------------------
2493
2494define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 {
2495; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
2496; GFX12:       ; %bb.0:
2497; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2498; GFX12-NEXT:    s_wait_expcnt 0x0
2499; GFX12-NEXT:    s_wait_samplecnt 0x0
2500; GFX12-NEXT:    s_wait_bvhcnt 0x0
2501; GFX12-NEXT:    s_wait_kmcnt 0x0
2502; GFX12-NEXT:    s_addk_co_i32 s16, 0x200
2503; GFX12-NEXT:    v_max_num_f16_e32 v5, v0, v0
2504; GFX12-NEXT:    s_wait_alu 0xfffe
2505; GFX12-NEXT:    s_and_b32 s4, s16, -4
2506; GFX12-NEXT:    s_wait_alu 0xfffe
2507; GFX12-NEXT:    v_mov_b32_e32 v4, s4
2508; GFX12-NEXT:    s_and_b32 s4, s16, 3
2509; GFX12-NEXT:    s_wait_alu 0xfffe
2510; GFX12-NEXT:    s_lshl_b32 s4, s4, 3
2511; GFX12-NEXT:    s_wait_alu 0xfffe
2512; GFX12-NEXT:    s_lshl_b32 s5, 0xffff, s4
2513; GFX12-NEXT:    buffer_load_b32 v1, v4, s[0:3], null offen
2514; GFX12-NEXT:    s_wait_alu 0xfffe
2515; GFX12-NEXT:    s_not_b32 s6, s5
2516; GFX12-NEXT:    s_mov_b32 s5, 0
2517; GFX12-NEXT:  .LBB10_1: ; %atomicrmw.start
2518; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
2519; GFX12-NEXT:    s_wait_loadcnt 0x0
2520; GFX12-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
2521; GFX12-NEXT:    s_wait_storecnt 0x0
2522; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2523; GFX12-NEXT:    v_max_num_f16_e32 v0, v0, v0
2524; GFX12-NEXT:    v_min_num_f16_e32 v0, v0, v5
2525; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2526; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2527; GFX12-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
2528; GFX12-NEXT:    s_wait_alu 0xfffe
2529; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2530; GFX12-NEXT:    v_and_or_b32 v0, v1, s6, v0
2531; GFX12-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
2532; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
2533; GFX12-NEXT:    s_wait_loadcnt 0x0
2534; GFX12-NEXT:    global_inv scope:SCOPE_DEV
2535; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
2536; GFX12-NEXT:    v_mov_b32_e32 v1, v2
2537; GFX12-NEXT:    s_or_b32 s5, vcc_lo, s5
2538; GFX12-NEXT:    s_wait_alu 0xfffe
2539; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
2540; GFX12-NEXT:    s_cbranch_execnz .LBB10_1
2541; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
2542; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s5
2543; GFX12-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
2544; GFX12-NEXT:    s_wait_alu 0xfffe
2545; GFX12-NEXT:    s_setpc_b64 s[30:31]
2546;
2547; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
2548; GFX940:       ; %bb.0:
2549; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2550; GFX940-NEXT:    s_addk_i32 s16, 0x200
2551; GFX940-NEXT:    s_and_b32 s4, s16, -4
2552; GFX940-NEXT:    v_mov_b32_e32 v4, s4
2553; GFX940-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen
2554; GFX940-NEXT:    s_and_b32 s4, s16, 3
2555; GFX940-NEXT:    s_lshl_b32 s6, s4, 3
2556; GFX940-NEXT:    s_lshl_b32 s4, 0xffff, s6
2557; GFX940-NEXT:    s_not_b32 s7, s4
2558; GFX940-NEXT:    s_mov_b64 s[4:5], 0
2559; GFX940-NEXT:    v_max_f16_e32 v5, v0, v0
2560; GFX940-NEXT:  .LBB10_1: ; %atomicrmw.start
2561; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
2562; GFX940-NEXT:    s_waitcnt vmcnt(0)
2563; GFX940-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
2564; GFX940-NEXT:    v_max_f16_e32 v0, v0, v0
2565; GFX940-NEXT:    v_min_f16_e32 v0, v0, v5
2566; GFX940-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
2567; GFX940-NEXT:    v_and_or_b32 v0, v1, s7, v0
2568; GFX940-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
2569; GFX940-NEXT:    buffer_wbl2 sc1
2570; GFX940-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0
2571; GFX940-NEXT:    s_waitcnt vmcnt(0)
2572; GFX940-NEXT:    buffer_inv sc1
2573; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
2574; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2575; GFX940-NEXT:    v_mov_b32_e32 v1, v2
2576; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2577; GFX940-NEXT:    s_cbranch_execnz .LBB10_1
2578; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
2579; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
2580; GFX940-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
2581; GFX940-NEXT:    s_setpc_b64 s[30:31]
2582;
2583; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
2584; GFX11:       ; %bb.0:
2585; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2586; GFX11-NEXT:    s_addk_i32 s16, 0x200
2587; GFX11-NEXT:    v_max_f16_e32 v5, v0, v0
2588; GFX11-NEXT:    s_and_b32 s4, s16, -4
2589; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
2590; GFX11-NEXT:    v_mov_b32_e32 v4, s4
2591; GFX11-NEXT:    s_and_b32 s4, s16, 3
2592; GFX11-NEXT:    s_lshl_b32 s4, s4, 3
2593; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2594; GFX11-NEXT:    s_lshl_b32 s5, 0xffff, s4
2595; GFX11-NEXT:    buffer_load_b32 v1, v4, s[0:3], 0 offen
2596; GFX11-NEXT:    s_not_b32 s6, s5
2597; GFX11-NEXT:    s_mov_b32 s5, 0
2598; GFX11-NEXT:    .p2align 6
2599; GFX11-NEXT:  .LBB10_1: ; %atomicrmw.start
2600; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
2601; GFX11-NEXT:    s_waitcnt vmcnt(0)
2602; GFX11-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
2603; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2604; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2605; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
2606; GFX11-NEXT:    v_min_f16_e32 v0, v0, v5
2607; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2608; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2609; GFX11-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
2610; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2611; GFX11-NEXT:    v_and_or_b32 v0, v1, s6, v0
2612; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
2613; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
2614; GFX11-NEXT:    s_waitcnt vmcnt(0)
2615; GFX11-NEXT:    buffer_gl1_inv
2616; GFX11-NEXT:    buffer_gl0_inv
2617; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
2618; GFX11-NEXT:    v_mov_b32_e32 v1, v2
2619; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
2620; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2621; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
2622; GFX11-NEXT:    s_cbranch_execnz .LBB10_1
2623; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
2624; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s5
2625; GFX11-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
2626; GFX11-NEXT:    s_setpc_b64 s[30:31]
2627;
2628; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
2629; GFX10:       ; %bb.0:
2630; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2631; GFX10-NEXT:    s_addk_i32 s20, 0x200
2632; GFX10-NEXT:    v_max_f16_e32 v5, v0, v0
2633; GFX10-NEXT:    s_and_b32 s4, s20, -4
2634; GFX10-NEXT:    v_mov_b32_e32 v4, s4
2635; GFX10-NEXT:    s_and_b32 s4, s20, 3
2636; GFX10-NEXT:    s_lshl_b32 s4, s4, 3
2637; GFX10-NEXT:    s_lshl_b32 s5, 0xffff, s4
2638; GFX10-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
2639; GFX10-NEXT:    s_not_b32 s6, s5
2640; GFX10-NEXT:    s_mov_b32 s5, 0
2641; GFX10-NEXT:  .LBB10_1: ; %atomicrmw.start
2642; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
2643; GFX10-NEXT:    s_waitcnt vmcnt(0)
2644; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
2645; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2646; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
2647; GFX10-NEXT:    v_min_f16_e32 v0, v0, v5
2648; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2649; GFX10-NEXT:    v_and_or_b32 v0, v1, s6, v0
2650; GFX10-NEXT:    v_mov_b32_e32 v3, v1
2651; GFX10-NEXT:    v_mov_b32_e32 v2, v0
2652; GFX10-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
2653; GFX10-NEXT:    s_waitcnt vmcnt(0)
2654; GFX10-NEXT:    buffer_gl1_inv
2655; GFX10-NEXT:    buffer_gl0_inv
2656; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
2657; GFX10-NEXT:    v_mov_b32_e32 v1, v2
2658; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
2659; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
2660; GFX10-NEXT:    s_cbranch_execnz .LBB10_1
2661; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
2662; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
2663; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
2664; GFX10-NEXT:    s_setpc_b64 s[30:31]
2665;
2666; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
2667; GFX90A:       ; %bb.0:
2668; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2669; GFX90A-NEXT:    s_addk_i32 s20, 0x200
2670; GFX90A-NEXT:    s_and_b32 s4, s20, -4
2671; GFX90A-NEXT:    v_mov_b32_e32 v4, s4
2672; GFX90A-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
2673; GFX90A-NEXT:    s_and_b32 s4, s20, 3
2674; GFX90A-NEXT:    s_lshl_b32 s6, s4, 3
2675; GFX90A-NEXT:    s_lshl_b32 s4, 0xffff, s6
2676; GFX90A-NEXT:    s_not_b32 s7, s4
2677; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
2678; GFX90A-NEXT:    v_max_f16_e32 v5, v0, v0
2679; GFX90A-NEXT:  .LBB10_1: ; %atomicrmw.start
2680; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
2681; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2682; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
2683; GFX90A-NEXT:    v_max_f16_e32 v0, v0, v0
2684; GFX90A-NEXT:    v_min_f16_e32 v0, v0, v5
2685; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
2686; GFX90A-NEXT:    v_and_or_b32 v0, v1, s7, v0
2687; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
2688; GFX90A-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
2689; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2690; GFX90A-NEXT:    buffer_wbinvl1
2691; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
2692; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2693; GFX90A-NEXT:    v_mov_b32_e32 v1, v2
2694; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2695; GFX90A-NEXT:    s_cbranch_execnz .LBB10_1
2696; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
2697; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
2698; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
2699; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2700;
2701; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
2702; GFX908:       ; %bb.0:
2703; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2704; GFX908-NEXT:    s_addk_i32 s20, 0x200
2705; GFX908-NEXT:    s_and_b32 s4, s20, -4
2706; GFX908-NEXT:    v_mov_b32_e32 v4, s4
2707; GFX908-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
2708; GFX908-NEXT:    s_and_b32 s4, s20, 3
2709; GFX908-NEXT:    s_lshl_b32 s6, s4, 3
2710; GFX908-NEXT:    s_lshl_b32 s4, 0xffff, s6
2711; GFX908-NEXT:    s_not_b32 s7, s4
2712; GFX908-NEXT:    s_mov_b64 s[4:5], 0
2713; GFX908-NEXT:    v_max_f16_e32 v5, v0, v0
2714; GFX908-NEXT:  .LBB10_1: ; %atomicrmw.start
2715; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
2716; GFX908-NEXT:    s_waitcnt vmcnt(0)
2717; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
2718; GFX908-NEXT:    v_max_f16_e32 v0, v0, v0
2719; GFX908-NEXT:    v_min_f16_e32 v0, v0, v5
2720; GFX908-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
2721; GFX908-NEXT:    v_and_or_b32 v0, v1, s7, v0
2722; GFX908-NEXT:    v_mov_b32_e32 v3, v1
2723; GFX908-NEXT:    v_mov_b32_e32 v2, v0
2724; GFX908-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
2725; GFX908-NEXT:    s_waitcnt vmcnt(0)
2726; GFX908-NEXT:    buffer_wbinvl1
2727; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
2728; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2729; GFX908-NEXT:    v_mov_b32_e32 v1, v2
2730; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2731; GFX908-NEXT:    s_cbranch_execnz .LBB10_1
2732; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
2733; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
2734; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
2735; GFX908-NEXT:    s_setpc_b64 s[30:31]
2736;
2737; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
2738; GFX8:       ; %bb.0:
2739; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2740; GFX8-NEXT:    s_addk_i32 s20, 0x200
2741; GFX8-NEXT:    s_and_b32 s4, s20, -4
2742; GFX8-NEXT:    v_mov_b32_e32 v4, s4
2743; GFX8-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
2744; GFX8-NEXT:    s_and_b32 s4, s20, 3
2745; GFX8-NEXT:    s_lshl_b32 s6, s4, 3
2746; GFX8-NEXT:    s_lshl_b32 s4, 0xffff, s6
2747; GFX8-NEXT:    s_not_b32 s7, s4
2748; GFX8-NEXT:    s_mov_b64 s[4:5], 0
2749; GFX8-NEXT:    v_max_f16_e32 v5, v0, v0
2750; GFX8-NEXT:  .LBB10_1: ; %atomicrmw.start
2751; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
2752; GFX8-NEXT:    s_waitcnt vmcnt(0)
2753; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
2754; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
2755; GFX8-NEXT:    v_min_f16_e32 v0, v0, v5
2756; GFX8-NEXT:    v_and_b32_e32 v2, s7, v1
2757; GFX8-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
2758; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
2759; GFX8-NEXT:    v_mov_b32_e32 v3, v1
2760; GFX8-NEXT:    v_mov_b32_e32 v2, v0
2761; GFX8-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
2762; GFX8-NEXT:    s_waitcnt vmcnt(0)
2763; GFX8-NEXT:    buffer_wbinvl1
2764; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
2765; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2766; GFX8-NEXT:    v_mov_b32_e32 v1, v2
2767; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2768; GFX8-NEXT:    s_cbranch_execnz .LBB10_1
2769; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
2770; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2771; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
2772; GFX8-NEXT:    s_setpc_b64 s[30:31]
2773;
2774; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
2775; GFX7:       ; %bb.0:
2776; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2777; GFX7-NEXT:    s_addk_i32 s20, 0x200
2778; GFX7-NEXT:    s_and_b32 s4, s20, -4
2779; GFX7-NEXT:    v_mov_b32_e32 v4, s4
2780; GFX7-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
2781; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
2782; GFX7-NEXT:    s_and_b32 s4, s20, 3
2783; GFX7-NEXT:    s_lshl_b32 s6, s4, 3
2784; GFX7-NEXT:    s_lshl_b32 s4, 0xffff, s6
2785; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v0
2786; GFX7-NEXT:    s_not_b32 s7, s4
2787; GFX7-NEXT:    s_mov_b64 s[4:5], 0
2788; GFX7-NEXT:  .LBB10_1: ; %atomicrmw.start
2789; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
2790; GFX7-NEXT:    s_waitcnt vmcnt(0)
2791; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
2792; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
2793; GFX7-NEXT:    v_and_b32_e32 v2, s7, v1
2794; GFX7-NEXT:    v_min_f32_e32 v0, v0, v5
2795; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
2796; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
2797; GFX7-NEXT:    v_or_b32_e32 v0, v2, v0
2798; GFX7-NEXT:    v_mov_b32_e32 v3, v1
2799; GFX7-NEXT:    v_mov_b32_e32 v2, v0
2800; GFX7-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
2801; GFX7-NEXT:    s_waitcnt vmcnt(0)
2802; GFX7-NEXT:    buffer_wbinvl1
2803; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
2804; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2805; GFX7-NEXT:    v_mov_b32_e32 v1, v2
2806; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2807; GFX7-NEXT:    s_cbranch_execnz .LBB10_1
2808; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
2809; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
2810; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
2811; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
2812; GFX7-NEXT:    s_setpc_b64 s[30:31]
2813;
2814; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
2815; GFX6:       ; %bb.0:
2816; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2817; GFX6-NEXT:    s_addk_i32 s20, 0x200
2818; GFX6-NEXT:    s_and_b32 s4, s20, -4
2819; GFX6-NEXT:    v_mov_b32_e32 v4, s4
2820; GFX6-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
2821; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
2822; GFX6-NEXT:    s_and_b32 s4, s20, 3
2823; GFX6-NEXT:    s_lshl_b32 s6, s4, 3
2824; GFX6-NEXT:    s_lshl_b32 s4, 0xffff, s6
2825; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v0
2826; GFX6-NEXT:    s_not_b32 s7, s4
2827; GFX6-NEXT:    s_mov_b64 s[4:5], 0
2828; GFX6-NEXT:  .LBB10_1: ; %atomicrmw.start
2829; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
2830; GFX6-NEXT:    s_waitcnt vmcnt(0)
2831; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
2832; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
2833; GFX6-NEXT:    s_waitcnt expcnt(0)
2834; GFX6-NEXT:    v_and_b32_e32 v2, s7, v1
2835; GFX6-NEXT:    v_min_f32_e32 v0, v0, v5
2836; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
2837; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
2838; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
2839; GFX6-NEXT:    v_mov_b32_e32 v3, v1
2840; GFX6-NEXT:    v_mov_b32_e32 v2, v0
2841; GFX6-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
2842; GFX6-NEXT:    s_waitcnt vmcnt(0)
2843; GFX6-NEXT:    buffer_wbinvl1
2844; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
2845; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2846; GFX6-NEXT:    v_mov_b32_e32 v1, v2
2847; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2848; GFX6-NEXT:    s_cbranch_execnz .LBB10_1
2849; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
2850; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
2851; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
2852; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
2853; GFX6-NEXT:    s_waitcnt expcnt(0)
2854; GFX6-NEXT:    s_setpc_b64 s[30:31]
2855  %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256
2856  %result = atomicrmw fmin ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
2857  ret half %result
2858}
2859
; No-return variant: an agent-scope, seq_cst `atomicrmw fmin` on a half located
; 256 elements past an inreg buffer fat pointer (addrspace(7)). The value the
; atomic returns is discarded (%unused, ret void).
; In the expected output below, every listed target expands the operation into
; a retry loop (.LBB11_1) built on a dword buffer_atomic_cmpswap of the 32-bit
; word that contains the half; no native f16 atomic min instruction appears.
2860define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 {
2861; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory:
2862; GFX12:       ; %bb.0:
2863; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2864; GFX12-NEXT:    s_wait_expcnt 0x0
2865; GFX12-NEXT:    s_wait_samplecnt 0x0
2866; GFX12-NEXT:    s_wait_bvhcnt 0x0
2867; GFX12-NEXT:    s_wait_kmcnt 0x0
2868; GFX12-NEXT:    s_addk_co_i32 s16, 0x200
2869; GFX12-NEXT:    v_max_num_f16_e32 v3, v0, v0
2870; GFX12-NEXT:    s_wait_alu 0xfffe
2871; GFX12-NEXT:    s_and_b32 s4, s16, -4
2872; GFX12-NEXT:    s_wait_alu 0xfffe
2873; GFX12-NEXT:    v_mov_b32_e32 v2, s4
2874; GFX12-NEXT:    s_and_b32 s4, s16, 3
2875; GFX12-NEXT:    s_wait_alu 0xfffe
2876; GFX12-NEXT:    s_lshl_b32 s4, s4, 3
2877; GFX12-NEXT:    s_wait_alu 0xfffe
2878; GFX12-NEXT:    s_lshl_b32 s5, 0xffff, s4
2879; GFX12-NEXT:    buffer_load_b32 v1, v2, s[0:3], null offen
2880; GFX12-NEXT:    s_wait_alu 0xfffe
2881; GFX12-NEXT:    s_not_b32 s6, s5
2882; GFX12-NEXT:    s_mov_b32 s5, 0
2883; GFX12-NEXT:  .LBB11_1: ; %atomicrmw.start
2884; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
2885; GFX12-NEXT:    s_wait_loadcnt 0x0
2886; GFX12-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
2887; GFX12-NEXT:    s_wait_storecnt 0x0
2888; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2889; GFX12-NEXT:    v_max_num_f16_e32 v0, v0, v0
2890; GFX12-NEXT:    v_min_num_f16_e32 v0, v0, v3
2891; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2892; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2893; GFX12-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
2894; GFX12-NEXT:    s_wait_alu 0xfffe
2895; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2896; GFX12-NEXT:    v_and_or_b32 v0, v1, s6, v0
2897; GFX12-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
2898; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
2899; GFX12-NEXT:    s_wait_loadcnt 0x0
2900; GFX12-NEXT:    global_inv scope:SCOPE_DEV
2901; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
2902; GFX12-NEXT:    v_mov_b32_e32 v1, v4
2903; GFX12-NEXT:    s_or_b32 s5, vcc_lo, s5
2904; GFX12-NEXT:    s_wait_alu 0xfffe
2905; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
2906; GFX12-NEXT:    s_cbranch_execnz .LBB11_1
2907; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
2908; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s5
2909; GFX12-NEXT:    s_wait_alu 0xfffe
2910; GFX12-NEXT:    s_setpc_b64 s[30:31]
2911;
2912; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory:
2913; GFX940:       ; %bb.0:
2914; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2915; GFX940-NEXT:    s_addk_i32 s16, 0x200
2916; GFX940-NEXT:    s_and_b32 s4, s16, -4
2917; GFX940-NEXT:    v_mov_b32_e32 v2, s4
2918; GFX940-NEXT:    buffer_load_dword v1, v2, s[0:3], 0 offen
2919; GFX940-NEXT:    s_and_b32 s4, s16, 3
2920; GFX940-NEXT:    s_lshl_b32 s6, s4, 3
2921; GFX940-NEXT:    s_lshl_b32 s4, 0xffff, s6
2922; GFX940-NEXT:    s_not_b32 s7, s4
2923; GFX940-NEXT:    s_mov_b64 s[4:5], 0
2924; GFX940-NEXT:    v_max_f16_e32 v3, v0, v0
2925; GFX940-NEXT:  .LBB11_1: ; %atomicrmw.start
2926; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
2927; GFX940-NEXT:    s_waitcnt vmcnt(0)
2928; GFX940-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
2929; GFX940-NEXT:    v_max_f16_e32 v0, v0, v0
2930; GFX940-NEXT:    v_min_f16_e32 v0, v0, v3
2931; GFX940-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
2932; GFX940-NEXT:    v_and_or_b32 v0, v1, s7, v0
2933; GFX940-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
2934; GFX940-NEXT:    buffer_wbl2 sc1
2935; GFX940-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0
2936; GFX940-NEXT:    s_waitcnt vmcnt(0)
2937; GFX940-NEXT:    buffer_inv sc1
2938; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
2939; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2940; GFX940-NEXT:    v_mov_b32_e32 v1, v4
2941; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2942; GFX940-NEXT:    s_cbranch_execnz .LBB11_1
2943; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
2944; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
2945; GFX940-NEXT:    s_setpc_b64 s[30:31]
2946;
2947; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory:
2948; GFX11:       ; %bb.0:
2949; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2950; GFX11-NEXT:    s_addk_i32 s16, 0x200
2951; GFX11-NEXT:    v_max_f16_e32 v3, v0, v0
2952; GFX11-NEXT:    s_and_b32 s4, s16, -4
2953; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
2954; GFX11-NEXT:    v_mov_b32_e32 v2, s4
2955; GFX11-NEXT:    s_and_b32 s4, s16, 3
2956; GFX11-NEXT:    s_lshl_b32 s4, s4, 3
2957; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2958; GFX11-NEXT:    s_lshl_b32 s5, 0xffff, s4
2959; GFX11-NEXT:    buffer_load_b32 v1, v2, s[0:3], 0 offen
2960; GFX11-NEXT:    s_not_b32 s6, s5
2961; GFX11-NEXT:    s_mov_b32 s5, 0
2962; GFX11-NEXT:    .p2align 6
2963; GFX11-NEXT:  .LBB11_1: ; %atomicrmw.start
2964; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
2965; GFX11-NEXT:    s_waitcnt vmcnt(0)
2966; GFX11-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
2967; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2968; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2969; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
2970; GFX11-NEXT:    v_min_f16_e32 v0, v0, v3
2971; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2972; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2973; GFX11-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
2974; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2975; GFX11-NEXT:    v_and_or_b32 v0, v1, s6, v0
2976; GFX11-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
2977; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
2978; GFX11-NEXT:    s_waitcnt vmcnt(0)
2979; GFX11-NEXT:    buffer_gl1_inv
2980; GFX11-NEXT:    buffer_gl0_inv
2981; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
2982; GFX11-NEXT:    v_mov_b32_e32 v1, v4
2983; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
2984; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2985; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
2986; GFX11-NEXT:    s_cbranch_execnz .LBB11_1
2987; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
2988; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s5
2989; GFX11-NEXT:    s_setpc_b64 s[30:31]
2990;
2991; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory:
2992; GFX10:       ; %bb.0:
2993; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2994; GFX10-NEXT:    s_addk_i32 s20, 0x200
2995; GFX10-NEXT:    v_max_f16_e32 v3, v0, v0
2996; GFX10-NEXT:    s_and_b32 s4, s20, -4
2997; GFX10-NEXT:    v_mov_b32_e32 v2, s4
2998; GFX10-NEXT:    s_and_b32 s4, s20, 3
2999; GFX10-NEXT:    s_lshl_b32 s4, s4, 3
3000; GFX10-NEXT:    s_lshl_b32 s5, 0xffff, s4
3001; GFX10-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
3002; GFX10-NEXT:    s_not_b32 s6, s5
3003; GFX10-NEXT:    s_mov_b32 s5, 0
3004; GFX10-NEXT:  .LBB11_1: ; %atomicrmw.start
3005; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
3006; GFX10-NEXT:    s_waitcnt vmcnt(0)
3007; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
3008; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3009; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
3010; GFX10-NEXT:    v_min_f16_e32 v0, v0, v3
3011; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
3012; GFX10-NEXT:    v_and_or_b32 v0, v1, s6, v0
3013; GFX10-NEXT:    v_mov_b32_e32 v5, v1
3014; GFX10-NEXT:    v_mov_b32_e32 v4, v0
3015; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
3016; GFX10-NEXT:    s_waitcnt vmcnt(0)
3017; GFX10-NEXT:    buffer_gl1_inv
3018; GFX10-NEXT:    buffer_gl0_inv
3019; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
3020; GFX10-NEXT:    v_mov_b32_e32 v1, v4
3021; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
3022; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
3023; GFX10-NEXT:    s_cbranch_execnz .LBB11_1
3024; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
3025; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
3026; GFX10-NEXT:    s_setpc_b64 s[30:31]
3027;
3028; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory:
3029; GFX90A:       ; %bb.0:
3030; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3031; GFX90A-NEXT:    s_addk_i32 s20, 0x200
3032; GFX90A-NEXT:    s_and_b32 s4, s20, -4
3033; GFX90A-NEXT:    v_mov_b32_e32 v2, s4
3034; GFX90A-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
3035; GFX90A-NEXT:    s_and_b32 s4, s20, 3
3036; GFX90A-NEXT:    s_lshl_b32 s6, s4, 3
3037; GFX90A-NEXT:    s_lshl_b32 s4, 0xffff, s6
3038; GFX90A-NEXT:    s_not_b32 s7, s4
3039; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
3040; GFX90A-NEXT:    v_max_f16_e32 v3, v0, v0
3041; GFX90A-NEXT:  .LBB11_1: ; %atomicrmw.start
3042; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
3043; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3044; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
3045; GFX90A-NEXT:    v_max_f16_e32 v0, v0, v0
3046; GFX90A-NEXT:    v_min_f16_e32 v0, v0, v3
3047; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
3048; GFX90A-NEXT:    v_and_or_b32 v0, v1, s7, v0
3049; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
3050; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
3051; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3052; GFX90A-NEXT:    buffer_wbinvl1
3053; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
3054; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3055; GFX90A-NEXT:    v_mov_b32_e32 v1, v4
3056; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3057; GFX90A-NEXT:    s_cbranch_execnz .LBB11_1
3058; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
3059; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
3060; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3061;
3062; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory:
3063; GFX908:       ; %bb.0:
3064; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3065; GFX908-NEXT:    s_addk_i32 s20, 0x200
3066; GFX908-NEXT:    s_and_b32 s4, s20, -4
3067; GFX908-NEXT:    v_mov_b32_e32 v2, s4
3068; GFX908-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
3069; GFX908-NEXT:    s_and_b32 s4, s20, 3
3070; GFX908-NEXT:    s_lshl_b32 s6, s4, 3
3071; GFX908-NEXT:    s_lshl_b32 s4, 0xffff, s6
3072; GFX908-NEXT:    s_not_b32 s7, s4
3073; GFX908-NEXT:    s_mov_b64 s[4:5], 0
3074; GFX908-NEXT:    v_max_f16_e32 v3, v0, v0
3075; GFX908-NEXT:  .LBB11_1: ; %atomicrmw.start
3076; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
3077; GFX908-NEXT:    s_waitcnt vmcnt(0)
3078; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
3079; GFX908-NEXT:    v_max_f16_e32 v0, v0, v0
3080; GFX908-NEXT:    v_min_f16_e32 v0, v0, v3
3081; GFX908-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
3082; GFX908-NEXT:    v_and_or_b32 v0, v1, s7, v0
3083; GFX908-NEXT:    v_mov_b32_e32 v5, v1
3084; GFX908-NEXT:    v_mov_b32_e32 v4, v0
3085; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
3086; GFX908-NEXT:    s_waitcnt vmcnt(0)
3087; GFX908-NEXT:    buffer_wbinvl1
3088; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
3089; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3090; GFX908-NEXT:    v_mov_b32_e32 v1, v4
3091; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3092; GFX908-NEXT:    s_cbranch_execnz .LBB11_1
3093; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
3094; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
3095; GFX908-NEXT:    s_setpc_b64 s[30:31]
3096;
3097; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory:
3098; GFX8:       ; %bb.0:
3099; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3100; GFX8-NEXT:    s_addk_i32 s20, 0x200
3101; GFX8-NEXT:    s_and_b32 s4, s20, -4
3102; GFX8-NEXT:    v_mov_b32_e32 v2, s4
3103; GFX8-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
3104; GFX8-NEXT:    s_and_b32 s4, s20, 3
3105; GFX8-NEXT:    s_lshl_b32 s6, s4, 3
3106; GFX8-NEXT:    s_lshl_b32 s4, 0xffff, s6
3107; GFX8-NEXT:    s_not_b32 s7, s4
3108; GFX8-NEXT:    s_mov_b64 s[4:5], 0
3109; GFX8-NEXT:    v_max_f16_e32 v3, v0, v0
3110; GFX8-NEXT:  .LBB11_1: ; %atomicrmw.start
3111; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
3112; GFX8-NEXT:    s_waitcnt vmcnt(0)
3113; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
3114; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
3115; GFX8-NEXT:    v_min_f16_e32 v0, v0, v3
3116; GFX8-NEXT:    v_and_b32_e32 v4, s7, v1
3117; GFX8-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
3118; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
3119; GFX8-NEXT:    v_mov_b32_e32 v5, v1
3120; GFX8-NEXT:    v_mov_b32_e32 v4, v0
3121; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
3122; GFX8-NEXT:    s_waitcnt vmcnt(0)
3123; GFX8-NEXT:    buffer_wbinvl1
3124; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
3125; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3126; GFX8-NEXT:    v_mov_b32_e32 v1, v4
3127; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3128; GFX8-NEXT:    s_cbranch_execnz .LBB11_1
3129; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
3130; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
3131; GFX8-NEXT:    s_setpc_b64 s[30:31]
3132;
3133; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory:
3134; GFX7:       ; %bb.0:
3135; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3136; GFX7-NEXT:    s_addk_i32 s20, 0x200
3137; GFX7-NEXT:    s_and_b32 s4, s20, -4
3138; GFX7-NEXT:    v_mov_b32_e32 v2, s4
3139; GFX7-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
3140; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
3141; GFX7-NEXT:    s_and_b32 s4, s20, 3
3142; GFX7-NEXT:    s_lshl_b32 s6, s4, 3
3143; GFX7-NEXT:    s_lshl_b32 s4, 0xffff, s6
3144; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v0
3145; GFX7-NEXT:    s_not_b32 s7, s4
3146; GFX7-NEXT:    s_mov_b64 s[4:5], 0
3147; GFX7-NEXT:  .LBB11_1: ; %atomicrmw.start
3148; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
3149; GFX7-NEXT:    s_waitcnt vmcnt(0)
3150; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
3151; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
3152; GFX7-NEXT:    v_and_b32_e32 v4, s7, v1
3153; GFX7-NEXT:    v_min_f32_e32 v0, v0, v3
3154; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
3155; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
3156; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
3157; GFX7-NEXT:    v_mov_b32_e32 v5, v1
3158; GFX7-NEXT:    v_mov_b32_e32 v4, v0
3159; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
3160; GFX7-NEXT:    s_waitcnt vmcnt(0)
3161; GFX7-NEXT:    buffer_wbinvl1
3162; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
3163; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3164; GFX7-NEXT:    v_mov_b32_e32 v1, v4
3165; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3166; GFX7-NEXT:    s_cbranch_execnz .LBB11_1
3167; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
3168; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
3169; GFX7-NEXT:    s_setpc_b64 s[30:31]
3170;
3171; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory:
3172; GFX6:       ; %bb.0:
3173; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3174; GFX6-NEXT:    s_addk_i32 s20, 0x200
3175; GFX6-NEXT:    s_and_b32 s4, s20, -4
3176; GFX6-NEXT:    v_mov_b32_e32 v2, s4
3177; GFX6-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
3178; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
3179; GFX6-NEXT:    s_and_b32 s4, s20, 3
3180; GFX6-NEXT:    s_lshl_b32 s6, s4, 3
3181; GFX6-NEXT:    s_lshl_b32 s4, 0xffff, s6
3182; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v0
3183; GFX6-NEXT:    s_not_b32 s7, s4
3184; GFX6-NEXT:    s_mov_b64 s[4:5], 0
3185; GFX6-NEXT:  .LBB11_1: ; %atomicrmw.start
3186; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
3187; GFX6-NEXT:    s_waitcnt vmcnt(0)
3188; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
3189; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
3190; GFX6-NEXT:    s_waitcnt expcnt(0)
3191; GFX6-NEXT:    v_and_b32_e32 v4, s7, v1
3192; GFX6-NEXT:    v_min_f32_e32 v0, v0, v3
3193; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
3194; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
3195; GFX6-NEXT:    v_or_b32_e32 v0, v4, v0
3196; GFX6-NEXT:    v_mov_b32_e32 v5, v1
3197; GFX6-NEXT:    v_mov_b32_e32 v4, v0
3198; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
3199; GFX6-NEXT:    s_waitcnt vmcnt(0)
3200; GFX6-NEXT:    buffer_wbinvl1
3201; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
3202; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3203; GFX6-NEXT:    v_mov_b32_e32 v1, v4
3204; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3205; GFX6-NEXT:    s_cbranch_execnz .LBB11_1
3206; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
3207; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
3208; GFX6-NEXT:    s_waitcnt expcnt(0)
3209; GFX6-NEXT:    s_setpc_b64 s[30:31]
; 256 half elements = 512 bytes, which is the 0x200 that the expected output
; above adds to the offset register before aligning it down to a dword.
3210  %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256
; The result is bound to %unused and never read; the function returns void.
3211  %unused = atomicrmw fmin ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
3212  ret void
3213}
3214
; Test: value-returning `atomicrmw fmin` on half (agent scope, seq_cst) through
; a buffer fat pointer (addrspace(7)) whose %ptr argument is NOT marked inreg,
; addressed 256 half elements past %ptr.
;
; What the expected output below shows for every target in the RUN lines:
;   * 0x200 is added to the offset register, which is then split into a
;     dword-aligned address (and with -4) and a bit shift ((and with 3) << 3)
;     that selects the 16-bit field inside the loaded dword;
;   * both the initial buffer load and the buffer cmpswap are wrapped in
;     v_readfirstlane loops over v0-v3 that restrict exec to the lanes whose
;     v0-v3 match the first lane (the "waterfall" in the test name);
;   * the cmpswap on the containing dword is retried until the value it
;     returns equals the value that was expected;
;   * the result is shifted back down by the same bit shift before returning.
; The hawaii and tahiti expectations convert to f32 and use v_min_f32 instead
; of a 16-bit min instruction.
;
; NOTE(review): v0-v3 presumably carry the buffer resource because %ptr lacks
; inreg - confirm against the calling convention.
;
; The assertion lines are autogenerated by utils/update_llc_test_checks.py (see
; the first line of the file); regenerate them instead of editing by hand.
3215define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, half %val) #0 {
3216; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
3217; GFX12:       ; %bb.0:
3218; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
3219; GFX12-NEXT:    s_wait_expcnt 0x0
3220; GFX12-NEXT:    s_wait_samplecnt 0x0
3221; GFX12-NEXT:    s_wait_bvhcnt 0x0
3222; GFX12-NEXT:    s_wait_kmcnt 0x0
3223; GFX12-NEXT:    v_add_nc_u32_e32 v4, 0x200, v4
3224; GFX12-NEXT:    s_mov_b32 s1, exec_lo
3225; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3226; GFX12-NEXT:    v_and_b32_e32 v6, 3, v4
3227; GFX12-NEXT:    v_and_b32_e32 v8, -4, v4
3228; GFX12-NEXT:    v_lshlrev_b32_e32 v7, 3, v6
3229; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3230; GFX12-NEXT:    v_lshlrev_b32_e64 v6, v7, 0xffff
3231; GFX12-NEXT:    v_not_b32_e32 v9, v6
3232; GFX12-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
3233; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
3234; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
3235; GFX12-NEXT:    v_readfirstlane_b32 s6, v2
3236; GFX12-NEXT:    v_readfirstlane_b32 s7, v3
3237; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
3238; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
3239; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
3240; GFX12-NEXT:    s_wait_alu 0xfffe
3241; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3242; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
3243; GFX12-NEXT:    s_wait_alu 0xfffe
3244; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
3245; GFX12-NEXT:    s_wait_loadcnt 0x0
3246; GFX12-NEXT:    buffer_load_b32 v6, v8, s[4:7], null offen
3247; GFX12-NEXT:    s_wait_alu 0xfffe
3248; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
3249; GFX12-NEXT:    s_cbranch_execnz .LBB12_1
3250; GFX12-NEXT:  ; %bb.2:
3251; GFX12-NEXT:    s_mov_b32 exec_lo, s1
3252; GFX12-NEXT:    v_max_num_f16_e32 v10, v5, v5
3253; GFX12-NEXT:    s_mov_b32 s1, 0
3254; GFX12-NEXT:  .LBB12_3: ; %atomicrmw.start
3255; GFX12-NEXT:    ; =>This Loop Header: Depth=1
3256; GFX12-NEXT:    ; Child Loop BB12_4 Depth 2
3257; GFX12-NEXT:    s_wait_loadcnt 0x0
3258; GFX12-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
3259; GFX12-NEXT:    s_mov_b32 s2, exec_lo
3260; GFX12-NEXT:    s_wait_storecnt 0x0
3261; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3262; GFX12-NEXT:    v_max_num_f16_e32 v4, v4, v4
3263; GFX12-NEXT:    v_min_num_f16_e32 v4, v4, v10
3264; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3265; GFX12-NEXT:    v_and_b32_e32 v4, 0xffff, v4
3266; GFX12-NEXT:    v_lshlrev_b32_e32 v4, v7, v4
3267; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3268; GFX12-NEXT:    v_and_or_b32 v5, v6, v9, v4
3269; GFX12-NEXT:    v_mov_b32_e32 v4, v5
3270; GFX12-NEXT:    v_mov_b32_e32 v5, v6
3271; GFX12-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
3272; GFX12-NEXT:    ; => This Inner Loop Header: Depth=2
3273; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
3274; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
3275; GFX12-NEXT:    v_readfirstlane_b32 s6, v2
3276; GFX12-NEXT:    v_readfirstlane_b32 s7, v3
3277; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
3278; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
3279; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
3280; GFX12-NEXT:    s_wait_alu 0xfffe
3281; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3282; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
3283; GFX12-NEXT:    s_wait_alu 0xfffe
3284; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
3285; GFX12-NEXT:    s_wait_loadcnt 0x0
3286; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
3287; GFX12-NEXT:    s_wait_alu 0xfffe
3288; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
3289; GFX12-NEXT:    s_cbranch_execnz .LBB12_4
3290; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
3291; GFX12-NEXT:    s_mov_b32 exec_lo, s2
3292; GFX12-NEXT:    s_wait_loadcnt 0x0
3293; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
3294; GFX12-NEXT:    v_mov_b32_e32 v6, v4
3295; GFX12-NEXT:    global_inv scope:SCOPE_DEV
3296; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
3297; GFX12-NEXT:    s_wait_alu 0xfffe
3298; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
3299; GFX12-NEXT:    s_cbranch_execnz .LBB12_3
3300; GFX12-NEXT:  ; %bb.6: ; %atomicrmw.end
3301; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
3302; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
3303; GFX12-NEXT:    s_wait_alu 0xfffe
3304; GFX12-NEXT:    s_setpc_b64 s[30:31]
3305;
3306; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
3307; GFX940:       ; %bb.0:
3308; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3309; GFX940-NEXT:    v_add_u32_e32 v4, 0x200, v4
3310; GFX940-NEXT:    v_and_b32_e32 v9, -4, v4
3311; GFX940-NEXT:    v_and_b32_e32 v4, 3, v4
3312; GFX940-NEXT:    v_lshlrev_b32_e32 v8, 3, v4
3313; GFX940-NEXT:    s_mov_b32 s0, 0xffff
3314; GFX940-NEXT:    v_lshlrev_b32_e64 v4, v8, s0
3315; GFX940-NEXT:    v_not_b32_e32 v10, v4
3316; GFX940-NEXT:    s_mov_b64 s[2:3], exec
3317; GFX940-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
3318; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
3319; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
3320; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
3321; GFX940-NEXT:    v_readfirstlane_b32 s7, v3
3322; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
3323; GFX940-NEXT:    s_nop 0
3324; GFX940-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
3325; GFX940-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
3326; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
3327; GFX940-NEXT:    buffer_load_dword v7, v9, s[4:7], 0 offen
3328; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
3329; GFX940-NEXT:    s_cbranch_execnz .LBB12_1
3330; GFX940-NEXT:  ; %bb.2:
3331; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
3332; GFX940-NEXT:    s_mov_b64 s[2:3], 0
3333; GFX940-NEXT:    v_max_f16_e32 v11, v5, v5
3334; GFX940-NEXT:  .LBB12_3: ; %atomicrmw.start
3335; GFX940-NEXT:    ; =>This Loop Header: Depth=1
3336; GFX940-NEXT:    ; Child Loop BB12_4 Depth 2
3337; GFX940-NEXT:    s_waitcnt vmcnt(0)
3338; GFX940-NEXT:    v_lshrrev_b32_e32 v4, v8, v7
3339; GFX940-NEXT:    v_max_f16_e32 v4, v4, v4
3340; GFX940-NEXT:    v_min_f16_e32 v4, v4, v11
3341; GFX940-NEXT:    v_lshlrev_b32_e32 v4, v8, v4
3342; GFX940-NEXT:    v_and_or_b32 v6, v7, v10, v4
3343; GFX940-NEXT:    s_mov_b64 s[8:9], exec
3344; GFX940-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
3345; GFX940-NEXT:    buffer_wbl2 sc1
3346; GFX940-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
3347; GFX940-NEXT:    ; => This Inner Loop Header: Depth=2
3348; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
3349; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
3350; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
3351; GFX940-NEXT:    v_readfirstlane_b32 s7, v3
3352; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
3353; GFX940-NEXT:    s_nop 0
3354; GFX940-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
3355; GFX940-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
3356; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
3357; GFX940-NEXT:    s_waitcnt vmcnt(0)
3358; GFX940-NEXT:    buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0
3359; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
3360; GFX940-NEXT:    s_cbranch_execnz .LBB12_4
3361; GFX940-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
3362; GFX940-NEXT:    s_mov_b64 exec, s[8:9]
3363; GFX940-NEXT:    s_waitcnt vmcnt(0)
3364; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
3365; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
3366; GFX940-NEXT:    v_mov_b32_e32 v7, v4
3367; GFX940-NEXT:    buffer_inv sc1
3368; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
3369; GFX940-NEXT:    s_cbranch_execnz .LBB12_3
3370; GFX940-NEXT:  ; %bb.6: ; %atomicrmw.end
3371; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
3372; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v8, v4
3373; GFX940-NEXT:    s_setpc_b64 s[30:31]
3374;
3375; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
3376; GFX11:       ; %bb.0:
3377; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3378; GFX11-NEXT:    v_add_nc_u32_e32 v4, 0x200, v4
3379; GFX11-NEXT:    s_mov_b32 s1, 0
3380; GFX11-NEXT:    s_mov_b32 s2, exec_lo
3381; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3382; GFX11-NEXT:    v_and_b32_e32 v6, 3, v4
3383; GFX11-NEXT:    v_and_b32_e32 v8, -4, v4
3384; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 3, v6
3385; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3386; GFX11-NEXT:    v_lshlrev_b32_e64 v6, v7, 0xffff
3387; GFX11-NEXT:    v_not_b32_e32 v9, v6
3388; GFX11-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
3389; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
3390; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
3391; GFX11-NEXT:    v_readfirstlane_b32 s6, v2
3392; GFX11-NEXT:    v_readfirstlane_b32 s7, v3
3393; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
3394; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
3395; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
3396; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
3397; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
3398; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
3399; GFX11-NEXT:    buffer_load_b32 v6, v8, s[4:7], 0 offen
3400; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
3401; GFX11-NEXT:    s_cbranch_execnz .LBB12_1
3402; GFX11-NEXT:  ; %bb.2:
3403; GFX11-NEXT:    s_mov_b32 exec_lo, s2
3404; GFX11-NEXT:    v_max_f16_e32 v10, v5, v5
3405; GFX11-NEXT:    .p2align 6
3406; GFX11-NEXT:  .LBB12_3: ; %atomicrmw.start
3407; GFX11-NEXT:    ; =>This Loop Header: Depth=1
3408; GFX11-NEXT:    ; Child Loop BB12_4 Depth 2
3409; GFX11-NEXT:    s_waitcnt vmcnt(0)
3410; GFX11-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
3411; GFX11-NEXT:    s_mov_b32 s2, exec_lo
3412; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3413; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3414; GFX11-NEXT:    v_max_f16_e32 v4, v4, v4
3415; GFX11-NEXT:    v_min_f16_e32 v4, v4, v10
3416; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3417; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
3418; GFX11-NEXT:    v_lshlrev_b32_e32 v4, v7, v4
3419; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3420; GFX11-NEXT:    v_and_or_b32 v5, v6, v9, v4
3421; GFX11-NEXT:    v_mov_b32_e32 v4, v5
3422; GFX11-NEXT:    v_mov_b32_e32 v5, v6
3423; GFX11-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
3424; GFX11-NEXT:    ; => This Inner Loop Header: Depth=2
3425; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
3426; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
3427; GFX11-NEXT:    v_readfirstlane_b32 s6, v2
3428; GFX11-NEXT:    v_readfirstlane_b32 s7, v3
3429; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
3430; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
3431; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
3432; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
3433; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
3434; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
3435; GFX11-NEXT:    s_waitcnt vmcnt(0)
3436; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
3437; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
3438; GFX11-NEXT:    s_cbranch_execnz .LBB12_4
3439; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
3440; GFX11-NEXT:    s_mov_b32 exec_lo, s2
3441; GFX11-NEXT:    s_waitcnt vmcnt(0)
3442; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
3443; GFX11-NEXT:    v_mov_b32_e32 v6, v4
3444; GFX11-NEXT:    buffer_gl1_inv
3445; GFX11-NEXT:    buffer_gl0_inv
3446; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
3447; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
3448; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
3449; GFX11-NEXT:    s_cbranch_execnz .LBB12_3
3450; GFX11-NEXT:  ; %bb.6: ; %atomicrmw.end
3451; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
3452; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
3453; GFX11-NEXT:    s_setpc_b64 s[30:31]
3454;
3455; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
3456; GFX10:       ; %bb.0:
3457; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3458; GFX10-NEXT:    v_add_nc_u32_e32 v4, 0x200, v4
3459; GFX10-NEXT:    s_mov_b32 s5, 0
3460; GFX10-NEXT:    s_mov_b32 s6, exec_lo
3461; GFX10-NEXT:    v_and_b32_e32 v6, 3, v4
3462; GFX10-NEXT:    v_and_b32_e32 v8, -4, v4
3463; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 3, v6
3464; GFX10-NEXT:    v_lshlrev_b32_e64 v6, v7, 0xffff
3465; GFX10-NEXT:    v_not_b32_e32 v9, v6
3466; GFX10-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
3467; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
3468; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
3469; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
3470; GFX10-NEXT:    v_readfirstlane_b32 s11, v3
3471; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
3472; GFX10-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
3473; GFX10-NEXT:    s_and_b32 s4, vcc_lo, s4
3474; GFX10-NEXT:    s_and_saveexec_b32 s4, s4
3475; GFX10-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
3476; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
3477; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
3478; GFX10-NEXT:    s_cbranch_execnz .LBB12_1
3479; GFX10-NEXT:  ; %bb.2:
3480; GFX10-NEXT:    s_mov_b32 exec_lo, s6
3481; GFX10-NEXT:    v_max_f16_e32 v10, v5, v5
3482; GFX10-NEXT:  .LBB12_3: ; %atomicrmw.start
3483; GFX10-NEXT:    ; =>This Loop Header: Depth=1
3484; GFX10-NEXT:    ; Child Loop BB12_4 Depth 2
3485; GFX10-NEXT:    s_waitcnt vmcnt(0)
3486; GFX10-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
3487; GFX10-NEXT:    s_mov_b32 s6, exec_lo
3488; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3489; GFX10-NEXT:    v_max_f16_e32 v4, v4, v4
3490; GFX10-NEXT:    v_min_f16_e32 v4, v4, v10
3491; GFX10-NEXT:    v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
3492; GFX10-NEXT:    v_and_or_b32 v5, v6, v9, v4
3493; GFX10-NEXT:    v_mov_b32_e32 v4, v5
3494; GFX10-NEXT:    v_mov_b32_e32 v5, v6
3495; GFX10-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
3496; GFX10-NEXT:    ; => This Inner Loop Header: Depth=2
3497; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
3498; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
3499; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
3500; GFX10-NEXT:    v_readfirstlane_b32 s11, v3
3501; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
3502; GFX10-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
3503; GFX10-NEXT:    s_and_b32 s4, vcc_lo, s4
3504; GFX10-NEXT:    s_and_saveexec_b32 s4, s4
3505; GFX10-NEXT:    s_waitcnt vmcnt(0)
3506; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
3507; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
3508; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
3509; GFX10-NEXT:    s_cbranch_execnz .LBB12_4
3510; GFX10-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
3511; GFX10-NEXT:    s_mov_b32 exec_lo, s6
3512; GFX10-NEXT:    s_waitcnt vmcnt(0)
3513; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
3514; GFX10-NEXT:    v_mov_b32_e32 v6, v4
3515; GFX10-NEXT:    buffer_gl1_inv
3516; GFX10-NEXT:    buffer_gl0_inv
3517; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
3518; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
3519; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
3520; GFX10-NEXT:    s_cbranch_execnz .LBB12_3
3521; GFX10-NEXT:  ; %bb.6: ; %atomicrmw.end
3522; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
3523; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
3524; GFX10-NEXT:    s_setpc_b64 s[30:31]
3525;
3526; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
3527; GFX90A:       ; %bb.0:
3528; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3529; GFX90A-NEXT:    v_add_u32_e32 v4, 0x200, v4
3530; GFX90A-NEXT:    v_and_b32_e32 v9, -4, v4
3531; GFX90A-NEXT:    v_and_b32_e32 v4, 3, v4
3532; GFX90A-NEXT:    v_lshlrev_b32_e32 v8, 3, v4
3533; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
3534; GFX90A-NEXT:    v_lshlrev_b32_e64 v4, v8, s4
3535; GFX90A-NEXT:    v_not_b32_e32 v10, v4
3536; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
3537; GFX90A-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
3538; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
3539; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
3540; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
3541; GFX90A-NEXT:    v_readfirstlane_b32 s11, v3
3542; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
3543; GFX90A-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
3544; GFX90A-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
3545; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
3546; GFX90A-NEXT:    s_nop 0
3547; GFX90A-NEXT:    buffer_load_dword v7, v9, s[8:11], 0 offen
3548; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
3549; GFX90A-NEXT:    s_cbranch_execnz .LBB12_1
3550; GFX90A-NEXT:  ; %bb.2:
3551; GFX90A-NEXT:    s_mov_b64 exec, s[6:7]
3552; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
3553; GFX90A-NEXT:    v_max_f16_e32 v11, v5, v5
3554; GFX90A-NEXT:  .LBB12_3: ; %atomicrmw.start
3555; GFX90A-NEXT:    ; =>This Loop Header: Depth=1
3556; GFX90A-NEXT:    ; Child Loop BB12_4 Depth 2
3557; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3558; GFX90A-NEXT:    v_lshrrev_b32_e32 v4, v8, v7
3559; GFX90A-NEXT:    v_max_f16_e32 v4, v4, v4
3560; GFX90A-NEXT:    v_min_f16_e32 v4, v4, v11
3561; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, v8, v4
3562; GFX90A-NEXT:    v_and_or_b32 v6, v7, v10, v4
3563; GFX90A-NEXT:    s_mov_b64 s[12:13], exec
3564; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
3565; GFX90A-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
3566; GFX90A-NEXT:    ; => This Inner Loop Header: Depth=2
3567; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
3568; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
3569; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
3570; GFX90A-NEXT:    v_readfirstlane_b32 s11, v3
3571; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
3572; GFX90A-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
3573; GFX90A-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
3574; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
3575; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3576; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc
3577; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
3578; GFX90A-NEXT:    s_cbranch_execnz .LBB12_4
3579; GFX90A-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
3580; GFX90A-NEXT:    s_mov_b64 exec, s[12:13]
3581; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3582; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
3583; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
3584; GFX90A-NEXT:    v_mov_b32_e32 v7, v4
3585; GFX90A-NEXT:    buffer_wbinvl1
3586; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
3587; GFX90A-NEXT:    s_cbranch_execnz .LBB12_3
3588; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.end
3589; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
3590; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v8, v4
3591; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3592;
3593; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
3594; GFX908:       ; %bb.0:
3595; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3596; GFX908-NEXT:    v_add_u32_e32 v4, 0x200, v4
3597; GFX908-NEXT:    v_and_b32_e32 v8, -4, v4
3598; GFX908-NEXT:    v_and_b32_e32 v4, 3, v4
3599; GFX908-NEXT:    v_lshlrev_b32_e32 v7, 3, v4
3600; GFX908-NEXT:    s_mov_b32 s4, 0xffff
3601; GFX908-NEXT:    v_lshlrev_b32_e64 v4, v7, s4
3602; GFX908-NEXT:    v_not_b32_e32 v9, v4
3603; GFX908-NEXT:    s_mov_b64 s[6:7], exec
3604; GFX908-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
3605; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
3606; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
3607; GFX908-NEXT:    v_readfirstlane_b32 s10, v2
3608; GFX908-NEXT:    v_readfirstlane_b32 s11, v3
3609; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
3610; GFX908-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
3611; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
3612; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
3613; GFX908-NEXT:    s_nop 0
3614; GFX908-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
3615; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
3616; GFX908-NEXT:    s_cbranch_execnz .LBB12_1
3617; GFX908-NEXT:  ; %bb.2:
3618; GFX908-NEXT:    s_mov_b64 exec, s[6:7]
3619; GFX908-NEXT:    s_mov_b64 s[6:7], 0
3620; GFX908-NEXT:    v_max_f16_e32 v10, v5, v5
3621; GFX908-NEXT:  .LBB12_3: ; %atomicrmw.start
3622; GFX908-NEXT:    ; =>This Loop Header: Depth=1
3623; GFX908-NEXT:    ; Child Loop BB12_4 Depth 2
3624; GFX908-NEXT:    s_waitcnt vmcnt(0)
3625; GFX908-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
3626; GFX908-NEXT:    v_max_f16_e32 v4, v4, v4
3627; GFX908-NEXT:    v_min_f16_e32 v4, v4, v10
3628; GFX908-NEXT:    v_lshlrev_b32_e32 v4, v7, v4
3629; GFX908-NEXT:    v_and_or_b32 v5, v6, v9, v4
3630; GFX908-NEXT:    v_mov_b32_e32 v4, v5
3631; GFX908-NEXT:    s_mov_b64 s[12:13], exec
3632; GFX908-NEXT:    v_mov_b32_e32 v5, v6
3633; GFX908-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
3634; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
3635; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
3636; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
3637; GFX908-NEXT:    v_readfirstlane_b32 s10, v2
3638; GFX908-NEXT:    v_readfirstlane_b32 s11, v3
3639; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
3640; GFX908-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
3641; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
3642; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
3643; GFX908-NEXT:    s_waitcnt vmcnt(0)
3644; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
3645; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
3646; GFX908-NEXT:    s_cbranch_execnz .LBB12_4
3647; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
3648; GFX908-NEXT:    s_mov_b64 exec, s[12:13]
3649; GFX908-NEXT:    s_waitcnt vmcnt(0)
3650; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
3651; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
3652; GFX908-NEXT:    v_mov_b32_e32 v6, v4
3653; GFX908-NEXT:    buffer_wbinvl1
3654; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
3655; GFX908-NEXT:    s_cbranch_execnz .LBB12_3
3656; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.end
3657; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
3658; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
3659; GFX908-NEXT:    s_setpc_b64 s[30:31]
3660;
3661; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
3662; GFX8:       ; %bb.0:
3663; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3664; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x200, v4
3665; GFX8-NEXT:    v_and_b32_e32 v8, -4, v4
3666; GFX8-NEXT:    v_and_b32_e32 v4, 3, v4
3667; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 3, v4
3668; GFX8-NEXT:    s_mov_b32 s4, 0xffff
3669; GFX8-NEXT:    v_lshlrev_b32_e64 v4, v7, s4
3670; GFX8-NEXT:    v_not_b32_e32 v9, v4
3671; GFX8-NEXT:    s_mov_b64 s[6:7], exec
3672; GFX8-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
3673; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
3674; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
3675; GFX8-NEXT:    v_readfirstlane_b32 s10, v2
3676; GFX8-NEXT:    v_readfirstlane_b32 s11, v3
3677; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
3678; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
3679; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
3680; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
3681; GFX8-NEXT:    s_nop 0
3682; GFX8-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
3683; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
3684; GFX8-NEXT:    s_cbranch_execnz .LBB12_1
3685; GFX8-NEXT:  ; %bb.2:
3686; GFX8-NEXT:    s_mov_b64 exec, s[6:7]
3687; GFX8-NEXT:    s_mov_b64 s[6:7], 0
3688; GFX8-NEXT:    v_max_f16_e32 v10, v5, v5
3689; GFX8-NEXT:  .LBB12_3: ; %atomicrmw.start
3690; GFX8-NEXT:    ; =>This Loop Header: Depth=1
3691; GFX8-NEXT:    ; Child Loop BB12_4 Depth 2
3692; GFX8-NEXT:    s_waitcnt vmcnt(0)
3693; GFX8-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
3694; GFX8-NEXT:    v_max_f16_e32 v4, v4, v4
3695; GFX8-NEXT:    v_min_f16_e32 v4, v4, v10
3696; GFX8-NEXT:    v_lshlrev_b32_e32 v4, v7, v4
3697; GFX8-NEXT:    v_and_b32_e32 v5, v6, v9
3698; GFX8-NEXT:    v_or_b32_e32 v5, v5, v4
3699; GFX8-NEXT:    v_mov_b32_e32 v4, v5
3700; GFX8-NEXT:    s_mov_b64 s[12:13], exec
3701; GFX8-NEXT:    v_mov_b32_e32 v5, v6
3702; GFX8-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
3703; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
3704; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
3705; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
3706; GFX8-NEXT:    v_readfirstlane_b32 s10, v2
3707; GFX8-NEXT:    v_readfirstlane_b32 s11, v3
3708; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
3709; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
3710; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
3711; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
3712; GFX8-NEXT:    s_waitcnt vmcnt(0)
3713; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
3714; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
3715; GFX8-NEXT:    s_cbranch_execnz .LBB12_4
3716; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
3717; GFX8-NEXT:    s_mov_b64 exec, s[12:13]
3718; GFX8-NEXT:    s_waitcnt vmcnt(0)
3719; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
3720; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
3721; GFX8-NEXT:    v_mov_b32_e32 v6, v4
3722; GFX8-NEXT:    buffer_wbinvl1
3723; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
3724; GFX8-NEXT:    s_cbranch_execnz .LBB12_3
3725; GFX8-NEXT:  ; %bb.6: ; %atomicrmw.end
3726; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
3727; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
3728; GFX8-NEXT:    s_setpc_b64 s[30:31]
3729;
3730; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
3731; GFX7:       ; %bb.0:
3732; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3733; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x200, v4
3734; GFX7-NEXT:    v_and_b32_e32 v8, -4, v4
3735; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
3736; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 3, v4
3737; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v7
3738; GFX7-NEXT:    v_not_b32_e32 v9, v4
3739; GFX7-NEXT:    s_mov_b64 s[6:7], exec
3740; GFX7-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
3741; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
3742; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
3743; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
3744; GFX7-NEXT:    v_readfirstlane_b32 s11, v3
3745; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
3746; GFX7-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
3747; GFX7-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
3748; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
3749; GFX7-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
3750; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
3751; GFX7-NEXT:    s_cbranch_execnz .LBB12_1
3752; GFX7-NEXT:  ; %bb.2:
3753; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
3754; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v5
3755; GFX7-NEXT:    s_mov_b64 s[6:7], 0
3756; GFX7-NEXT:    v_cvt_f32_f16_e32 v10, v4
3757; GFX7-NEXT:  .LBB12_3: ; %atomicrmw.start
3758; GFX7-NEXT:    ; =>This Loop Header: Depth=1
3759; GFX7-NEXT:    ; Child Loop BB12_4 Depth 2
3760; GFX7-NEXT:    s_waitcnt vmcnt(0)
3761; GFX7-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
3762; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
3763; GFX7-NEXT:    v_and_b32_e32 v5, v6, v9
3764; GFX7-NEXT:    s_mov_b64 s[12:13], exec
3765; GFX7-NEXT:    v_min_f32_e32 v4, v4, v10
3766; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
3767; GFX7-NEXT:    v_lshlrev_b32_e32 v4, v7, v4
3768; GFX7-NEXT:    v_or_b32_e32 v5, v5, v4
3769; GFX7-NEXT:    v_mov_b32_e32 v4, v5
3770; GFX7-NEXT:    v_mov_b32_e32 v5, v6
3771; GFX7-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
3772; GFX7-NEXT:    ; => This Inner Loop Header: Depth=2
3773; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
3774; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
3775; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
3776; GFX7-NEXT:    v_readfirstlane_b32 s11, v3
3777; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
3778; GFX7-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
3779; GFX7-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
3780; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
3781; GFX7-NEXT:    s_waitcnt vmcnt(0)
3782; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
3783; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
3784; GFX7-NEXT:    s_cbranch_execnz .LBB12_4
3785; GFX7-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
3786; GFX7-NEXT:    s_mov_b64 exec, s[12:13]
3787; GFX7-NEXT:    s_waitcnt vmcnt(0)
3788; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
3789; GFX7-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
3790; GFX7-NEXT:    v_mov_b32_e32 v6, v4
3791; GFX7-NEXT:    buffer_wbinvl1
3792; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
3793; GFX7-NEXT:    s_cbranch_execnz .LBB12_3
3794; GFX7-NEXT:  ; %bb.6: ; %atomicrmw.end
3795; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
3796; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
3797; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
3798; GFX7-NEXT:    s_setpc_b64 s[30:31]
3799;
3800; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
3801; GFX6:       ; %bb.0:
3802; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3803; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x200, v4
3804; GFX6-NEXT:    v_and_b32_e32 v8, -4, v4
3805; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
3806; GFX6-NEXT:    v_lshlrev_b32_e32 v7, 3, v4
3807; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v7
3808; GFX6-NEXT:    v_not_b32_e32 v9, v4
3809; GFX6-NEXT:    s_mov_b64 s[6:7], exec
3810; GFX6-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
3811; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
3812; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
3813; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
3814; GFX6-NEXT:    v_readfirstlane_b32 s11, v3
3815; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
3816; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
3817; GFX6-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
3818; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
3819; GFX6-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
3820; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
3821; GFX6-NEXT:    s_cbranch_execnz .LBB12_1
3822; GFX6-NEXT:  ; %bb.2:
3823; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
3824; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v5
3825; GFX6-NEXT:    s_mov_b64 s[6:7], 0
3826; GFX6-NEXT:    v_cvt_f32_f16_e32 v10, v4
3827; GFX6-NEXT:  .LBB12_3: ; %atomicrmw.start
3828; GFX6-NEXT:    ; =>This Loop Header: Depth=1
3829; GFX6-NEXT:    ; Child Loop BB12_4 Depth 2
3830; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
3831; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
3832; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
3833; GFX6-NEXT:    v_and_b32_e32 v5, v6, v9
3834; GFX6-NEXT:    s_mov_b64 s[12:13], exec
3835; GFX6-NEXT:    v_min_f32_e32 v4, v4, v10
3836; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
3837; GFX6-NEXT:    v_lshlrev_b32_e32 v4, v7, v4
3838; GFX6-NEXT:    v_or_b32_e32 v5, v5, v4
3839; GFX6-NEXT:    v_mov_b32_e32 v4, v5
3840; GFX6-NEXT:    v_mov_b32_e32 v5, v6
3841; GFX6-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
3842; GFX6-NEXT:    ; => This Inner Loop Header: Depth=2
3843; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
3844; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
3845; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
3846; GFX6-NEXT:    v_readfirstlane_b32 s11, v3
3847; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
3848; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
3849; GFX6-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
3850; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
3851; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
3852; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
3853; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
3854; GFX6-NEXT:    s_cbranch_execnz .LBB12_4
3855; GFX6-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
3856; GFX6-NEXT:    s_mov_b64 exec, s[12:13]
3857; GFX6-NEXT:    s_waitcnt vmcnt(0)
3858; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
3859; GFX6-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
3860; GFX6-NEXT:    v_mov_b32_e32 v6, v4
3861; GFX6-NEXT:    buffer_wbinvl1
3862; GFX6-NEXT:    s_andn2_b64 exec, exec, s[6:7]
3863; GFX6-NEXT:    s_cbranch_execnz .LBB12_3
3864; GFX6-NEXT:  ; %bb.6: ; %atomicrmw.end
3865; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
3866; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
3867; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
3868; GFX6-NEXT:    s_waitcnt expcnt(0)
3869; GFX6-NEXT:    s_setpc_b64 s[30:31]
; 256 half elements at 2 bytes each = 512 bytes, i.e. the 0x200 that the
; expected output above adds to the offset register.
3870  %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256
; The loaded (old) value is returned, so the expected output keeps the cmpswap
; result and shifts the 16-bit field back down before s_setpc_b64.
3871  %result = atomicrmw fmin ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
3872  ret half %result
3873}
3874
3875; --------------------------------------------------------------------
3876; bfloat
3877; --------------------------------------------------------------------
3878
; Value-returning bfloat fmin on a buffer fat pointer (ptr addrspace(7)) at
; agent scope, seq_cst, tagged with !amdgpu.no.fine.grained.memory, applied
; 256 elements past the incoming pointer.
;
; The assertion lines inside this function are autogenerated (see the first
; line of the file); regenerate them with utils/update_llc_test_checks.py
; rather than editing them by hand.
;
; What the expected output shows for every RUN target - the operation is
; expanded into a 32-bit compare-and-swap loop (.LBB13_1, %atomicrmw.start)
;   * 0x200 is added to the offset register; (offset & -4) addresses the
;     containing dword, ((offset & 3) << 3) is the bit shift of the 16-bit
;     lane, and the complement of (0xffff << shift) keeps the other bits.
;   * %val is widened once before the loop. Each iteration shifts the loaded
;     dword right, places the 16 bits in the upper half of a 32-bit register,
;     takes the f32 min, narrows back to 16 bits, merges the result with the
;     masked old dword and issues a returning buffer_atomic_cmpswap.
;   * The loop exits once the dword returned by the cmpswap equals the
;     expected one; that dword shifted right by the lane shift is the result.
; Narrowing differs by target. The tonga and newer runs add bit 16 plus
; 0x7fff and select the value OR'd with 0x400000 when v_cmp_u reports an
; unordered self-compare (presumably round-to-nearest-even with NaN quieting -
; TODO confirm against the bf16 lowering). The hawaii and tahiti runs simply
; shift right by 16.
3879define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 {
3880; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
3881; GFX12:       ; %bb.0:
3882; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
3883; GFX12-NEXT:    s_wait_expcnt 0x0
3884; GFX12-NEXT:    s_wait_samplecnt 0x0
3885; GFX12-NEXT:    s_wait_bvhcnt 0x0
3886; GFX12-NEXT:    s_wait_kmcnt 0x0
3887; GFX12-NEXT:    s_addk_co_i32 s16, 0x200
3888; GFX12-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
3889; GFX12-NEXT:    s_wait_alu 0xfffe
3890; GFX12-NEXT:    s_and_b32 s4, s16, -4
3891; GFX12-NEXT:    s_wait_alu 0xfffe
3892; GFX12-NEXT:    v_mov_b32_e32 v4, s4
3893; GFX12-NEXT:    s_and_b32 s4, s16, 3
3894; GFX12-NEXT:    s_wait_alu 0xfffe
3895; GFX12-NEXT:    s_lshl_b32 s4, s4, 3
3896; GFX12-NEXT:    s_wait_alu 0xfffe
3897; GFX12-NEXT:    s_lshl_b32 s5, 0xffff, s4
3898; GFX12-NEXT:    buffer_load_b32 v1, v4, s[0:3], null offen
3899; GFX12-NEXT:    s_wait_alu 0xfffe
3900; GFX12-NEXT:    s_not_b32 s6, s5
3901; GFX12-NEXT:    s_mov_b32 s5, 0
3902; GFX12-NEXT:  .LBB13_1: ; %atomicrmw.start
3903; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
3904; GFX12-NEXT:    s_wait_loadcnt 0x0
3905; GFX12-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
3906; GFX12-NEXT:    s_wait_storecnt 0x0
3907; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3908; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
3909; GFX12-NEXT:    v_min_num_f32_e32 v0, v0, v5
3910; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
3911; GFX12-NEXT:    v_bfe_u32 v2, v0, 16, 1
3912; GFX12-NEXT:    v_or_b32_e32 v3, 0x400000, v0
3913; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
3914; GFX12-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
3915; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3916; GFX12-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
3917; GFX12-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
3918; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
3919; GFX12-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
3920; GFX12-NEXT:    s_wait_alu 0xfffe
3921; GFX12-NEXT:    v_and_or_b32 v0, v1, s6, v0
3922; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3923; GFX12-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
3924; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
3925; GFX12-NEXT:    s_wait_loadcnt 0x0
3926; GFX12-NEXT:    global_inv scope:SCOPE_DEV
3927; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
3928; GFX12-NEXT:    v_mov_b32_e32 v1, v2
3929; GFX12-NEXT:    s_or_b32 s5, vcc_lo, s5
3930; GFX12-NEXT:    s_wait_alu 0xfffe
3931; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
3932; GFX12-NEXT:    s_cbranch_execnz .LBB13_1
3933; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
3934; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s5
3935; GFX12-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
3936; GFX12-NEXT:    s_wait_alu 0xfffe
3937; GFX12-NEXT:    s_setpc_b64 s[30:31]
3938;
3939; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
3940; GFX940:       ; %bb.0:
3941; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3942; GFX940-NEXT:    s_addk_i32 s16, 0x200
3943; GFX940-NEXT:    s_and_b32 s4, s16, -4
3944; GFX940-NEXT:    v_mov_b32_e32 v4, s4
3945; GFX940-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen
3946; GFX940-NEXT:    s_and_b32 s4, s16, 3
3947; GFX940-NEXT:    s_lshl_b32 s6, s4, 3
3948; GFX940-NEXT:    s_lshl_b32 s4, 0xffff, s6
3949; GFX940-NEXT:    s_not_b32 s7, s4
3950; GFX940-NEXT:    s_mov_b64 s[4:5], 0
3951; GFX940-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
3952; GFX940-NEXT:    s_movk_i32 s8, 0x7fff
3953; GFX940-NEXT:  .LBB13_1: ; %atomicrmw.start
3954; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
3955; GFX940-NEXT:    s_waitcnt vmcnt(0)
3956; GFX940-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3957; GFX940-NEXT:    buffer_wbl2 sc1
3958; GFX940-NEXT:    v_min_f32_e32 v0, v0, v5
3959; GFX940-NEXT:    v_bfe_u32 v2, v0, 16, 1
3960; GFX940-NEXT:    v_or_b32_e32 v3, 0x400000, v0
3961; GFX940-NEXT:    v_add3_u32 v2, v2, v0, s8
3962; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
3963; GFX940-NEXT:    s_nop 1
3964; GFX940-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
3965; GFX940-NEXT:    v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3966; GFX940-NEXT:    v_and_or_b32 v0, v1, s7, v0
3967; GFX940-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
3968; GFX940-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0
3969; GFX940-NEXT:    s_waitcnt vmcnt(0)
3970; GFX940-NEXT:    buffer_inv sc1
3971; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
3972; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3973; GFX940-NEXT:    v_mov_b32_e32 v1, v2
3974; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3975; GFX940-NEXT:    s_cbranch_execnz .LBB13_1
3976; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
3977; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
3978; GFX940-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
3979; GFX940-NEXT:    s_setpc_b64 s[30:31]
3980;
3981; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
3982; GFX11:       ; %bb.0:
3983; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3984; GFX11-NEXT:    s_addk_i32 s16, 0x200
3985; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
3986; GFX11-NEXT:    s_and_b32 s4, s16, -4
3987; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
3988; GFX11-NEXT:    v_mov_b32_e32 v4, s4
3989; GFX11-NEXT:    s_and_b32 s4, s16, 3
3990; GFX11-NEXT:    s_lshl_b32 s4, s4, 3
3991; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
3992; GFX11-NEXT:    s_lshl_b32 s5, 0xffff, s4
3993; GFX11-NEXT:    buffer_load_b32 v1, v4, s[0:3], 0 offen
3994; GFX11-NEXT:    s_not_b32 s6, s5
3995; GFX11-NEXT:    s_mov_b32 s5, 0
3996; GFX11-NEXT:    .p2align 6
3997; GFX11-NEXT:  .LBB13_1: ; %atomicrmw.start
3998; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
3999; GFX11-NEXT:    s_waitcnt vmcnt(0)
4000; GFX11-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
4001; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
4002; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4003; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
4004; GFX11-NEXT:    v_min_f32_e32 v0, v0, v5
4005; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
4006; GFX11-NEXT:    v_bfe_u32 v2, v0, 16, 1
4007; GFX11-NEXT:    v_or_b32_e32 v3, 0x400000, v0
4008; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
4009; GFX11-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
4010; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4011; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
4012; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
4013; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4014; GFX11-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
4015; GFX11-NEXT:    v_and_or_b32 v0, v1, s6, v0
4016; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4017; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
4018; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
4019; GFX11-NEXT:    s_waitcnt vmcnt(0)
4020; GFX11-NEXT:    buffer_gl1_inv
4021; GFX11-NEXT:    buffer_gl0_inv
4022; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
4023; GFX11-NEXT:    v_mov_b32_e32 v1, v2
4024; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
4025; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
4026; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
4027; GFX11-NEXT:    s_cbranch_execnz .LBB13_1
4028; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
4029; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s5
4030; GFX11-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
4031; GFX11-NEXT:    s_setpc_b64 s[30:31]
4032;
4033; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
4034; GFX10:       ; %bb.0:
4035; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4036; GFX10-NEXT:    s_addk_i32 s20, 0x200
4037; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
4038; GFX10-NEXT:    s_and_b32 s4, s20, -4
4039; GFX10-NEXT:    v_mov_b32_e32 v4, s4
4040; GFX10-NEXT:    s_and_b32 s4, s20, 3
4041; GFX10-NEXT:    s_lshl_b32 s4, s4, 3
4042; GFX10-NEXT:    s_lshl_b32 s5, 0xffff, s4
4043; GFX10-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
4044; GFX10-NEXT:    s_not_b32 s6, s5
4045; GFX10-NEXT:    s_mov_b32 s5, 0
4046; GFX10-NEXT:  .LBB13_1: ; %atomicrmw.start
4047; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
4048; GFX10-NEXT:    s_waitcnt vmcnt(0)
4049; GFX10-NEXT:    v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
4050; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4051; GFX10-NEXT:    v_min_f32_e32 v0, v0, v5
4052; GFX10-NEXT:    v_bfe_u32 v2, v0, 16, 1
4053; GFX10-NEXT:    v_or_b32_e32 v3, 0x400000, v0
4054; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
4055; GFX10-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
4056; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
4057; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
4058; GFX10-NEXT:    v_and_or_b32 v0, v1, s6, v0
4059; GFX10-NEXT:    v_mov_b32_e32 v3, v1
4060; GFX10-NEXT:    v_mov_b32_e32 v2, v0
4061; GFX10-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
4062; GFX10-NEXT:    s_waitcnt vmcnt(0)
4063; GFX10-NEXT:    buffer_gl1_inv
4064; GFX10-NEXT:    buffer_gl0_inv
4065; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
4066; GFX10-NEXT:    v_mov_b32_e32 v1, v2
4067; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
4068; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
4069; GFX10-NEXT:    s_cbranch_execnz .LBB13_1
4070; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
4071; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
4072; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
4073; GFX10-NEXT:    s_setpc_b64 s[30:31]
4074;
4075; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
4076; GFX90A:       ; %bb.0:
4077; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4078; GFX90A-NEXT:    s_addk_i32 s20, 0x200
4079; GFX90A-NEXT:    s_and_b32 s4, s20, -4
4080; GFX90A-NEXT:    v_mov_b32_e32 v4, s4
4081; GFX90A-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
4082; GFX90A-NEXT:    s_and_b32 s4, s20, 3
4083; GFX90A-NEXT:    s_lshl_b32 s6, s4, 3
4084; GFX90A-NEXT:    s_lshl_b32 s4, 0xffff, s6
4085; GFX90A-NEXT:    s_not_b32 s7, s4
4086; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
4087; GFX90A-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
4088; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
4089; GFX90A-NEXT:  .LBB13_1: ; %atomicrmw.start
4090; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
4091; GFX90A-NEXT:    s_waitcnt vmcnt(0)
4092; GFX90A-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
4093; GFX90A-NEXT:    v_min_f32_e32 v0, v0, v5
4094; GFX90A-NEXT:    v_bfe_u32 v2, v0, 16, 1
4095; GFX90A-NEXT:    v_or_b32_e32 v3, 0x400000, v0
4096; GFX90A-NEXT:    v_add3_u32 v2, v2, v0, s8
4097; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
4098; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
4099; GFX90A-NEXT:    v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
4100; GFX90A-NEXT:    v_and_or_b32 v0, v1, s7, v0
4101; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
4102; GFX90A-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
4103; GFX90A-NEXT:    s_waitcnt vmcnt(0)
4104; GFX90A-NEXT:    buffer_wbinvl1
4105; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
4106; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4107; GFX90A-NEXT:    v_mov_b32_e32 v1, v2
4108; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4109; GFX90A-NEXT:    s_cbranch_execnz .LBB13_1
4110; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
4111; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
4112; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
4113; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4114;
4115; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
4116; GFX908:       ; %bb.0:
4117; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4118; GFX908-NEXT:    s_addk_i32 s20, 0x200
4119; GFX908-NEXT:    s_and_b32 s4, s20, -4
4120; GFX908-NEXT:    v_mov_b32_e32 v4, s4
4121; GFX908-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
4122; GFX908-NEXT:    s_and_b32 s4, s20, 3
4123; GFX908-NEXT:    s_lshl_b32 s6, s4, 3
4124; GFX908-NEXT:    s_lshl_b32 s4, 0xffff, s6
4125; GFX908-NEXT:    s_not_b32 s7, s4
4126; GFX908-NEXT:    s_mov_b64 s[4:5], 0
4127; GFX908-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
4128; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
4129; GFX908-NEXT:  .LBB13_1: ; %atomicrmw.start
4130; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
4131; GFX908-NEXT:    s_waitcnt vmcnt(0)
4132; GFX908-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
4133; GFX908-NEXT:    v_min_f32_e32 v0, v0, v5
4134; GFX908-NEXT:    v_bfe_u32 v2, v0, 16, 1
4135; GFX908-NEXT:    v_or_b32_e32 v3, 0x400000, v0
4136; GFX908-NEXT:    v_add3_u32 v2, v2, v0, s8
4137; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
4138; GFX908-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
4139; GFX908-NEXT:    v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
4140; GFX908-NEXT:    v_and_or_b32 v0, v1, s7, v0
4141; GFX908-NEXT:    v_mov_b32_e32 v3, v1
4142; GFX908-NEXT:    v_mov_b32_e32 v2, v0
4143; GFX908-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
4144; GFX908-NEXT:    s_waitcnt vmcnt(0)
4145; GFX908-NEXT:    buffer_wbinvl1
4146; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
4147; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4148; GFX908-NEXT:    v_mov_b32_e32 v1, v2
4149; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4150; GFX908-NEXT:    s_cbranch_execnz .LBB13_1
4151; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
4152; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
4153; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
4154; GFX908-NEXT:    s_setpc_b64 s[30:31]
4155;
4156; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
4157; GFX8:       ; %bb.0:
4158; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4159; GFX8-NEXT:    s_addk_i32 s20, 0x200
4160; GFX8-NEXT:    s_and_b32 s4, s20, -4
4161; GFX8-NEXT:    v_mov_b32_e32 v4, s4
4162; GFX8-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
4163; GFX8-NEXT:    s_and_b32 s4, s20, 3
4164; GFX8-NEXT:    s_lshl_b32 s6, s4, 3
4165; GFX8-NEXT:    s_lshl_b32 s4, 0xffff, s6
4166; GFX8-NEXT:    s_not_b32 s7, s4
4167; GFX8-NEXT:    s_mov_b64 s[4:5], 0
4168; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
4169; GFX8-NEXT:  .LBB13_1: ; %atomicrmw.start
4170; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
4171; GFX8-NEXT:    v_mov_b32_e32 v0, s6
4172; GFX8-NEXT:    s_waitcnt vmcnt(0)
4173; GFX8-NEXT:    v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
4174; GFX8-NEXT:    v_min_f32_e32 v3, v3, v5
4175; GFX8-NEXT:    v_bfe_u32 v6, v3, 16, 1
4176; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v3
4177; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x7fff, v6
4178; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v3
4179; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
4180; GFX8-NEXT:    v_cndmask_b32_e32 v3, v6, v7, vcc
4181; GFX8-NEXT:    v_and_b32_e32 v2, s7, v1
4182; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
4183; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
4184; GFX8-NEXT:    v_mov_b32_e32 v3, v1
4185; GFX8-NEXT:    v_mov_b32_e32 v2, v0
4186; GFX8-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
4187; GFX8-NEXT:    s_waitcnt vmcnt(0)
4188; GFX8-NEXT:    buffer_wbinvl1
4189; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
4190; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4191; GFX8-NEXT:    v_mov_b32_e32 v1, v2
4192; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4193; GFX8-NEXT:    s_cbranch_execnz .LBB13_1
4194; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
4195; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
4196; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
4197; GFX8-NEXT:    s_setpc_b64 s[30:31]
4198;
4199; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
4200; GFX7:       ; %bb.0:
4201; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4202; GFX7-NEXT:    s_addk_i32 s20, 0x200
4203; GFX7-NEXT:    s_and_b32 s4, s20, -4
4204; GFX7-NEXT:    v_mov_b32_e32 v4, s4
4205; GFX7-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
4206; GFX7-NEXT:    s_and_b32 s4, s20, 3
4207; GFX7-NEXT:    s_lshl_b32 s6, s4, 3
4208; GFX7-NEXT:    s_lshl_b32 s4, 0xffff, s6
4209; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
4210; GFX7-NEXT:    s_not_b32 s7, s4
4211; GFX7-NEXT:    s_mov_b64 s[4:5], 0
4212; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
4213; GFX7-NEXT:  .LBB13_1: ; %atomicrmw.start
4214; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
4215; GFX7-NEXT:    s_waitcnt vmcnt(0)
4216; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
4217; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
4218; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
4219; GFX7-NEXT:    v_min_f32_e32 v0, v0, v5
4220; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
4221; GFX7-NEXT:    v_and_b32_e32 v2, s7, v1
4222; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
4223; GFX7-NEXT:    v_or_b32_e32 v0, v2, v0
4224; GFX7-NEXT:    v_mov_b32_e32 v3, v1
4225; GFX7-NEXT:    v_mov_b32_e32 v2, v0
4226; GFX7-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
4227; GFX7-NEXT:    s_waitcnt vmcnt(0)
4228; GFX7-NEXT:    buffer_wbinvl1
4229; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
4230; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4231; GFX7-NEXT:    v_mov_b32_e32 v1, v2
4232; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4233; GFX7-NEXT:    s_cbranch_execnz .LBB13_1
4234; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
4235; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
4236; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
4237; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
4238; GFX7-NEXT:    s_setpc_b64 s[30:31]
4239;
4240; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
4241; GFX6:       ; %bb.0:
4242; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4243; GFX6-NEXT:    s_addk_i32 s20, 0x200
4244; GFX6-NEXT:    s_and_b32 s4, s20, -4
4245; GFX6-NEXT:    v_mov_b32_e32 v4, s4
4246; GFX6-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
4247; GFX6-NEXT:    s_and_b32 s4, s20, 3
4248; GFX6-NEXT:    s_lshl_b32 s6, s4, 3
4249; GFX6-NEXT:    s_lshl_b32 s4, 0xffff, s6
4250; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v0
4251; GFX6-NEXT:    s_not_b32 s7, s4
4252; GFX6-NEXT:    s_mov_b64 s[4:5], 0
4253; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
4254; GFX6-NEXT:  .LBB13_1: ; %atomicrmw.start
4255; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
4256; GFX6-NEXT:    s_waitcnt vmcnt(0)
4257; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
4258; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
4259; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v0
4260; GFX6-NEXT:    v_min_f32_e32 v0, v0, v5
4261; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
4262; GFX6-NEXT:    s_waitcnt expcnt(0)
4263; GFX6-NEXT:    v_and_b32_e32 v2, s7, v1
4264; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
4265; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
4266; GFX6-NEXT:    v_mov_b32_e32 v3, v1
4267; GFX6-NEXT:    v_mov_b32_e32 v2, v0
4268; GFX6-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
4269; GFX6-NEXT:    s_waitcnt vmcnt(0)
4270; GFX6-NEXT:    buffer_wbinvl1
4271; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
4272; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4273; GFX6-NEXT:    v_mov_b32_e32 v1, v2
4274; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4275; GFX6-NEXT:    s_cbranch_execnz .LBB13_1
4276; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
4277; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
4278; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
4279; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
4280; GFX6-NEXT:    s_waitcnt expcnt(0)
4281; GFX6-NEXT:    s_setpc_b64 s[30:31]
; Step 256 bfloat elements past %ptr; the expected output above shows this as
; 0x200 added to the offset register.
4282  %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256
; The value yielded by the atomicrmw is what the function returns.
4283  %result = atomicrmw fmin ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
4284  ret bfloat %result
4285}
4286
4287define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 {
4288; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
4289; GFX12:       ; %bb.0:
4290; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
4291; GFX12-NEXT:    s_wait_expcnt 0x0
4292; GFX12-NEXT:    s_wait_samplecnt 0x0
4293; GFX12-NEXT:    s_wait_bvhcnt 0x0
4294; GFX12-NEXT:    s_wait_kmcnt 0x0
4295; GFX12-NEXT:    s_addk_co_i32 s16, 0x200
4296; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
4297; GFX12-NEXT:    s_wait_alu 0xfffe
4298; GFX12-NEXT:    s_and_b32 s4, s16, -4
4299; GFX12-NEXT:    s_wait_alu 0xfffe
4300; GFX12-NEXT:    v_mov_b32_e32 v2, s4
4301; GFX12-NEXT:    s_and_b32 s4, s16, 3
4302; GFX12-NEXT:    s_wait_alu 0xfffe
4303; GFX12-NEXT:    s_lshl_b32 s4, s4, 3
4304; GFX12-NEXT:    s_wait_alu 0xfffe
4305; GFX12-NEXT:    s_lshl_b32 s5, 0xffff, s4
4306; GFX12-NEXT:    buffer_load_b32 v1, v2, s[0:3], null offen
4307; GFX12-NEXT:    s_wait_alu 0xfffe
4308; GFX12-NEXT:    s_not_b32 s6, s5
4309; GFX12-NEXT:    s_mov_b32 s5, 0
4310; GFX12-NEXT:  .LBB14_1: ; %atomicrmw.start
4311; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
4312; GFX12-NEXT:    s_wait_loadcnt 0x0
4313; GFX12-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
4314; GFX12-NEXT:    s_wait_storecnt 0x0
4315; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4316; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
4317; GFX12-NEXT:    v_min_num_f32_e32 v0, v0, v3
4318; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
4319; GFX12-NEXT:    v_bfe_u32 v4, v0, 16, 1
4320; GFX12-NEXT:    v_or_b32_e32 v5, 0x400000, v0
4321; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
4322; GFX12-NEXT:    v_add3_u32 v4, v4, v0, 0x7fff
4323; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4324; GFX12-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc_lo
4325; GFX12-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
4326; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
4327; GFX12-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
4328; GFX12-NEXT:    s_wait_alu 0xfffe
4329; GFX12-NEXT:    v_and_or_b32 v0, v1, s6, v0
4330; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4331; GFX12-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
4332; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
4333; GFX12-NEXT:    s_wait_loadcnt 0x0
4334; GFX12-NEXT:    global_inv scope:SCOPE_DEV
4335; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
4336; GFX12-NEXT:    v_mov_b32_e32 v1, v4
4337; GFX12-NEXT:    s_or_b32 s5, vcc_lo, s5
4338; GFX12-NEXT:    s_wait_alu 0xfffe
4339; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
4340; GFX12-NEXT:    s_cbranch_execnz .LBB14_1
4341; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
4342; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s5
4343; GFX12-NEXT:    s_wait_alu 0xfffe
4344; GFX12-NEXT:    s_setpc_b64 s[30:31]
4345;
4346; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
4347; GFX940:       ; %bb.0:
4348; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4349; GFX940-NEXT:    s_addk_i32 s16, 0x200
4350; GFX940-NEXT:    s_and_b32 s4, s16, -4
4351; GFX940-NEXT:    v_mov_b32_e32 v2, s4
4352; GFX940-NEXT:    buffer_load_dword v1, v2, s[0:3], 0 offen
4353; GFX940-NEXT:    s_and_b32 s4, s16, 3
4354; GFX940-NEXT:    s_lshl_b32 s6, s4, 3
4355; GFX940-NEXT:    s_lshl_b32 s4, 0xffff, s6
4356; GFX940-NEXT:    s_not_b32 s7, s4
4357; GFX940-NEXT:    s_mov_b64 s[4:5], 0
4358; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
4359; GFX940-NEXT:    s_movk_i32 s8, 0x7fff
4360; GFX940-NEXT:  .LBB14_1: ; %atomicrmw.start
4361; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
4362; GFX940-NEXT:    s_waitcnt vmcnt(0)
4363; GFX940-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
4364; GFX940-NEXT:    buffer_wbl2 sc1
4365; GFX940-NEXT:    v_min_f32_e32 v0, v0, v3
4366; GFX940-NEXT:    v_bfe_u32 v4, v0, 16, 1
4367; GFX940-NEXT:    v_or_b32_e32 v5, 0x400000, v0
4368; GFX940-NEXT:    v_add3_u32 v4, v4, v0, s8
4369; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
4370; GFX940-NEXT:    s_nop 1
4371; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
4372; GFX940-NEXT:    v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
4373; GFX940-NEXT:    v_and_or_b32 v0, v1, s7, v0
4374; GFX940-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
4375; GFX940-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0
4376; GFX940-NEXT:    s_waitcnt vmcnt(0)
4377; GFX940-NEXT:    buffer_inv sc1
4378; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
4379; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4380; GFX940-NEXT:    v_mov_b32_e32 v1, v4
4381; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4382; GFX940-NEXT:    s_cbranch_execnz .LBB14_1
4383; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
4384; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
4385; GFX940-NEXT:    s_setpc_b64 s[30:31]
4386;
4387; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
4388; GFX11:       ; %bb.0:
4389; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4390; GFX11-NEXT:    s_addk_i32 s16, 0x200
4391; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
4392; GFX11-NEXT:    s_and_b32 s4, s16, -4
4393; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
4394; GFX11-NEXT:    v_mov_b32_e32 v2, s4
4395; GFX11-NEXT:    s_and_b32 s4, s16, 3
4396; GFX11-NEXT:    s_lshl_b32 s4, s4, 3
4397; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
4398; GFX11-NEXT:    s_lshl_b32 s5, 0xffff, s4
4399; GFX11-NEXT:    buffer_load_b32 v1, v2, s[0:3], 0 offen
4400; GFX11-NEXT:    s_not_b32 s6, s5
4401; GFX11-NEXT:    s_mov_b32 s5, 0
4402; GFX11-NEXT:    .p2align 6
4403; GFX11-NEXT:  .LBB14_1: ; %atomicrmw.start
4404; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
4405; GFX11-NEXT:    s_waitcnt vmcnt(0)
4406; GFX11-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
4407; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
4408; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4409; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
4410; GFX11-NEXT:    v_min_f32_e32 v0, v0, v3
4411; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
4412; GFX11-NEXT:    v_bfe_u32 v4, v0, 16, 1
4413; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v0
4414; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
4415; GFX11-NEXT:    v_add3_u32 v4, v4, v0, 0x7fff
4416; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4417; GFX11-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc_lo
4418; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
4419; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4420; GFX11-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
4421; GFX11-NEXT:    v_and_or_b32 v0, v1, s6, v0
4422; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4423; GFX11-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
4424; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
4425; GFX11-NEXT:    s_waitcnt vmcnt(0)
4426; GFX11-NEXT:    buffer_gl1_inv
4427; GFX11-NEXT:    buffer_gl0_inv
4428; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
4429; GFX11-NEXT:    v_mov_b32_e32 v1, v4
4430; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
4431; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
4432; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
4433; GFX11-NEXT:    s_cbranch_execnz .LBB14_1
4434; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
4435; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s5
4436; GFX11-NEXT:    s_setpc_b64 s[30:31]
4437;
4438; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
4439; GFX10:       ; %bb.0:
4440; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4441; GFX10-NEXT:    s_addk_i32 s20, 0x200
4442; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
4443; GFX10-NEXT:    s_and_b32 s4, s20, -4
4444; GFX10-NEXT:    v_mov_b32_e32 v2, s4
4445; GFX10-NEXT:    s_and_b32 s4, s20, 3
4446; GFX10-NEXT:    s_lshl_b32 s4, s4, 3
4447; GFX10-NEXT:    s_lshl_b32 s5, 0xffff, s4
4448; GFX10-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
4449; GFX10-NEXT:    s_not_b32 s6, s5
4450; GFX10-NEXT:    s_mov_b32 s5, 0
4451; GFX10-NEXT:  .LBB14_1: ; %atomicrmw.start
4452; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
4453; GFX10-NEXT:    s_waitcnt vmcnt(0)
4454; GFX10-NEXT:    v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
4455; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4456; GFX10-NEXT:    v_min_f32_e32 v0, v0, v3
4457; GFX10-NEXT:    v_bfe_u32 v4, v0, 16, 1
4458; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v0
4459; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
4460; GFX10-NEXT:    v_add3_u32 v4, v4, v0, 0x7fff
4461; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc_lo
4462; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
4463; GFX10-NEXT:    v_and_or_b32 v0, v1, s6, v0
4464; GFX10-NEXT:    v_mov_b32_e32 v5, v1
4465; GFX10-NEXT:    v_mov_b32_e32 v4, v0
4466; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
4467; GFX10-NEXT:    s_waitcnt vmcnt(0)
4468; GFX10-NEXT:    buffer_gl1_inv
4469; GFX10-NEXT:    buffer_gl0_inv
4470; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
4471; GFX10-NEXT:    v_mov_b32_e32 v1, v4
4472; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
4473; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
4474; GFX10-NEXT:    s_cbranch_execnz .LBB14_1
4475; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
4476; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
4477; GFX10-NEXT:    s_setpc_b64 s[30:31]
4478;
4479; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
4480; GFX90A:       ; %bb.0:
4481; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4482; GFX90A-NEXT:    s_addk_i32 s20, 0x200
4483; GFX90A-NEXT:    s_and_b32 s4, s20, -4
4484; GFX90A-NEXT:    v_mov_b32_e32 v2, s4
4485; GFX90A-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
4486; GFX90A-NEXT:    s_and_b32 s4, s20, 3
4487; GFX90A-NEXT:    s_lshl_b32 s6, s4, 3
4488; GFX90A-NEXT:    s_lshl_b32 s4, 0xffff, s6
4489; GFX90A-NEXT:    s_not_b32 s7, s4
4490; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
4491; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
4492; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
4493; GFX90A-NEXT:  .LBB14_1: ; %atomicrmw.start
4494; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
4495; GFX90A-NEXT:    s_waitcnt vmcnt(0)
4496; GFX90A-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
4497; GFX90A-NEXT:    v_min_f32_e32 v0, v0, v3
4498; GFX90A-NEXT:    v_bfe_u32 v4, v0, 16, 1
4499; GFX90A-NEXT:    v_or_b32_e32 v5, 0x400000, v0
4500; GFX90A-NEXT:    v_add3_u32 v4, v4, v0, s8
4501; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
4502; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
4503; GFX90A-NEXT:    v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
4504; GFX90A-NEXT:    v_and_or_b32 v0, v1, s7, v0
4505; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
4506; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
4507; GFX90A-NEXT:    s_waitcnt vmcnt(0)
4508; GFX90A-NEXT:    buffer_wbinvl1
4509; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
4510; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4511; GFX90A-NEXT:    v_mov_b32_e32 v1, v4
4512; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4513; GFX90A-NEXT:    s_cbranch_execnz .LBB14_1
4514; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
4515; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
4516; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4517;
4518; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
4519; GFX908:       ; %bb.0:
4520; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4521; GFX908-NEXT:    s_addk_i32 s20, 0x200
4522; GFX908-NEXT:    s_and_b32 s4, s20, -4
4523; GFX908-NEXT:    v_mov_b32_e32 v2, s4
4524; GFX908-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
4525; GFX908-NEXT:    s_and_b32 s4, s20, 3
4526; GFX908-NEXT:    s_lshl_b32 s6, s4, 3
4527; GFX908-NEXT:    s_lshl_b32 s4, 0xffff, s6
4528; GFX908-NEXT:    s_not_b32 s7, s4
4529; GFX908-NEXT:    s_mov_b64 s[4:5], 0
4530; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
4531; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
4532; GFX908-NEXT:  .LBB14_1: ; %atomicrmw.start
4533; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
4534; GFX908-NEXT:    s_waitcnt vmcnt(0)
4535; GFX908-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
4536; GFX908-NEXT:    v_min_f32_e32 v0, v0, v3
4537; GFX908-NEXT:    v_bfe_u32 v4, v0, 16, 1
4538; GFX908-NEXT:    v_or_b32_e32 v5, 0x400000, v0
4539; GFX908-NEXT:    v_add3_u32 v4, v4, v0, s8
4540; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
4541; GFX908-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
4542; GFX908-NEXT:    v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
4543; GFX908-NEXT:    v_and_or_b32 v0, v1, s7, v0
4544; GFX908-NEXT:    v_mov_b32_e32 v5, v1
4545; GFX908-NEXT:    v_mov_b32_e32 v4, v0
4546; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
4547; GFX908-NEXT:    s_waitcnt vmcnt(0)
4548; GFX908-NEXT:    buffer_wbinvl1
4549; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
4550; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4551; GFX908-NEXT:    v_mov_b32_e32 v1, v4
4552; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4553; GFX908-NEXT:    s_cbranch_execnz .LBB14_1
4554; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
4555; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
4556; GFX908-NEXT:    s_setpc_b64 s[30:31]
4557;
4558; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
4559; GFX8:       ; %bb.0:
4560; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4561; GFX8-NEXT:    s_addk_i32 s20, 0x200
4562; GFX8-NEXT:    s_and_b32 s4, s20, -4
4563; GFX8-NEXT:    v_mov_b32_e32 v2, s4
4564; GFX8-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
4565; GFX8-NEXT:    s_and_b32 s4, s20, 3
4566; GFX8-NEXT:    s_lshl_b32 s6, s4, 3
4567; GFX8-NEXT:    s_lshl_b32 s4, 0xffff, s6
4568; GFX8-NEXT:    s_not_b32 s7, s4
4569; GFX8-NEXT:    s_mov_b64 s[4:5], 0
4570; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
4571; GFX8-NEXT:  .LBB14_1: ; %atomicrmw.start
4572; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
4573; GFX8-NEXT:    v_mov_b32_e32 v0, s6
4574; GFX8-NEXT:    s_waitcnt vmcnt(0)
4575; GFX8-NEXT:    v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
4576; GFX8-NEXT:    v_min_f32_e32 v5, v5, v3
4577; GFX8-NEXT:    v_bfe_u32 v6, v5, 16, 1
4578; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v5
4579; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x7fff, v6
4580; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v5
4581; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
4582; GFX8-NEXT:    v_cndmask_b32_e32 v5, v6, v7, vcc
4583; GFX8-NEXT:    v_and_b32_e32 v4, s7, v1
4584; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
4585; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
4586; GFX8-NEXT:    v_mov_b32_e32 v5, v1
4587; GFX8-NEXT:    v_mov_b32_e32 v4, v0
4588; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
4589; GFX8-NEXT:    s_waitcnt vmcnt(0)
4590; GFX8-NEXT:    buffer_wbinvl1
4591; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
4592; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4593; GFX8-NEXT:    v_mov_b32_e32 v1, v4
4594; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4595; GFX8-NEXT:    s_cbranch_execnz .LBB14_1
4596; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
4597; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
4598; GFX8-NEXT:    s_setpc_b64 s[30:31]
4599;
4600; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
4601; GFX7:       ; %bb.0:
4602; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4603; GFX7-NEXT:    s_addk_i32 s20, 0x200
4604; GFX7-NEXT:    s_and_b32 s4, s20, -4
4605; GFX7-NEXT:    v_mov_b32_e32 v2, s4
4606; GFX7-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
4607; GFX7-NEXT:    s_and_b32 s4, s20, 3
4608; GFX7-NEXT:    s_lshl_b32 s6, s4, 3
4609; GFX7-NEXT:    s_lshl_b32 s4, 0xffff, s6
4610; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
4611; GFX7-NEXT:    s_not_b32 s7, s4
4612; GFX7-NEXT:    s_mov_b64 s[4:5], 0
4613; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
4614; GFX7-NEXT:  .LBB14_1: ; %atomicrmw.start
4615; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
4616; GFX7-NEXT:    s_waitcnt vmcnt(0)
4617; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
4618; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
4619; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
4620; GFX7-NEXT:    v_min_f32_e32 v0, v0, v3
4621; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
4622; GFX7-NEXT:    v_and_b32_e32 v4, s7, v1
4623; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
4624; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
4625; GFX7-NEXT:    v_mov_b32_e32 v5, v1
4626; GFX7-NEXT:    v_mov_b32_e32 v4, v0
4627; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
4628; GFX7-NEXT:    s_waitcnt vmcnt(0)
4629; GFX7-NEXT:    buffer_wbinvl1
4630; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
4631; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4632; GFX7-NEXT:    v_mov_b32_e32 v1, v4
4633; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4634; GFX7-NEXT:    s_cbranch_execnz .LBB14_1
4635; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
4636; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
4637; GFX7-NEXT:    s_setpc_b64 s[30:31]
4638;
4639; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
4640; GFX6:       ; %bb.0:
4641; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4642; GFX6-NEXT:    s_addk_i32 s20, 0x200
4643; GFX6-NEXT:    s_and_b32 s4, s20, -4
4644; GFX6-NEXT:    v_mov_b32_e32 v2, s4
4645; GFX6-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
4646; GFX6-NEXT:    s_and_b32 s4, s20, 3
4647; GFX6-NEXT:    s_lshl_b32 s6, s4, 3
4648; GFX6-NEXT:    s_lshl_b32 s4, 0xffff, s6
4649; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v0
4650; GFX6-NEXT:    s_not_b32 s7, s4
4651; GFX6-NEXT:    s_mov_b64 s[4:5], 0
4652; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
4653; GFX6-NEXT:  .LBB14_1: ; %atomicrmw.start
4654; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
4655; GFX6-NEXT:    s_waitcnt vmcnt(0)
4656; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
4657; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
4658; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v0
4659; GFX6-NEXT:    v_min_f32_e32 v0, v0, v3
4660; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
4661; GFX6-NEXT:    s_waitcnt expcnt(0)
4662; GFX6-NEXT:    v_and_b32_e32 v4, s7, v1
4663; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
4664; GFX6-NEXT:    v_or_b32_e32 v0, v4, v0
4665; GFX6-NEXT:    v_mov_b32_e32 v5, v1
4666; GFX6-NEXT:    v_mov_b32_e32 v4, v0
4667; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
4668; GFX6-NEXT:    s_waitcnt vmcnt(0)
4669; GFX6-NEXT:    buffer_wbinvl1
4670; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
4671; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4672; GFX6-NEXT:    v_mov_b32_e32 v1, v4
4673; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4674; GFX6-NEXT:    s_cbranch_execnz .LBB14_1
4675; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
4676; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
4677; GFX6-NEXT:    s_waitcnt expcnt(0)
4678; GFX6-NEXT:    s_setpc_b64 s[30:31]
4679  %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256
4680  %unused = atomicrmw fmin ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
4681  ret void
4682}
4683
4684define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, bfloat %val) #0 {
4685; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
4686; GFX12:       ; %bb.0:
4687; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
4688; GFX12-NEXT:    s_wait_expcnt 0x0
4689; GFX12-NEXT:    s_wait_samplecnt 0x0
4690; GFX12-NEXT:    s_wait_bvhcnt 0x0
4691; GFX12-NEXT:    s_wait_kmcnt 0x0
4692; GFX12-NEXT:    v_add_nc_u32_e32 v4, 0x200, v4
4693; GFX12-NEXT:    s_mov_b32 s1, exec_lo
4694; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
4695; GFX12-NEXT:    v_and_b32_e32 v6, 3, v4
4696; GFX12-NEXT:    v_and_b32_e32 v8, -4, v4
4697; GFX12-NEXT:    v_lshlrev_b32_e32 v7, 3, v6
4698; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4699; GFX12-NEXT:    v_lshlrev_b32_e64 v6, v7, 0xffff
4700; GFX12-NEXT:    v_not_b32_e32 v9, v6
4701; GFX12-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
4702; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
4703; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
4704; GFX12-NEXT:    v_readfirstlane_b32 s6, v2
4705; GFX12-NEXT:    v_readfirstlane_b32 s7, v3
4706; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
4707; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
4708; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
4709; GFX12-NEXT:    s_wait_alu 0xfffe
4710; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4711; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
4712; GFX12-NEXT:    s_wait_alu 0xfffe
4713; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
4714; GFX12-NEXT:    s_wait_loadcnt 0x0
4715; GFX12-NEXT:    buffer_load_b32 v6, v8, s[4:7], null offen
4716; GFX12-NEXT:    s_wait_alu 0xfffe
4717; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
4718; GFX12-NEXT:    s_cbranch_execnz .LBB15_1
4719; GFX12-NEXT:  ; %bb.2:
4720; GFX12-NEXT:    s_mov_b32 exec_lo, s1
4721; GFX12-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
4722; GFX12-NEXT:    s_mov_b32 s1, 0
4723; GFX12-NEXT:  .LBB15_3: ; %atomicrmw.start
4724; GFX12-NEXT:    ; =>This Loop Header: Depth=1
4725; GFX12-NEXT:    ; Child Loop BB15_4 Depth 2
4726; GFX12-NEXT:    s_wait_loadcnt 0x0
4727; GFX12-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
4728; GFX12-NEXT:    s_mov_b32 s2, exec_lo
4729; GFX12-NEXT:    s_wait_storecnt 0x0
4730; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4731; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
4732; GFX12-NEXT:    v_min_num_f32_e32 v4, v4, v10
4733; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
4734; GFX12-NEXT:    v_bfe_u32 v5, v4, 16, 1
4735; GFX12-NEXT:    v_or_b32_e32 v11, 0x400000, v4
4736; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
4737; GFX12-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
4738; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4739; GFX12-NEXT:    v_cndmask_b32_e32 v4, v5, v11, vcc_lo
4740; GFX12-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
4741; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4742; GFX12-NEXT:    v_lshlrev_b32_e32 v4, v7, v4
4743; GFX12-NEXT:    v_and_or_b32 v5, v6, v9, v4
4744; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4745; GFX12-NEXT:    v_mov_b32_e32 v4, v5
4746; GFX12-NEXT:    v_mov_b32_e32 v5, v6
4747; GFX12-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
4748; GFX12-NEXT:    ; => This Inner Loop Header: Depth=2
4749; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
4750; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
4751; GFX12-NEXT:    v_readfirstlane_b32 s6, v2
4752; GFX12-NEXT:    v_readfirstlane_b32 s7, v3
4753; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
4754; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
4755; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
4756; GFX12-NEXT:    s_wait_alu 0xfffe
4757; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4758; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
4759; GFX12-NEXT:    s_wait_alu 0xfffe
4760; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
4761; GFX12-NEXT:    s_wait_loadcnt 0x0
4762; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
4763; GFX12-NEXT:    s_wait_alu 0xfffe
4764; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
4765; GFX12-NEXT:    s_cbranch_execnz .LBB15_4
4766; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
4767; GFX12-NEXT:    s_mov_b32 exec_lo, s2
4768; GFX12-NEXT:    s_wait_loadcnt 0x0
4769; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
4770; GFX12-NEXT:    v_mov_b32_e32 v6, v4
4771; GFX12-NEXT:    global_inv scope:SCOPE_DEV
4772; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
4773; GFX12-NEXT:    s_wait_alu 0xfffe
4774; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
4775; GFX12-NEXT:    s_cbranch_execnz .LBB15_3
4776; GFX12-NEXT:  ; %bb.6: ; %atomicrmw.end
4777; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
4778; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
4779; GFX12-NEXT:    s_wait_alu 0xfffe
4780; GFX12-NEXT:    s_setpc_b64 s[30:31]
4781;
4782; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
4783; GFX940:       ; %bb.0:
4784; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4785; GFX940-NEXT:    v_add_u32_e32 v4, 0x200, v4
4786; GFX940-NEXT:    v_and_b32_e32 v9, -4, v4
4787; GFX940-NEXT:    v_and_b32_e32 v4, 3, v4
4788; GFX940-NEXT:    v_lshlrev_b32_e32 v8, 3, v4
4789; GFX940-NEXT:    s_mov_b32 s0, 0xffff
4790; GFX940-NEXT:    v_lshlrev_b32_e64 v4, v8, s0
4791; GFX940-NEXT:    v_not_b32_e32 v10, v4
4792; GFX940-NEXT:    s_mov_b64 s[2:3], exec
4793; GFX940-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
4794; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
4795; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
4796; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
4797; GFX940-NEXT:    v_readfirstlane_b32 s7, v3
4798; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
4799; GFX940-NEXT:    s_nop 0
4800; GFX940-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
4801; GFX940-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
4802; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
4803; GFX940-NEXT:    buffer_load_dword v7, v9, s[4:7], 0 offen
4804; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
4805; GFX940-NEXT:    s_cbranch_execnz .LBB15_1
4806; GFX940-NEXT:  ; %bb.2:
4807; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
4808; GFX940-NEXT:    s_mov_b64 s[2:3], 0
4809; GFX940-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
4810; GFX940-NEXT:    s_movk_i32 s10, 0x7fff
4811; GFX940-NEXT:  .LBB15_3: ; %atomicrmw.start
4812; GFX940-NEXT:    ; =>This Loop Header: Depth=1
4813; GFX940-NEXT:    ; Child Loop BB15_4 Depth 2
4814; GFX940-NEXT:    s_waitcnt vmcnt(0)
4815; GFX940-NEXT:    v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
4816; GFX940-NEXT:    s_mov_b64 s[8:9], exec
4817; GFX940-NEXT:    v_min_f32_e32 v4, v4, v11
4818; GFX940-NEXT:    v_bfe_u32 v5, v4, 16, 1
4819; GFX940-NEXT:    v_add3_u32 v5, v5, v4, s10
4820; GFX940-NEXT:    v_or_b32_e32 v6, 0x400000, v4
4821; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
4822; GFX940-NEXT:    buffer_wbl2 sc1
4823; GFX940-NEXT:    s_nop 0
4824; GFX940-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
4825; GFX940-NEXT:    v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
4826; GFX940-NEXT:    v_and_or_b32 v6, v7, v10, v4
4827; GFX940-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
4828; GFX940-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
4829; GFX940-NEXT:    ; => This Inner Loop Header: Depth=2
4830; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
4831; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
4832; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
4833; GFX940-NEXT:    v_readfirstlane_b32 s7, v3
4834; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
4835; GFX940-NEXT:    s_nop 0
4836; GFX940-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
4837; GFX940-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
4838; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
4839; GFX940-NEXT:    s_waitcnt vmcnt(0)
4840; GFX940-NEXT:    buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0
4841; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
4842; GFX940-NEXT:    s_cbranch_execnz .LBB15_4
4843; GFX940-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
4844; GFX940-NEXT:    s_mov_b64 exec, s[8:9]
4845; GFX940-NEXT:    s_waitcnt vmcnt(0)
4846; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
4847; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
4848; GFX940-NEXT:    v_mov_b32_e32 v7, v4
4849; GFX940-NEXT:    buffer_inv sc1
4850; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
4851; GFX940-NEXT:    s_cbranch_execnz .LBB15_3
4852; GFX940-NEXT:  ; %bb.6: ; %atomicrmw.end
4853; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
4854; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v8, v4
4855; GFX940-NEXT:    s_setpc_b64 s[30:31]
4856;
4857; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
4858; GFX11:       ; %bb.0:
4859; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4860; GFX11-NEXT:    v_add_nc_u32_e32 v4, 0x200, v4
4861; GFX11-NEXT:    s_mov_b32 s1, 0
4862; GFX11-NEXT:    s_mov_b32 s2, exec_lo
4863; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
4864; GFX11-NEXT:    v_and_b32_e32 v6, 3, v4
4865; GFX11-NEXT:    v_and_b32_e32 v8, -4, v4
4866; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 3, v6
4867; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4868; GFX11-NEXT:    v_lshlrev_b32_e64 v6, v7, 0xffff
4869; GFX11-NEXT:    v_not_b32_e32 v9, v6
4870; GFX11-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
4871; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
4872; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
4873; GFX11-NEXT:    v_readfirstlane_b32 s6, v2
4874; GFX11-NEXT:    v_readfirstlane_b32 s7, v3
4875; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
4876; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
4877; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
4878; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
4879; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
4880; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
4881; GFX11-NEXT:    buffer_load_b32 v6, v8, s[4:7], 0 offen
4882; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
4883; GFX11-NEXT:    s_cbranch_execnz .LBB15_1
4884; GFX11-NEXT:  ; %bb.2:
4885; GFX11-NEXT:    s_mov_b32 exec_lo, s2
4886; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
4887; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
4888; GFX11-NEXT:    .p2align 6
4889; GFX11-NEXT:  .LBB15_3: ; %atomicrmw.start
4890; GFX11-NEXT:    ; =>This Loop Header: Depth=1
4891; GFX11-NEXT:    ; Child Loop BB15_4 Depth 2
4892; GFX11-NEXT:    s_waitcnt vmcnt(0)
4893; GFX11-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
4894; GFX11-NEXT:    s_mov_b32 s2, exec_lo
4895; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
4896; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4897; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
4898; GFX11-NEXT:    v_min_f32_e32 v4, v4, v10
4899; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
4900; GFX11-NEXT:    v_bfe_u32 v5, v4, 16, 1
4901; GFX11-NEXT:    v_or_b32_e32 v11, 0x400000, v4
4902; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
4903; GFX11-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
4904; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4905; GFX11-NEXT:    v_cndmask_b32_e32 v4, v5, v11, vcc_lo
4906; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
4907; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4908; GFX11-NEXT:    v_lshlrev_b32_e32 v4, v7, v4
4909; GFX11-NEXT:    v_and_or_b32 v5, v6, v9, v4
4910; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4911; GFX11-NEXT:    v_mov_b32_e32 v4, v5
4912; GFX11-NEXT:    v_mov_b32_e32 v5, v6
4913; GFX11-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
4914; GFX11-NEXT:    ; => This Inner Loop Header: Depth=2
4915; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
4916; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
4917; GFX11-NEXT:    v_readfirstlane_b32 s6, v2
4918; GFX11-NEXT:    v_readfirstlane_b32 s7, v3
4919; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
4920; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
4921; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
4922; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
4923; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
4924; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
4925; GFX11-NEXT:    s_waitcnt vmcnt(0)
4926; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
4927; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
4928; GFX11-NEXT:    s_cbranch_execnz .LBB15_4
4929; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
4930; GFX11-NEXT:    s_mov_b32 exec_lo, s2
4931; GFX11-NEXT:    s_waitcnt vmcnt(0)
4932; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
4933; GFX11-NEXT:    v_mov_b32_e32 v6, v4
4934; GFX11-NEXT:    buffer_gl1_inv
4935; GFX11-NEXT:    buffer_gl0_inv
4936; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
4937; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
4938; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
4939; GFX11-NEXT:    s_cbranch_execnz .LBB15_3
4940; GFX11-NEXT:  ; %bb.6: ; %atomicrmw.end
4941; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
4942; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
4943; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
4944; GFX11-NEXT:    s_setpc_b64 s[30:31]
4945;
4946; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
4947; GFX10:       ; %bb.0:
4948; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4949; GFX10-NEXT:    v_add_nc_u32_e32 v4, 0x200, v4
4950; GFX10-NEXT:    s_mov_b32 s5, 0
4951; GFX10-NEXT:    s_mov_b32 s6, exec_lo
4952; GFX10-NEXT:    v_and_b32_e32 v6, 3, v4
4953; GFX10-NEXT:    v_and_b32_e32 v8, -4, v4
4954; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 3, v6
4955; GFX10-NEXT:    v_lshlrev_b32_e64 v6, v7, 0xffff
4956; GFX10-NEXT:    v_not_b32_e32 v9, v6
4957; GFX10-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
4958; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
4959; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
4960; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
4961; GFX10-NEXT:    v_readfirstlane_b32 s11, v3
4962; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
4963; GFX10-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
4964; GFX10-NEXT:    s_and_b32 s4, vcc_lo, s4
4965; GFX10-NEXT:    s_and_saveexec_b32 s4, s4
4966; GFX10-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
4967; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
4968; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
4969; GFX10-NEXT:    s_cbranch_execnz .LBB15_1
4970; GFX10-NEXT:  ; %bb.2:
4971; GFX10-NEXT:    s_mov_b32 exec_lo, s6
4972; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
4973; GFX10-NEXT:  .LBB15_3: ; %atomicrmw.start
4974; GFX10-NEXT:    ; =>This Loop Header: Depth=1
4975; GFX10-NEXT:    ; Child Loop BB15_4 Depth 2
4976; GFX10-NEXT:    s_waitcnt vmcnt(0)
4977; GFX10-NEXT:    v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
4978; GFX10-NEXT:    s_mov_b32 s6, exec_lo
4979; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4980; GFX10-NEXT:    v_min_f32_e32 v4, v4, v10
4981; GFX10-NEXT:    v_bfe_u32 v5, v4, 16, 1
4982; GFX10-NEXT:    v_or_b32_e32 v11, 0x400000, v4
4983; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
4984; GFX10-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
4985; GFX10-NEXT:    v_cndmask_b32_e32 v4, v5, v11, vcc_lo
4986; GFX10-NEXT:    v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
4987; GFX10-NEXT:    v_and_or_b32 v5, v6, v9, v4
4988; GFX10-NEXT:    v_mov_b32_e32 v4, v5
4989; GFX10-NEXT:    v_mov_b32_e32 v5, v6
4990; GFX10-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
4991; GFX10-NEXT:    ; => This Inner Loop Header: Depth=2
4992; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
4993; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
4994; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
4995; GFX10-NEXT:    v_readfirstlane_b32 s11, v3
4996; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
4997; GFX10-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
4998; GFX10-NEXT:    s_and_b32 s4, vcc_lo, s4
4999; GFX10-NEXT:    s_and_saveexec_b32 s4, s4
5000; GFX10-NEXT:    s_waitcnt vmcnt(0)
5001; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
5002; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
5003; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
5004; GFX10-NEXT:    s_cbranch_execnz .LBB15_4
5005; GFX10-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
5006; GFX10-NEXT:    s_mov_b32 exec_lo, s6
5007; GFX10-NEXT:    s_waitcnt vmcnt(0)
5008; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
5009; GFX10-NEXT:    v_mov_b32_e32 v6, v4
5010; GFX10-NEXT:    buffer_gl1_inv
5011; GFX10-NEXT:    buffer_gl0_inv
5012; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
5013; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
5014; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
5015; GFX10-NEXT:    s_cbranch_execnz .LBB15_3
5016; GFX10-NEXT:  ; %bb.6: ; %atomicrmw.end
5017; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
5018; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
5019; GFX10-NEXT:    s_setpc_b64 s[30:31]
5020;
5021; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
5022; GFX90A:       ; %bb.0:
5023; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5024; GFX90A-NEXT:    v_add_u32_e32 v4, 0x200, v4
5025; GFX90A-NEXT:    v_and_b32_e32 v9, -4, v4
5026; GFX90A-NEXT:    v_and_b32_e32 v4, 3, v4
5027; GFX90A-NEXT:    v_lshlrev_b32_e32 v8, 3, v4
5028; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
5029; GFX90A-NEXT:    v_lshlrev_b32_e64 v4, v8, s4
5030; GFX90A-NEXT:    v_not_b32_e32 v10, v4
5031; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
5032; GFX90A-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
5033; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
5034; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
5035; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
5036; GFX90A-NEXT:    v_readfirstlane_b32 s11, v3
5037; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
5038; GFX90A-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
5039; GFX90A-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
5040; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
5041; GFX90A-NEXT:    s_nop 0
5042; GFX90A-NEXT:    buffer_load_dword v7, v9, s[8:11], 0 offen
5043; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
5044; GFX90A-NEXT:    s_cbranch_execnz .LBB15_1
5045; GFX90A-NEXT:  ; %bb.2:
5046; GFX90A-NEXT:    s_mov_b64 exec, s[6:7]
5047; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
5048; GFX90A-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
5049; GFX90A-NEXT:    s_movk_i32 s14, 0x7fff
5050; GFX90A-NEXT:  .LBB15_3: ; %atomicrmw.start
5051; GFX90A-NEXT:    ; =>This Loop Header: Depth=1
5052; GFX90A-NEXT:    ; Child Loop BB15_4 Depth 2
5053; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5054; GFX90A-NEXT:    v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
5055; GFX90A-NEXT:    v_min_f32_e32 v4, v4, v11
5056; GFX90A-NEXT:    v_bfe_u32 v5, v4, 16, 1
5057; GFX90A-NEXT:    v_add3_u32 v5, v5, v4, s14
5058; GFX90A-NEXT:    v_or_b32_e32 v6, 0x400000, v4
5059; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
5060; GFX90A-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
5061; GFX90A-NEXT:    v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
5062; GFX90A-NEXT:    v_and_or_b32 v6, v7, v10, v4
5063; GFX90A-NEXT:    s_mov_b64 s[12:13], exec
5064; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
5065; GFX90A-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
5066; GFX90A-NEXT:    ; => This Inner Loop Header: Depth=2
5067; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
5068; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
5069; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
5070; GFX90A-NEXT:    v_readfirstlane_b32 s11, v3
5071; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
5072; GFX90A-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
5073; GFX90A-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
5074; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
5075; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5076; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc
5077; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
5078; GFX90A-NEXT:    s_cbranch_execnz .LBB15_4
5079; GFX90A-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
5080; GFX90A-NEXT:    s_mov_b64 exec, s[12:13]
5081; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5082; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
5083; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
5084; GFX90A-NEXT:    v_mov_b32_e32 v7, v4
5085; GFX90A-NEXT:    buffer_wbinvl1
5086; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
5087; GFX90A-NEXT:    s_cbranch_execnz .LBB15_3
5088; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.end
5089; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
5090; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v8, v4
5091; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5092;
5093; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
5094; GFX908:       ; %bb.0:
5095; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5096; GFX908-NEXT:    v_add_u32_e32 v4, 0x200, v4
5097; GFX908-NEXT:    v_and_b32_e32 v8, -4, v4
5098; GFX908-NEXT:    v_and_b32_e32 v4, 3, v4
5099; GFX908-NEXT:    v_lshlrev_b32_e32 v7, 3, v4
5100; GFX908-NEXT:    s_mov_b32 s4, 0xffff
5101; GFX908-NEXT:    v_lshlrev_b32_e64 v4, v7, s4
5102; GFX908-NEXT:    v_not_b32_e32 v9, v4
5103; GFX908-NEXT:    s_mov_b64 s[6:7], exec
5104; GFX908-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
5105; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
5106; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
5107; GFX908-NEXT:    v_readfirstlane_b32 s10, v2
5108; GFX908-NEXT:    v_readfirstlane_b32 s11, v3
5109; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
5110; GFX908-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
5111; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
5112; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
5113; GFX908-NEXT:    s_nop 0
5114; GFX908-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
5115; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
5116; GFX908-NEXT:    s_cbranch_execnz .LBB15_1
5117; GFX908-NEXT:  ; %bb.2:
5118; GFX908-NEXT:    s_mov_b64 exec, s[6:7]
5119; GFX908-NEXT:    s_mov_b64 s[6:7], 0
5120; GFX908-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
5121; GFX908-NEXT:    s_movk_i32 s14, 0x7fff
5122; GFX908-NEXT:  .LBB15_3: ; %atomicrmw.start
5123; GFX908-NEXT:    ; =>This Loop Header: Depth=1
5124; GFX908-NEXT:    ; Child Loop BB15_4 Depth 2
5125; GFX908-NEXT:    s_waitcnt vmcnt(0)
5126; GFX908-NEXT:    v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
5127; GFX908-NEXT:    v_min_f32_e32 v4, v4, v10
5128; GFX908-NEXT:    v_bfe_u32 v5, v4, 16, 1
5129; GFX908-NEXT:    v_add3_u32 v5, v5, v4, s14
5130; GFX908-NEXT:    v_or_b32_e32 v11, 0x400000, v4
5131; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
5132; GFX908-NEXT:    v_cndmask_b32_e32 v4, v5, v11, vcc
5133; GFX908-NEXT:    v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
5134; GFX908-NEXT:    v_and_or_b32 v5, v6, v9, v4
5135; GFX908-NEXT:    v_mov_b32_e32 v4, v5
5136; GFX908-NEXT:    s_mov_b64 s[12:13], exec
5137; GFX908-NEXT:    v_mov_b32_e32 v5, v6
5138; GFX908-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
5139; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
5140; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
5141; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
5142; GFX908-NEXT:    v_readfirstlane_b32 s10, v2
5143; GFX908-NEXT:    v_readfirstlane_b32 s11, v3
5144; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
5145; GFX908-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
5146; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
5147; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
5148; GFX908-NEXT:    s_waitcnt vmcnt(0)
5149; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
5150; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
5151; GFX908-NEXT:    s_cbranch_execnz .LBB15_4
5152; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
5153; GFX908-NEXT:    s_mov_b64 exec, s[12:13]
5154; GFX908-NEXT:    s_waitcnt vmcnt(0)
5155; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
5156; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
5157; GFX908-NEXT:    v_mov_b32_e32 v6, v4
5158; GFX908-NEXT:    buffer_wbinvl1
5159; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
5160; GFX908-NEXT:    s_cbranch_execnz .LBB15_3
5161; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.end
5162; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
5163; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
5164; GFX908-NEXT:    s_setpc_b64 s[30:31]
5165;
5166; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
5167; GFX8:       ; %bb.0:
5168; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5169; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x200, v4
5170; GFX8-NEXT:    v_and_b32_e32 v8, -4, v4
5171; GFX8-NEXT:    v_and_b32_e32 v4, 3, v4
5172; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 3, v4
5173; GFX8-NEXT:    s_mov_b32 s4, 0xffff
5174; GFX8-NEXT:    v_lshlrev_b32_e64 v4, v7, s4
5175; GFX8-NEXT:    v_not_b32_e32 v9, v4
5176; GFX8-NEXT:    s_mov_b64 s[6:7], exec
5177; GFX8-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
5178; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
5179; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
5180; GFX8-NEXT:    v_readfirstlane_b32 s10, v2
5181; GFX8-NEXT:    v_readfirstlane_b32 s11, v3
5182; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
5183; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
5184; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
5185; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
5186; GFX8-NEXT:    s_nop 0
5187; GFX8-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
5188; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
5189; GFX8-NEXT:    s_cbranch_execnz .LBB15_1
5190; GFX8-NEXT:  ; %bb.2:
5191; GFX8-NEXT:    s_mov_b64 exec, s[6:7]
5192; GFX8-NEXT:    s_mov_b64 s[6:7], 0
5193; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
5194; GFX8-NEXT:  .LBB15_3: ; %atomicrmw.start
5195; GFX8-NEXT:    ; =>This Loop Header: Depth=1
5196; GFX8-NEXT:    ; Child Loop BB15_4 Depth 2
5197; GFX8-NEXT:    s_waitcnt vmcnt(0)
5198; GFX8-NEXT:    v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
5199; GFX8-NEXT:    v_min_f32_e32 v4, v4, v10
5200; GFX8-NEXT:    v_bfe_u32 v5, v4, 16, 1
5201; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v4
5202; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
5203; GFX8-NEXT:    v_or_b32_e32 v11, 0x400000, v4
5204; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
5205; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v11, vcc
5206; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
5207; GFX8-NEXT:    v_and_b32_e32 v5, v6, v9
5208; GFX8-NEXT:    v_or_b32_e32 v5, v5, v4
5209; GFX8-NEXT:    v_mov_b32_e32 v4, v5
5210; GFX8-NEXT:    s_mov_b64 s[12:13], exec
5211; GFX8-NEXT:    v_mov_b32_e32 v5, v6
5212; GFX8-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
5213; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
5214; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
5215; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
5216; GFX8-NEXT:    v_readfirstlane_b32 s10, v2
5217; GFX8-NEXT:    v_readfirstlane_b32 s11, v3
5218; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
5219; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
5220; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
5221; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
5222; GFX8-NEXT:    s_waitcnt vmcnt(0)
5223; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
5224; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
5225; GFX8-NEXT:    s_cbranch_execnz .LBB15_4
5226; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
5227; GFX8-NEXT:    s_mov_b64 exec, s[12:13]
5228; GFX8-NEXT:    s_waitcnt vmcnt(0)
5229; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
5230; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
5231; GFX8-NEXT:    v_mov_b32_e32 v6, v4
5232; GFX8-NEXT:    buffer_wbinvl1
5233; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
5234; GFX8-NEXT:    s_cbranch_execnz .LBB15_3
5235; GFX8-NEXT:  ; %bb.6: ; %atomicrmw.end
5236; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
5237; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
5238; GFX8-NEXT:    s_setpc_b64 s[30:31]
5239;
5240; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
5241; GFX7:       ; %bb.0:
5242; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5243; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x200, v4
5244; GFX7-NEXT:    v_and_b32_e32 v8, -4, v4
5245; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
5246; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 3, v4
5247; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v7
5248; GFX7-NEXT:    v_not_b32_e32 v9, v4
5249; GFX7-NEXT:    s_mov_b64 s[6:7], exec
5250; GFX7-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
5251; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
5252; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
5253; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
5254; GFX7-NEXT:    v_readfirstlane_b32 s11, v3
5255; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
5256; GFX7-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
5257; GFX7-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
5258; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
5259; GFX7-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
5260; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
5261; GFX7-NEXT:    s_cbranch_execnz .LBB15_1
5262; GFX7-NEXT:  ; %bb.2:
5263; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
5264; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v5
5265; GFX7-NEXT:    s_mov_b64 s[6:7], 0
5266; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v4
5267; GFX7-NEXT:  .LBB15_3: ; %atomicrmw.start
5268; GFX7-NEXT:    ; =>This Loop Header: Depth=1
5269; GFX7-NEXT:    ; Child Loop BB15_4 Depth 2
5270; GFX7-NEXT:    s_waitcnt vmcnt(0)
5271; GFX7-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
5272; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
5273; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
5274; GFX7-NEXT:    v_min_f32_e32 v4, v4, v10
5275; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
5276; GFX7-NEXT:    v_lshlrev_b32_e32 v4, v7, v4
5277; GFX7-NEXT:    v_and_b32_e32 v5, v6, v9
5278; GFX7-NEXT:    v_or_b32_e32 v5, v5, v4
5279; GFX7-NEXT:    v_mov_b32_e32 v4, v5
5280; GFX7-NEXT:    s_mov_b64 s[12:13], exec
5281; GFX7-NEXT:    v_mov_b32_e32 v5, v6
5282; GFX7-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
5283; GFX7-NEXT:    ; => This Inner Loop Header: Depth=2
5284; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
5285; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
5286; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
5287; GFX7-NEXT:    v_readfirstlane_b32 s11, v3
5288; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
5289; GFX7-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
5290; GFX7-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
5291; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
5292; GFX7-NEXT:    s_waitcnt vmcnt(0)
5293; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
5294; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
5295; GFX7-NEXT:    s_cbranch_execnz .LBB15_4
5296; GFX7-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
5297; GFX7-NEXT:    s_mov_b64 exec, s[12:13]
5298; GFX7-NEXT:    s_waitcnt vmcnt(0)
5299; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
5300; GFX7-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
5301; GFX7-NEXT:    v_mov_b32_e32 v6, v4
5302; GFX7-NEXT:    buffer_wbinvl1
5303; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
5304; GFX7-NEXT:    s_cbranch_execnz .LBB15_3
5305; GFX7-NEXT:  ; %bb.6: ; %atomicrmw.end
5306; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
5307; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
5308; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
5309; GFX7-NEXT:    s_setpc_b64 s[30:31]
5310;
5311; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
5312; GFX6:       ; %bb.0:
5313; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5314; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x200, v4
5315; GFX6-NEXT:    v_and_b32_e32 v8, -4, v4
5316; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
5317; GFX6-NEXT:    v_lshlrev_b32_e32 v7, 3, v4
5318; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v7
5319; GFX6-NEXT:    v_not_b32_e32 v9, v4
5320; GFX6-NEXT:    s_mov_b64 s[6:7], exec
5321; GFX6-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
5322; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
5323; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
5324; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
5325; GFX6-NEXT:    v_readfirstlane_b32 s11, v3
5326; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
5327; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
5328; GFX6-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
5329; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
5330; GFX6-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
5331; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
5332; GFX6-NEXT:    s_cbranch_execnz .LBB15_1
5333; GFX6-NEXT:  ; %bb.2:
5334; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
5335; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v5
5336; GFX6-NEXT:    s_mov_b64 s[6:7], 0
5337; GFX6-NEXT:    v_and_b32_e32 v10, 0xffff0000, v4
5338; GFX6-NEXT:  .LBB15_3: ; %atomicrmw.start
5339; GFX6-NEXT:    ; =>This Loop Header: Depth=1
5340; GFX6-NEXT:    ; Child Loop BB15_4 Depth 2
5341; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5342; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
5343; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
5344; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v4
5345; GFX6-NEXT:    v_min_f32_e32 v4, v4, v10
5346; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
5347; GFX6-NEXT:    v_lshlrev_b32_e32 v4, v7, v4
5348; GFX6-NEXT:    v_and_b32_e32 v5, v6, v9
5349; GFX6-NEXT:    v_or_b32_e32 v5, v5, v4
5350; GFX6-NEXT:    v_mov_b32_e32 v4, v5
5351; GFX6-NEXT:    s_mov_b64 s[12:13], exec
5352; GFX6-NEXT:    v_mov_b32_e32 v5, v6
5353; GFX6-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
5354; GFX6-NEXT:    ; => This Inner Loop Header: Depth=2
5355; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
5356; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
5357; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
5358; GFX6-NEXT:    v_readfirstlane_b32 s11, v3
5359; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
5360; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
5361; GFX6-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
5362; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
5363; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5364; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
5365; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
5366; GFX6-NEXT:    s_cbranch_execnz .LBB15_4
5367; GFX6-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
5368; GFX6-NEXT:    s_mov_b64 exec, s[12:13]
5369; GFX6-NEXT:    s_waitcnt vmcnt(0)
5370; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
5371; GFX6-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
5372; GFX6-NEXT:    v_mov_b32_e32 v6, v4
5373; GFX6-NEXT:    buffer_wbinvl1
5374; GFX6-NEXT:    s_andn2_b64 exec, exec, s[6:7]
5375; GFX6-NEXT:    s_cbranch_execnz .LBB15_3
5376; GFX6-NEXT:  ; %bb.6: ; %atomicrmw.end
5377; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
5378; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
5379; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
5380; GFX6-NEXT:    s_waitcnt expcnt(0)
5381; GFX6-NEXT:    s_setpc_b64 s[30:31]
5382  %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256
5383  %result = atomicrmw fmin ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
5384  ret bfloat %result
5385}
5386
5387; --------------------------------------------------------------------
5388; <2 x half>
5389; --------------------------------------------------------------------
5390
; Returning form of an agent-scope, seq_cst `atomicrmw fmin` on a <2 x half>
; value, addressed through a buffer fat pointer (addrspace 7) that is passed
; inreg, at a constant offset of 256 elements from %ptr.
;
; What the expected output below shows, for every checked target:
;   * one initial buffer load of the dword at byte offset 1024 (0x400);
;   * a retry loop (%atomicrmw.start) built around buffer_atomic_cmpswap that
;     keeps iterating while any active lane's returned dword differs from the
;     dword it expected;
;   * the packed minimum computed with v_pk_min_num_f16 on the gfx1200 run,
;     v_pk_min_f16 on the gfx940 / gfx1100 / gfx1010 / gfx90a / gfx908 runs,
;     a v_min_f16 pair (one SDWA, one e32) on the tonga run, and two v_min_f32
;     on f32-converted halves on the hawaii and tahiti runs.
;
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
5391define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 {
5392; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
5393; GFX12:       ; %bb.0:
5394; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
5395; GFX12-NEXT:    s_wait_expcnt 0x0
5396; GFX12-NEXT:    s_wait_samplecnt 0x0
5397; GFX12-NEXT:    s_wait_bvhcnt 0x0
5398; GFX12-NEXT:    s_wait_kmcnt 0x0
5399; GFX12-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
5400; GFX12-NEXT:    s_add_co_i32 s4, s16, 0x400
5401; GFX12-NEXT:    s_wait_alu 0xfffe
5402; GFX12-NEXT:    v_mov_b32_e32 v3, s4
5403; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
5404; GFX12-NEXT:    v_pk_max_num_f16 v2, v1, v1
5405; GFX12-NEXT:    buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
5406; GFX12-NEXT:    s_mov_b32 s4, 0
5407; GFX12-NEXT:  .LBB16_1: ; %atomicrmw.start
5408; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
5409; GFX12-NEXT:    s_wait_loadcnt 0x0
5410; GFX12-NEXT:    v_mov_b32_e32 v5, v0
5411; GFX12-NEXT:    s_wait_storecnt 0x0
5412; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5413; GFX12-NEXT:    v_pk_max_num_f16 v0, v5, v5
5414; GFX12-NEXT:    v_pk_min_num_f16 v4, v0, v2
5415; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5416; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
5417; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
5418; GFX12-NEXT:    s_wait_loadcnt 0x0
5419; GFX12-NEXT:    global_inv scope:SCOPE_DEV
5420; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
5421; GFX12-NEXT:    s_wait_alu 0xfffe
5422; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
5423; GFX12-NEXT:    s_wait_alu 0xfffe
5424; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
5425; GFX12-NEXT:    s_cbranch_execnz .LBB16_1
5426; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
5427; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
5428; GFX12-NEXT:    s_wait_alu 0xfffe
5429; GFX12-NEXT:    s_setpc_b64 s[30:31]
5430;
5431; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
5432; GFX940:       ; %bb.0:
5433; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5434; GFX940-NEXT:    v_mov_b32_e32 v1, v0
5435; GFX940-NEXT:    v_mov_b32_e32 v0, s16
5436; GFX940-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
5437; GFX940-NEXT:    s_add_i32 s6, s16, 0x400
5438; GFX940-NEXT:    s_mov_b64 s[4:5], 0
5439; GFX940-NEXT:    v_pk_max_f16 v2, v1, v1
5440; GFX940-NEXT:    v_mov_b32_e32 v3, s6
5441; GFX940-NEXT:  .LBB16_1: ; %atomicrmw.start
5442; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
5443; GFX940-NEXT:    s_waitcnt vmcnt(0)
5444; GFX940-NEXT:    v_mov_b32_e32 v5, v0
5445; GFX940-NEXT:    v_pk_max_f16 v0, v5, v5
5446; GFX940-NEXT:    buffer_wbl2 sc1
5447; GFX940-NEXT:    v_pk_min_f16 v4, v0, v2
5448; GFX940-NEXT:    s_nop 0
5449; GFX940-NEXT:    v_mov_b64_e32 v[0:1], v[4:5]
5450; GFX940-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
5451; GFX940-NEXT:    s_waitcnt vmcnt(0)
5452; GFX940-NEXT:    buffer_inv sc1
5453; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
5454; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5455; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5456; GFX940-NEXT:    s_cbranch_execnz .LBB16_1
5457; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
5458; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
5459; GFX940-NEXT:    s_setpc_b64 s[30:31]
5460;
5461; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
5462; GFX11:       ; %bb.0:
5463; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5464; GFX11-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
5465; GFX11-NEXT:    s_add_i32 s4, s16, 0x400
5466; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
5467; GFX11-NEXT:    v_mov_b32_e32 v3, s4
5468; GFX11-NEXT:    v_pk_max_f16 v2, v1, v1
5469; GFX11-NEXT:    buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
5470; GFX11-NEXT:    s_mov_b32 s4, 0
5471; GFX11-NEXT:  .LBB16_1: ; %atomicrmw.start
5472; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
5473; GFX11-NEXT:    s_waitcnt vmcnt(0)
5474; GFX11-NEXT:    v_mov_b32_e32 v5, v0
5475; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
5476; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5477; GFX11-NEXT:    v_pk_max_f16 v0, v5, v5
5478; GFX11-NEXT:    v_pk_min_f16 v4, v0, v2
5479; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5480; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
5481; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
5482; GFX11-NEXT:    s_waitcnt vmcnt(0)
5483; GFX11-NEXT:    buffer_gl1_inv
5484; GFX11-NEXT:    buffer_gl0_inv
5485; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
5486; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
5487; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
5488; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
5489; GFX11-NEXT:    s_cbranch_execnz .LBB16_1
5490; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
5491; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
5492; GFX11-NEXT:    s_setpc_b64 s[30:31]
5493;
5494; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
5495; GFX10:       ; %bb.0:
5496; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5497; GFX10-NEXT:    v_mov_b32_e32 v1, v0
5498; GFX10-NEXT:    v_mov_b32_e32 v0, s20
5499; GFX10-NEXT:    s_add_i32 s4, s20, 0x400
5500; GFX10-NEXT:    v_mov_b32_e32 v3, s4
5501; GFX10-NEXT:    v_pk_max_f16 v2, v1, v1
5502; GFX10-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
5503; GFX10-NEXT:    s_mov_b32 s4, 0
5504; GFX10-NEXT:  .LBB16_1: ; %atomicrmw.start
5505; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
5506; GFX10-NEXT:    s_waitcnt vmcnt(0)
5507; GFX10-NEXT:    v_mov_b32_e32 v5, v0
5508; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
5509; GFX10-NEXT:    v_pk_max_f16 v0, v5, v5
5510; GFX10-NEXT:    v_pk_min_f16 v4, v0, v2
5511; GFX10-NEXT:    v_mov_b32_e32 v0, v4
5512; GFX10-NEXT:    v_mov_b32_e32 v1, v5
5513; GFX10-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
5514; GFX10-NEXT:    s_waitcnt vmcnt(0)
5515; GFX10-NEXT:    buffer_gl1_inv
5516; GFX10-NEXT:    buffer_gl0_inv
5517; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
5518; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
5519; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
5520; GFX10-NEXT:    s_cbranch_execnz .LBB16_1
5521; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
5522; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
5523; GFX10-NEXT:    s_setpc_b64 s[30:31]
5524;
5525; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
5526; GFX90A:       ; %bb.0:
5527; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5528; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
5529; GFX90A-NEXT:    v_mov_b32_e32 v0, s20
5530; GFX90A-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
5531; GFX90A-NEXT:    s_add_i32 s6, s20, 0x400
5532; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
5533; GFX90A-NEXT:    v_pk_max_f16 v2, v1, v1
5534; GFX90A-NEXT:    v_mov_b32_e32 v3, s6
5535; GFX90A-NEXT:  .LBB16_1: ; %atomicrmw.start
5536; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
5537; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5538; GFX90A-NEXT:    v_mov_b32_e32 v5, v0
5539; GFX90A-NEXT:    v_pk_max_f16 v0, v5, v5
5540; GFX90A-NEXT:    v_pk_min_f16 v4, v0, v2
5541; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
5542; GFX90A-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
5543; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5544; GFX90A-NEXT:    buffer_wbinvl1
5545; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
5546; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5547; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5548; GFX90A-NEXT:    s_cbranch_execnz .LBB16_1
5549; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
5550; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
5551; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5552;
5553; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
5554; GFX908:       ; %bb.0:
5555; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5556; GFX908-NEXT:    v_mov_b32_e32 v1, v0
5557; GFX908-NEXT:    v_mov_b32_e32 v0, s20
5558; GFX908-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
5559; GFX908-NEXT:    s_add_i32 s6, s20, 0x400
5560; GFX908-NEXT:    s_mov_b64 s[4:5], 0
5561; GFX908-NEXT:    v_pk_max_f16 v2, v1, v1
5562; GFX908-NEXT:    v_mov_b32_e32 v3, s6
5563; GFX908-NEXT:  .LBB16_1: ; %atomicrmw.start
5564; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
5565; GFX908-NEXT:    s_waitcnt vmcnt(0)
5566; GFX908-NEXT:    v_mov_b32_e32 v5, v0
5567; GFX908-NEXT:    v_pk_max_f16 v0, v5, v5
5568; GFX908-NEXT:    v_pk_min_f16 v4, v0, v2
5569; GFX908-NEXT:    v_mov_b32_e32 v0, v4
5570; GFX908-NEXT:    v_mov_b32_e32 v1, v5
5571; GFX908-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
5572; GFX908-NEXT:    s_waitcnt vmcnt(0)
5573; GFX908-NEXT:    buffer_wbinvl1
5574; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
5575; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5576; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5577; GFX908-NEXT:    s_cbranch_execnz .LBB16_1
5578; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
5579; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
5580; GFX908-NEXT:    s_setpc_b64 s[30:31]
5581;
5582; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
5583; GFX8:       ; %bb.0:
5584; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5585; GFX8-NEXT:    v_mov_b32_e32 v1, v0
5586; GFX8-NEXT:    v_mov_b32_e32 v0, s20
5587; GFX8-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
5588; GFX8-NEXT:    s_add_i32 s6, s20, 0x400
5589; GFX8-NEXT:    s_mov_b64 s[4:5], 0
5590; GFX8-NEXT:    v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
5591; GFX8-NEXT:    v_max_f16_e32 v3, v1, v1
5592; GFX8-NEXT:    v_mov_b32_e32 v4, s6
5593; GFX8-NEXT:  .LBB16_1: ; %atomicrmw.start
5594; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
5595; GFX8-NEXT:    s_waitcnt vmcnt(0)
5596; GFX8-NEXT:    v_mov_b32_e32 v6, v0
5597; GFX8-NEXT:    v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
5598; GFX8-NEXT:    v_max_f16_e32 v1, v6, v6
5599; GFX8-NEXT:    v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
5600; GFX8-NEXT:    v_min_f16_e32 v1, v1, v3
5601; GFX8-NEXT:    v_or_b32_e32 v5, v1, v0
5602; GFX8-NEXT:    v_mov_b32_e32 v0, v5
5603; GFX8-NEXT:    v_mov_b32_e32 v1, v6
5604; GFX8-NEXT:    buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
5605; GFX8-NEXT:    s_waitcnt vmcnt(0)
5606; GFX8-NEXT:    buffer_wbinvl1
5607; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
5608; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5609; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5610; GFX8-NEXT:    s_cbranch_execnz .LBB16_1
5611; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
5612; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
5613; GFX8-NEXT:    s_setpc_b64 s[30:31]
5614;
5615; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
5616; GFX7:       ; %bb.0:
5617; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5618; GFX7-NEXT:    v_mov_b32_e32 v2, s20
5619; GFX7-NEXT:    buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
5620; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
5621; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v0
5622; GFX7-NEXT:    s_add_i32 s6, s20, 0x400
5623; GFX7-NEXT:    s_mov_b64 s[4:5], 0
5624; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v1
5625; GFX7-NEXT:    s_waitcnt vmcnt(0)
5626; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
5627; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v3
5628; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
5629; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v4
5630; GFX7-NEXT:    v_mov_b32_e32 v4, s6
5631; GFX7-NEXT:  .LBB16_1: ; %atomicrmw.start
5632; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
5633; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
5634; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
5635; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v1
5636; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v0
5637; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
5638; GFX7-NEXT:    v_min_f32_e32 v5, v5, v2
5639; GFX7-NEXT:    v_min_f32_e32 v6, v6, v3
5640; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
5641; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v6
5642; GFX7-NEXT:    v_or_b32_e32 v6, v0, v1
5643; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
5644; GFX7-NEXT:    v_or_b32_e32 v5, v7, v0
5645; GFX7-NEXT:    v_mov_b32_e32 v8, v6
5646; GFX7-NEXT:    v_mov_b32_e32 v7, v5
5647; GFX7-NEXT:    buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc
5648; GFX7-NEXT:    s_waitcnt vmcnt(0)
5649; GFX7-NEXT:    buffer_wbinvl1
5650; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
5651; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v7
5652; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
5653; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
5654; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5655; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5656; GFX7-NEXT:    s_cbranch_execnz .LBB16_1
5657; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
5658; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
5659; GFX7-NEXT:    s_setpc_b64 s[30:31]
5660;
5661; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
5662; GFX6:       ; %bb.0:
5663; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5664; GFX6-NEXT:    v_mov_b32_e32 v2, s20
5665; GFX6-NEXT:    buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
5666; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
5667; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v0
5668; GFX6-NEXT:    s_add_i32 s6, s20, 0x400
5669; GFX6-NEXT:    s_mov_b64 s[4:5], 0
5670; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v1
5671; GFX6-NEXT:    s_waitcnt vmcnt(0)
5672; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
5673; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v3
5674; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
5675; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v4
5676; GFX6-NEXT:    v_mov_b32_e32 v4, s6
5677; GFX6-NEXT:  .LBB16_1: ; %atomicrmw.start
5678; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
5679; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
5680; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
5681; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v1
5682; GFX6-NEXT:    v_cvt_f32_f16_e32 v6, v0
5683; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
5684; GFX6-NEXT:    v_min_f32_e32 v5, v5, v2
5685; GFX6-NEXT:    v_min_f32_e32 v6, v6, v3
5686; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v5
5687; GFX6-NEXT:    s_waitcnt expcnt(0)
5688; GFX6-NEXT:    v_cvt_f16_f32_e32 v7, v6
5689; GFX6-NEXT:    v_or_b32_e32 v6, v0, v1
5690; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
5691; GFX6-NEXT:    v_or_b32_e32 v5, v7, v0
5692; GFX6-NEXT:    v_mov_b32_e32 v8, v6
5693; GFX6-NEXT:    v_mov_b32_e32 v7, v5
5694; GFX6-NEXT:    buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc
5695; GFX6-NEXT:    s_waitcnt vmcnt(0)
5696; GFX6-NEXT:    buffer_wbinvl1
5697; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
5698; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v7
5699; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
5700; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
5701; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5702; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5703; GFX6-NEXT:    s_cbranch_execnz .LBB16_1
5704; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
5705; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
5706; GFX6-NEXT:    s_waitcnt expcnt(0)
5707; GFX6-NEXT:    s_setpc_b64 s[30:31]
; Advance %ptr by 256 elements of <2 x half> (2 x 16 bits = 4 bytes each), i.e.
; the 1024-byte (0x400) offset that appears in the expected output above.
5708  %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256
; The atomicrmw result is the value it returns to the caller; the expected
; output above obtains it from the final buffer_atomic_cmpswap of the loop.
5709  %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
5710  ret <2 x half> %result
5711}
5712
5713define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 {
5714; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
5715; GFX12:       ; %bb.0:
5716; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
5717; GFX12-NEXT:    s_wait_expcnt 0x0
5718; GFX12-NEXT:    s_wait_samplecnt 0x0
5719; GFX12-NEXT:    s_wait_bvhcnt 0x0
5720; GFX12-NEXT:    s_wait_kmcnt 0x0
5721; GFX12-NEXT:    v_mov_b32_e32 v1, s16
5722; GFX12-NEXT:    s_add_co_i32 s4, s16, 0x400
5723; GFX12-NEXT:    v_pk_max_num_f16 v2, v0, v0
5724; GFX12-NEXT:    s_wait_alu 0xfffe
5725; GFX12-NEXT:    v_mov_b32_e32 v3, s4
5726; GFX12-NEXT:    s_mov_b32 s4, 0
5727; GFX12-NEXT:    buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
5728; GFX12-NEXT:  .LBB17_1: ; %atomicrmw.start
5729; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
5730; GFX12-NEXT:    s_wait_loadcnt 0x0
5731; GFX12-NEXT:    v_pk_max_num_f16 v0, v1, v1
5732; GFX12-NEXT:    s_wait_storecnt 0x0
5733; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5734; GFX12-NEXT:    v_pk_min_num_f16 v0, v0, v2
5735; GFX12-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
5736; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
5737; GFX12-NEXT:    s_wait_loadcnt 0x0
5738; GFX12-NEXT:    global_inv scope:SCOPE_DEV
5739; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
5740; GFX12-NEXT:    v_mov_b32_e32 v1, v4
5741; GFX12-NEXT:    s_wait_alu 0xfffe
5742; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
5743; GFX12-NEXT:    s_wait_alu 0xfffe
5744; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
5745; GFX12-NEXT:    s_cbranch_execnz .LBB17_1
5746; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
5747; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
5748; GFX12-NEXT:    s_wait_alu 0xfffe
5749; GFX12-NEXT:    s_setpc_b64 s[30:31]
5750;
5751; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
5752; GFX940:       ; %bb.0:
5753; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5754; GFX940-NEXT:    v_mov_b32_e32 v1, s16
5755; GFX940-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
5756; GFX940-NEXT:    s_add_i32 s6, s16, 0x400
5757; GFX940-NEXT:    s_mov_b64 s[4:5], 0
5758; GFX940-NEXT:    v_pk_max_f16 v2, v0, v0
5759; GFX940-NEXT:    v_mov_b32_e32 v3, s6
5760; GFX940-NEXT:  .LBB17_1: ; %atomicrmw.start
5761; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
5762; GFX940-NEXT:    s_waitcnt vmcnt(0)
5763; GFX940-NEXT:    v_pk_max_f16 v0, v1, v1
5764; GFX940-NEXT:    buffer_wbl2 sc1
5765; GFX940-NEXT:    v_pk_min_f16 v0, v0, v2
5766; GFX940-NEXT:    s_nop 0
5767; GFX940-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
5768; GFX940-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0
5769; GFX940-NEXT:    s_waitcnt vmcnt(0)
5770; GFX940-NEXT:    buffer_inv sc1
5771; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
5772; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5773; GFX940-NEXT:    v_mov_b32_e32 v1, v4
5774; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5775; GFX940-NEXT:    s_cbranch_execnz .LBB17_1
5776; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
5777; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
5778; GFX940-NEXT:    s_setpc_b64 s[30:31]
5779;
5780; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
5781; GFX11:       ; %bb.0:
5782; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5783; GFX11-NEXT:    v_mov_b32_e32 v1, s16
5784; GFX11-NEXT:    s_add_i32 s4, s16, 0x400
5785; GFX11-NEXT:    v_pk_max_f16 v2, v0, v0
5786; GFX11-NEXT:    v_mov_b32_e32 v3, s4
5787; GFX11-NEXT:    s_mov_b32 s4, 0
5788; GFX11-NEXT:    buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
5789; GFX11-NEXT:  .LBB17_1: ; %atomicrmw.start
5790; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
5791; GFX11-NEXT:    s_waitcnt vmcnt(0)
5792; GFX11-NEXT:    v_pk_max_f16 v0, v1, v1
5793; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
5794; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5795; GFX11-NEXT:    v_pk_min_f16 v0, v0, v2
5796; GFX11-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
5797; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
5798; GFX11-NEXT:    s_waitcnt vmcnt(0)
5799; GFX11-NEXT:    buffer_gl1_inv
5800; GFX11-NEXT:    buffer_gl0_inv
5801; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
5802; GFX11-NEXT:    v_mov_b32_e32 v1, v4
5803; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
5804; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
5805; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
5806; GFX11-NEXT:    s_cbranch_execnz .LBB17_1
5807; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
5808; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
5809; GFX11-NEXT:    s_setpc_b64 s[30:31]
5810;
5811; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
5812; GFX10:       ; %bb.0:
5813; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5814; GFX10-NEXT:    v_mov_b32_e32 v1, s20
5815; GFX10-NEXT:    s_add_i32 s4, s20, 0x400
5816; GFX10-NEXT:    v_pk_max_f16 v2, v0, v0
5817; GFX10-NEXT:    v_mov_b32_e32 v3, s4
5818; GFX10-NEXT:    s_mov_b32 s4, 0
5819; GFX10-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
5820; GFX10-NEXT:  .LBB17_1: ; %atomicrmw.start
5821; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
5822; GFX10-NEXT:    s_waitcnt vmcnt(0)
5823; GFX10-NEXT:    v_pk_max_f16 v0, v1, v1
5824; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
5825; GFX10-NEXT:    v_pk_min_f16 v0, v0, v2
5826; GFX10-NEXT:    v_mov_b32_e32 v5, v1
5827; GFX10-NEXT:    v_mov_b32_e32 v4, v0
5828; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
5829; GFX10-NEXT:    s_waitcnt vmcnt(0)
5830; GFX10-NEXT:    buffer_gl1_inv
5831; GFX10-NEXT:    buffer_gl0_inv
5832; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
5833; GFX10-NEXT:    v_mov_b32_e32 v1, v4
5834; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
5835; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
5836; GFX10-NEXT:    s_cbranch_execnz .LBB17_1
5837; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
5838; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
5839; GFX10-NEXT:    s_setpc_b64 s[30:31]
5840;
5841; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
5842; GFX90A:       ; %bb.0:
5843; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5844; GFX90A-NEXT:    v_mov_b32_e32 v1, s20
5845; GFX90A-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
5846; GFX90A-NEXT:    s_add_i32 s6, s20, 0x400
5847; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
5848; GFX90A-NEXT:    v_pk_max_f16 v2, v0, v0
5849; GFX90A-NEXT:    v_mov_b32_e32 v3, s6
5850; GFX90A-NEXT:  .LBB17_1: ; %atomicrmw.start
5851; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
5852; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5853; GFX90A-NEXT:    v_pk_max_f16 v0, v1, v1
5854; GFX90A-NEXT:    v_pk_min_f16 v0, v0, v2
5855; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
5856; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
5857; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5858; GFX90A-NEXT:    buffer_wbinvl1
5859; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
5860; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5861; GFX90A-NEXT:    v_mov_b32_e32 v1, v4
5862; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5863; GFX90A-NEXT:    s_cbranch_execnz .LBB17_1
5864; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
5865; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
5866; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5867;
5868; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
5869; GFX908:       ; %bb.0:
5870; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5871; GFX908-NEXT:    v_mov_b32_e32 v1, s20
5872; GFX908-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
5873; GFX908-NEXT:    s_add_i32 s6, s20, 0x400
5874; GFX908-NEXT:    s_mov_b64 s[4:5], 0
5875; GFX908-NEXT:    v_pk_max_f16 v2, v0, v0
5876; GFX908-NEXT:    v_mov_b32_e32 v3, s6
5877; GFX908-NEXT:  .LBB17_1: ; %atomicrmw.start
5878; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
5879; GFX908-NEXT:    s_waitcnt vmcnt(0)
5880; GFX908-NEXT:    v_pk_max_f16 v0, v1, v1
5881; GFX908-NEXT:    v_pk_min_f16 v0, v0, v2
5882; GFX908-NEXT:    v_mov_b32_e32 v5, v1
5883; GFX908-NEXT:    v_mov_b32_e32 v4, v0
5884; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
5885; GFX908-NEXT:    s_waitcnt vmcnt(0)
5886; GFX908-NEXT:    buffer_wbinvl1
5887; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
5888; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5889; GFX908-NEXT:    v_mov_b32_e32 v1, v4
5890; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5891; GFX908-NEXT:    s_cbranch_execnz .LBB17_1
5892; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
5893; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
5894; GFX908-NEXT:    s_setpc_b64 s[30:31]
5895;
5896; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
5897; GFX8:       ; %bb.0:
5898; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5899; GFX8-NEXT:    v_mov_b32_e32 v1, s20
5900; GFX8-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
5901; GFX8-NEXT:    s_add_i32 s6, s20, 0x400
5902; GFX8-NEXT:    s_mov_b64 s[4:5], 0
5903; GFX8-NEXT:    v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
5904; GFX8-NEXT:    v_max_f16_e32 v3, v0, v0
5905; GFX8-NEXT:    v_mov_b32_e32 v4, s6
5906; GFX8-NEXT:  .LBB17_1: ; %atomicrmw.start
5907; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
5908; GFX8-NEXT:    s_waitcnt vmcnt(0)
5909; GFX8-NEXT:    v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
5910; GFX8-NEXT:    v_max_f16_e32 v5, v1, v1
5911; GFX8-NEXT:    v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
5912; GFX8-NEXT:    v_min_f16_e32 v5, v5, v3
5913; GFX8-NEXT:    v_or_b32_e32 v0, v5, v0
5914; GFX8-NEXT:    v_mov_b32_e32 v6, v1
5915; GFX8-NEXT:    v_mov_b32_e32 v5, v0
5916; GFX8-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
5917; GFX8-NEXT:    s_waitcnt vmcnt(0)
5918; GFX8-NEXT:    buffer_wbinvl1
5919; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v1
5920; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5921; GFX8-NEXT:    v_mov_b32_e32 v1, v5
5922; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5923; GFX8-NEXT:    s_cbranch_execnz .LBB17_1
5924; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
5925; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
5926; GFX8-NEXT:    s_setpc_b64 s[30:31]
5927;
5928; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
5929; GFX7:       ; %bb.0:
5930; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5931; GFX7-NEXT:    v_mov_b32_e32 v2, s20
5932; GFX7-NEXT:    buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
5933; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
5934; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v0
5935; GFX7-NEXT:    s_add_i32 s6, s20, 0x400
5936; GFX7-NEXT:    s_mov_b64 s[4:5], 0
5937; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v1
5938; GFX7-NEXT:    s_waitcnt vmcnt(0)
5939; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
5940; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v2
5941; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v1
5942; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v5
5943; GFX7-NEXT:    v_mov_b32_e32 v2, s6
5944; GFX7-NEXT:  .LBB17_1: ; %atomicrmw.start
5945; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
5946; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
5947; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
5948; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v4
5949; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v3
5950; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
5951; GFX7-NEXT:    v_min_f32_e32 v5, v5, v0
5952; GFX7-NEXT:    v_min_f32_e32 v6, v6, v1
5953; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v5
5954; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
5955; GFX7-NEXT:    v_or_b32_e32 v5, v3, v4
5956; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
5957; GFX7-NEXT:    v_or_b32_e32 v4, v6, v3
5958; GFX7-NEXT:    v_mov_b32_e32 v7, v5
5959; GFX7-NEXT:    v_mov_b32_e32 v6, v4
5960; GFX7-NEXT:    buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc
5961; GFX7-NEXT:    s_waitcnt vmcnt(0)
5962; GFX7-NEXT:    buffer_wbinvl1
5963; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v6
5964; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v6
5965; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
5966; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
5967; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5968; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5969; GFX7-NEXT:    s_cbranch_execnz .LBB17_1
5970; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
5971; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
5972; GFX7-NEXT:    s_setpc_b64 s[30:31]
5973;
5974; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
5975; GFX6:       ; %bb.0:
5976; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5977; GFX6-NEXT:    v_mov_b32_e32 v2, s20
5978; GFX6-NEXT:    buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
5979; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
5980; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v0
5981; GFX6-NEXT:    s_add_i32 s6, s20, 0x400
5982; GFX6-NEXT:    s_mov_b64 s[4:5], 0
5983; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v1
5984; GFX6-NEXT:    s_waitcnt vmcnt(0)
5985; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
5986; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v2
5987; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v1
5988; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v5
5989; GFX6-NEXT:    v_mov_b32_e32 v2, s6
5990; GFX6-NEXT:  .LBB17_1: ; %atomicrmw.start
5991; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
5992; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
5993; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
5994; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v4
5995; GFX6-NEXT:    s_waitcnt expcnt(0)
5996; GFX6-NEXT:    v_cvt_f32_f16_e32 v6, v3
5997; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
5998; GFX6-NEXT:    v_min_f32_e32 v5, v5, v0
5999; GFX6-NEXT:    v_min_f32_e32 v6, v6, v1
6000; GFX6-NEXT:    v_cvt_f16_f32_e32 v7, v5
6001; GFX6-NEXT:    v_cvt_f16_f32_e32 v6, v6
6002; GFX6-NEXT:    v_or_b32_e32 v5, v3, v4
6003; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
6004; GFX6-NEXT:    v_or_b32_e32 v4, v6, v3
6005; GFX6-NEXT:    v_mov_b32_e32 v7, v5
6006; GFX6-NEXT:    v_mov_b32_e32 v6, v4
6007; GFX6-NEXT:    buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc
6008; GFX6-NEXT:    s_waitcnt vmcnt(0)
6009; GFX6-NEXT:    buffer_wbinvl1
6010; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 16, v6
6011; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v6
6012; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
6013; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
6014; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6015; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6016; GFX6-NEXT:    s_cbranch_execnz .LBB17_1
6017; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
6018; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
6019; GFX6-NEXT:    s_waitcnt expcnt(0)
6020; GFX6-NEXT:    s_setpc_b64 s[30:31]
6021  %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256
6022  %unused = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
6023  ret void
6024}
6025
6026define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, <2 x half> %val) #0 {
6027; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
6028; GFX12:       ; %bb.0:
6029; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
6030; GFX12-NEXT:    s_wait_expcnt 0x0
6031; GFX12-NEXT:    s_wait_samplecnt 0x0
6032; GFX12-NEXT:    s_wait_bvhcnt 0x0
6033; GFX12-NEXT:    s_wait_kmcnt 0x0
6034; GFX12-NEXT:    v_add_nc_u32_e32 v7, 0x400, v4
6035; GFX12-NEXT:    s_mov_b32 s1, exec_lo
6036; GFX12-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
6037; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
6038; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
6039; GFX12-NEXT:    v_readfirstlane_b32 s6, v2
6040; GFX12-NEXT:    v_readfirstlane_b32 s7, v3
6041; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
6042; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
6043; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
6044; GFX12-NEXT:    s_wait_alu 0xfffe
6045; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6046; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
6047; GFX12-NEXT:    s_wait_alu 0xfffe
6048; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
6049; GFX12-NEXT:    s_wait_loadcnt 0x0
6050; GFX12-NEXT:    buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
6051; GFX12-NEXT:    ; implicit-def: $vgpr4
6052; GFX12-NEXT:    s_wait_alu 0xfffe
6053; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
6054; GFX12-NEXT:    s_cbranch_execnz .LBB18_1
6055; GFX12-NEXT:  ; %bb.2:
6056; GFX12-NEXT:    s_mov_b32 exec_lo, s1
6057; GFX12-NEXT:    v_pk_max_num_f16 v8, v5, v5
6058; GFX12-NEXT:    s_mov_b32 s1, 0
6059; GFX12-NEXT:  .LBB18_3: ; %atomicrmw.start
6060; GFX12-NEXT:    ; =>This Loop Header: Depth=1
6061; GFX12-NEXT:    ; Child Loop BB18_4 Depth 2
6062; GFX12-NEXT:    s_wait_loadcnt 0x0
6063; GFX12-NEXT:    v_pk_max_num_f16 v4, v6, v6
6064; GFX12-NEXT:    s_mov_b32 s2, exec_lo
6065; GFX12-NEXT:    s_wait_storecnt 0x0
6066; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6067; GFX12-NEXT:    v_pk_min_num_f16 v5, v4, v8
6068; GFX12-NEXT:    v_mov_b32_e32 v4, v5
6069; GFX12-NEXT:    v_mov_b32_e32 v5, v6
6070; GFX12-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
6071; GFX12-NEXT:    ; => This Inner Loop Header: Depth=2
6072; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
6073; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
6074; GFX12-NEXT:    v_readfirstlane_b32 s6, v2
6075; GFX12-NEXT:    v_readfirstlane_b32 s7, v3
6076; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
6077; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
6078; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
6079; GFX12-NEXT:    s_wait_alu 0xfffe
6080; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6081; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
6082; GFX12-NEXT:    s_wait_alu 0xfffe
6083; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
6084; GFX12-NEXT:    s_wait_loadcnt 0x0
6085; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
6086; GFX12-NEXT:    s_wait_alu 0xfffe
6087; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
6088; GFX12-NEXT:    s_cbranch_execnz .LBB18_4
6089; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
6090; GFX12-NEXT:    s_mov_b32 exec_lo, s2
6091; GFX12-NEXT:    s_wait_loadcnt 0x0
6092; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
6093; GFX12-NEXT:    v_mov_b32_e32 v6, v4
6094; GFX12-NEXT:    global_inv scope:SCOPE_DEV
6095; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
6096; GFX12-NEXT:    s_wait_alu 0xfffe
6097; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
6098; GFX12-NEXT:    s_cbranch_execnz .LBB18_3
6099; GFX12-NEXT:  ; %bb.6: ; %atomicrmw.end
6100; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
6101; GFX12-NEXT:    v_mov_b32_e32 v0, v4
6102; GFX12-NEXT:    s_wait_alu 0xfffe
6103; GFX12-NEXT:    s_setpc_b64 s[30:31]
6104;
6105; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
6106; GFX940:       ; %bb.0:
6107; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6108; GFX940-NEXT:    v_add_u32_e32 v8, 0x400, v4
6109; GFX940-NEXT:    s_mov_b64 s[2:3], exec
6110; GFX940-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
6111; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
6112; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
6113; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
6114; GFX940-NEXT:    v_readfirstlane_b32 s7, v3
6115; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
6116; GFX940-NEXT:    s_nop 0
6117; GFX940-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
6118; GFX940-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
6119; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
6120; GFX940-NEXT:    buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024
6121; GFX940-NEXT:    ; implicit-def: $vgpr4
6122; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
6123; GFX940-NEXT:    s_cbranch_execnz .LBB18_1
6124; GFX940-NEXT:  ; %bb.2:
6125; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
6126; GFX940-NEXT:    s_mov_b64 s[2:3], 0
6127; GFX940-NEXT:    v_pk_max_f16 v9, v5, v5
6128; GFX940-NEXT:  .LBB18_3: ; %atomicrmw.start
6129; GFX940-NEXT:    ; =>This Loop Header: Depth=1
6130; GFX940-NEXT:    ; Child Loop BB18_4 Depth 2
6131; GFX940-NEXT:    s_waitcnt vmcnt(0)
6132; GFX940-NEXT:    v_pk_max_f16 v4, v7, v7
6133; GFX940-NEXT:    s_mov_b64 s[8:9], exec
6134; GFX940-NEXT:    v_pk_min_f16 v6, v4, v9
6135; GFX940-NEXT:    buffer_wbl2 sc1
6136; GFX940-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
6137; GFX940-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
6138; GFX940-NEXT:    ; => This Inner Loop Header: Depth=2
6139; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
6140; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
6141; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
6142; GFX940-NEXT:    v_readfirstlane_b32 s7, v3
6143; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
6144; GFX940-NEXT:    s_nop 0
6145; GFX940-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
6146; GFX940-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
6147; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
6148; GFX940-NEXT:    s_waitcnt vmcnt(0)
6149; GFX940-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
6150; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
6151; GFX940-NEXT:    s_cbranch_execnz .LBB18_4
6152; GFX940-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
6153; GFX940-NEXT:    s_mov_b64 exec, s[8:9]
6154; GFX940-NEXT:    s_waitcnt vmcnt(0)
6155; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
6156; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
6157; GFX940-NEXT:    v_mov_b32_e32 v7, v4
6158; GFX940-NEXT:    buffer_inv sc1
6159; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
6160; GFX940-NEXT:    s_cbranch_execnz .LBB18_3
6161; GFX940-NEXT:  ; %bb.6: ; %atomicrmw.end
6162; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
6163; GFX940-NEXT:    v_mov_b32_e32 v0, v4
6164; GFX940-NEXT:    s_setpc_b64 s[30:31]
6165;
6166; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
6167; GFX11:       ; %bb.0:
6168; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6169; GFX11-NEXT:    v_add_nc_u32_e32 v7, 0x400, v4
6170; GFX11-NEXT:    s_mov_b32 s1, 0
6171; GFX11-NEXT:    s_mov_b32 s2, exec_lo
6172; GFX11-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
6173; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
6174; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
6175; GFX11-NEXT:    v_readfirstlane_b32 s6, v2
6176; GFX11-NEXT:    v_readfirstlane_b32 s7, v3
6177; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
6178; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
6179; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
6180; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
6181; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
6182; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
6183; GFX11-NEXT:    buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
6184; GFX11-NEXT:    ; implicit-def: $vgpr4
6185; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
6186; GFX11-NEXT:    s_cbranch_execnz .LBB18_1
6187; GFX11-NEXT:  ; %bb.2:
6188; GFX11-NEXT:    s_mov_b32 exec_lo, s2
6189; GFX11-NEXT:    v_pk_max_f16 v8, v5, v5
6190; GFX11-NEXT:    .p2align 6
6191; GFX11-NEXT:  .LBB18_3: ; %atomicrmw.start
6192; GFX11-NEXT:    ; =>This Loop Header: Depth=1
6193; GFX11-NEXT:    ; Child Loop BB18_4 Depth 2
6194; GFX11-NEXT:    s_waitcnt vmcnt(0)
6195; GFX11-NEXT:    v_pk_max_f16 v4, v6, v6
6196; GFX11-NEXT:    s_mov_b32 s2, exec_lo
6197; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
6198; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6199; GFX11-NEXT:    v_pk_min_f16 v5, v4, v8
6200; GFX11-NEXT:    v_mov_b32_e32 v4, v5
6201; GFX11-NEXT:    v_mov_b32_e32 v5, v6
6202; GFX11-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
6203; GFX11-NEXT:    ; => This Inner Loop Header: Depth=2
6204; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
6205; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
6206; GFX11-NEXT:    v_readfirstlane_b32 s6, v2
6207; GFX11-NEXT:    v_readfirstlane_b32 s7, v3
6208; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
6209; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
6210; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
6211; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
6212; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
6213; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
6214; GFX11-NEXT:    s_waitcnt vmcnt(0)
6215; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
6216; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
6217; GFX11-NEXT:    s_cbranch_execnz .LBB18_4
6218; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
6219; GFX11-NEXT:    s_mov_b32 exec_lo, s2
6220; GFX11-NEXT:    s_waitcnt vmcnt(0)
6221; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
6222; GFX11-NEXT:    v_mov_b32_e32 v6, v4
6223; GFX11-NEXT:    buffer_gl1_inv
6224; GFX11-NEXT:    buffer_gl0_inv
6225; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
6226; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
6227; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
6228; GFX11-NEXT:    s_cbranch_execnz .LBB18_3
6229; GFX11-NEXT:  ; %bb.6: ; %atomicrmw.end
6230; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
6231; GFX11-NEXT:    v_mov_b32_e32 v0, v4
6232; GFX11-NEXT:    s_setpc_b64 s[30:31]
6233;
6234; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
6235; GFX10:       ; %bb.0:
6236; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6237; GFX10-NEXT:    v_add_nc_u32_e32 v7, 0x400, v4
6238; GFX10-NEXT:    s_mov_b32 s5, 0
6239; GFX10-NEXT:    s_mov_b32 s6, exec_lo
6240; GFX10-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
6241; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
6242; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
6243; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
6244; GFX10-NEXT:    v_readfirstlane_b32 s11, v3
6245; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
6246; GFX10-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
6247; GFX10-NEXT:    s_and_b32 s4, vcc_lo, s4
6248; GFX10-NEXT:    s_and_saveexec_b32 s4, s4
6249; GFX10-NEXT:    buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
6250; GFX10-NEXT:    ; implicit-def: $vgpr4
6251; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
6252; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
6253; GFX10-NEXT:    s_cbranch_execnz .LBB18_1
6254; GFX10-NEXT:  ; %bb.2:
6255; GFX10-NEXT:    s_mov_b32 exec_lo, s6
6256; GFX10-NEXT:    v_pk_max_f16 v8, v5, v5
6257; GFX10-NEXT:  .LBB18_3: ; %atomicrmw.start
6258; GFX10-NEXT:    ; =>This Loop Header: Depth=1
6259; GFX10-NEXT:    ; Child Loop BB18_4 Depth 2
6260; GFX10-NEXT:    s_waitcnt vmcnt(0)
6261; GFX10-NEXT:    v_pk_max_f16 v4, v6, v6
6262; GFX10-NEXT:    s_mov_b32 s6, exec_lo
6263; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
6264; GFX10-NEXT:    v_pk_min_f16 v5, v4, v8
6265; GFX10-NEXT:    v_mov_b32_e32 v4, v5
6266; GFX10-NEXT:    v_mov_b32_e32 v5, v6
6267; GFX10-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
6268; GFX10-NEXT:    ; => This Inner Loop Header: Depth=2
6269; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
6270; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
6271; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
6272; GFX10-NEXT:    v_readfirstlane_b32 s11, v3
6273; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
6274; GFX10-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
6275; GFX10-NEXT:    s_and_b32 s4, vcc_lo, s4
6276; GFX10-NEXT:    s_and_saveexec_b32 s4, s4
6277; GFX10-NEXT:    s_waitcnt vmcnt(0)
6278; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
6279; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
6280; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
6281; GFX10-NEXT:    s_cbranch_execnz .LBB18_4
6282; GFX10-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
6283; GFX10-NEXT:    s_mov_b32 exec_lo, s6
6284; GFX10-NEXT:    s_waitcnt vmcnt(0)
6285; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
6286; GFX10-NEXT:    v_mov_b32_e32 v6, v4
6287; GFX10-NEXT:    buffer_gl1_inv
6288; GFX10-NEXT:    buffer_gl0_inv
6289; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
6290; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
6291; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
6292; GFX10-NEXT:    s_cbranch_execnz .LBB18_3
6293; GFX10-NEXT:  ; %bb.6: ; %atomicrmw.end
6294; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
6295; GFX10-NEXT:    v_mov_b32_e32 v0, v4
6296; GFX10-NEXT:    s_setpc_b64 s[30:31]
6297;
6298; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
6299; GFX90A:       ; %bb.0:
6300; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6301; GFX90A-NEXT:    v_add_u32_e32 v8, 0x400, v4
6302; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
6303; GFX90A-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
6304; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
6305; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
6306; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
6307; GFX90A-NEXT:    v_readfirstlane_b32 s11, v3
6308; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
6309; GFX90A-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
6310; GFX90A-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
6311; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
6312; GFX90A-NEXT:    s_nop 0
6313; GFX90A-NEXT:    buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
6314; GFX90A-NEXT:    ; implicit-def: $vgpr4
6315; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
6316; GFX90A-NEXT:    s_cbranch_execnz .LBB18_1
6317; GFX90A-NEXT:  ; %bb.2:
6318; GFX90A-NEXT:    s_mov_b64 exec, s[6:7]
6319; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
6320; GFX90A-NEXT:    v_pk_max_f16 v9, v5, v5
6321; GFX90A-NEXT:  .LBB18_3: ; %atomicrmw.start
6322; GFX90A-NEXT:    ; =>This Loop Header: Depth=1
6323; GFX90A-NEXT:    ; Child Loop BB18_4 Depth 2
6324; GFX90A-NEXT:    s_waitcnt vmcnt(0)
6325; GFX90A-NEXT:    v_pk_max_f16 v4, v7, v7
6326; GFX90A-NEXT:    v_pk_min_f16 v6, v4, v9
6327; GFX90A-NEXT:    s_mov_b64 s[12:13], exec
6328; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
6329; GFX90A-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
6330; GFX90A-NEXT:    ; => This Inner Loop Header: Depth=2
6331; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
6332; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
6333; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
6334; GFX90A-NEXT:    v_readfirstlane_b32 s11, v3
6335; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
6336; GFX90A-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
6337; GFX90A-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
6338; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
6339; GFX90A-NEXT:    s_waitcnt vmcnt(0)
6340; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
6341; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
6342; GFX90A-NEXT:    s_cbranch_execnz .LBB18_4
6343; GFX90A-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
6344; GFX90A-NEXT:    s_mov_b64 exec, s[12:13]
6345; GFX90A-NEXT:    s_waitcnt vmcnt(0)
6346; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
6347; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
6348; GFX90A-NEXT:    v_mov_b32_e32 v7, v4
6349; GFX90A-NEXT:    buffer_wbinvl1
6350; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
6351; GFX90A-NEXT:    s_cbranch_execnz .LBB18_3
6352; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.end
6353; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
6354; GFX90A-NEXT:    v_mov_b32_e32 v0, v4
6355; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6356;
6357; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
6358; GFX908:       ; %bb.0:
6359; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6360; GFX908-NEXT:    v_add_u32_e32 v7, 0x400, v4
6361; GFX908-NEXT:    s_mov_b64 s[6:7], exec
6362; GFX908-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
6363; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
6364; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
6365; GFX908-NEXT:    v_readfirstlane_b32 s10, v2
6366; GFX908-NEXT:    v_readfirstlane_b32 s11, v3
6367; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
6368; GFX908-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
6369; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
6370; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
6371; GFX908-NEXT:    s_nop 0
6372; GFX908-NEXT:    buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
6373; GFX908-NEXT:    ; implicit-def: $vgpr4
6374; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
6375; GFX908-NEXT:    s_cbranch_execnz .LBB18_1
6376; GFX908-NEXT:  ; %bb.2:
6377; GFX908-NEXT:    s_mov_b64 exec, s[6:7]
6378; GFX908-NEXT:    s_mov_b64 s[6:7], 0
6379; GFX908-NEXT:    v_pk_max_f16 v8, v5, v5
6380; GFX908-NEXT:  .LBB18_3: ; %atomicrmw.start
6381; GFX908-NEXT:    ; =>This Loop Header: Depth=1
6382; GFX908-NEXT:    ; Child Loop BB18_4 Depth 2
6383; GFX908-NEXT:    s_waitcnt vmcnt(0)
6384; GFX908-NEXT:    v_pk_max_f16 v4, v6, v6
6385; GFX908-NEXT:    v_pk_min_f16 v5, v4, v8
6386; GFX908-NEXT:    v_mov_b32_e32 v4, v5
6387; GFX908-NEXT:    s_mov_b64 s[12:13], exec
6388; GFX908-NEXT:    v_mov_b32_e32 v5, v6
6389; GFX908-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
6390; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
6391; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
6392; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
6393; GFX908-NEXT:    v_readfirstlane_b32 s10, v2
6394; GFX908-NEXT:    v_readfirstlane_b32 s11, v3
6395; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
6396; GFX908-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
6397; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
6398; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
6399; GFX908-NEXT:    s_waitcnt vmcnt(0)
6400; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
6401; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
6402; GFX908-NEXT:    s_cbranch_execnz .LBB18_4
6403; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
6404; GFX908-NEXT:    s_mov_b64 exec, s[12:13]
6405; GFX908-NEXT:    s_waitcnt vmcnt(0)
6406; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
6407; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
6408; GFX908-NEXT:    v_mov_b32_e32 v6, v4
6409; GFX908-NEXT:    buffer_wbinvl1
6410; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
6411; GFX908-NEXT:    s_cbranch_execnz .LBB18_3
6412; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.end
6413; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
6414; GFX908-NEXT:    v_mov_b32_e32 v0, v4
6415; GFX908-NEXT:    s_setpc_b64 s[30:31]
6416;
6417; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
6418; GFX8:       ; %bb.0:
6419; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6420; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x400, v4
6421; GFX8-NEXT:    s_mov_b64 s[6:7], exec
6422; GFX8-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
6423; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
6424; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
6425; GFX8-NEXT:    v_readfirstlane_b32 s10, v2
6426; GFX8-NEXT:    v_readfirstlane_b32 s11, v3
6427; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
6428; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
6429; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
6430; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
6431; GFX8-NEXT:    s_nop 0
6432; GFX8-NEXT:    buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
6433; GFX8-NEXT:    ; implicit-def: $vgpr4
6434; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
6435; GFX8-NEXT:    s_cbranch_execnz .LBB18_1
6436; GFX8-NEXT:  ; %bb.2:
6437; GFX8-NEXT:    s_mov_b64 exec, s[6:7]
6438; GFX8-NEXT:    s_mov_b64 s[6:7], 0
6439; GFX8-NEXT:    v_max_f16_sdwa v8, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
6440; GFX8-NEXT:    v_max_f16_e32 v9, v5, v5
6441; GFX8-NEXT:  .LBB18_3: ; %atomicrmw.start
6442; GFX8-NEXT:    ; =>This Loop Header: Depth=1
6443; GFX8-NEXT:    ; Child Loop BB18_4 Depth 2
6444; GFX8-NEXT:    s_waitcnt vmcnt(0)
6445; GFX8-NEXT:    v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
6446; GFX8-NEXT:    v_max_f16_e32 v5, v6, v6
6447; GFX8-NEXT:    v_min_f16_sdwa v4, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
6448; GFX8-NEXT:    v_min_f16_e32 v5, v5, v9
6449; GFX8-NEXT:    v_or_b32_e32 v5, v5, v4
6450; GFX8-NEXT:    v_mov_b32_e32 v4, v5
6451; GFX8-NEXT:    s_mov_b64 s[12:13], exec
6452; GFX8-NEXT:    v_mov_b32_e32 v5, v6
6453; GFX8-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
6454; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
6455; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
6456; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
6457; GFX8-NEXT:    v_readfirstlane_b32 s10, v2
6458; GFX8-NEXT:    v_readfirstlane_b32 s11, v3
6459; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
6460; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
6461; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
6462; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
6463; GFX8-NEXT:    s_waitcnt vmcnt(0)
6464; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
6465; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
6466; GFX8-NEXT:    s_cbranch_execnz .LBB18_4
6467; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
6468; GFX8-NEXT:    s_mov_b64 exec, s[12:13]
6469; GFX8-NEXT:    s_waitcnt vmcnt(0)
6470; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
6471; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
6472; GFX8-NEXT:    v_mov_b32_e32 v6, v4
6473; GFX8-NEXT:    buffer_wbinvl1
6474; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
6475; GFX8-NEXT:    s_cbranch_execnz .LBB18_3
6476; GFX8-NEXT:  ; %bb.6: ; %atomicrmw.end
6477; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
6478; GFX8-NEXT:    v_mov_b32_e32 v0, v4
6479; GFX8-NEXT:    s_setpc_b64 s[30:31]
6480;
6481; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
6482; GFX7:       ; %bb.0:
6483; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6484; GFX7-NEXT:    v_add_i32_e32 v9, vcc, 0x400, v4
6485; GFX7-NEXT:    s_mov_b64 s[6:7], exec
6486; GFX7-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
6487; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
6488; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
6489; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
6490; GFX7-NEXT:    v_readfirstlane_b32 s11, v3
6491; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
6492; GFX7-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
6493; GFX7-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
6494; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
6495; GFX7-NEXT:    buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
6496; GFX7-NEXT:    ; implicit-def: $vgpr4
6497; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
6498; GFX7-NEXT:    s_cbranch_execnz .LBB18_1
6499; GFX7-NEXT:  ; %bb.2:
6500; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
6501; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
6502; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v5
6503; GFX7-NEXT:    s_waitcnt vmcnt(0)
6504; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
6505; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v7
6506; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
6507; GFX7-NEXT:    v_cvt_f32_f16_e32 v10, v6
6508; GFX7-NEXT:    v_cvt_f32_f16_e32 v11, v8
6509; GFX7-NEXT:    s_mov_b64 s[6:7], 0
6510; GFX7-NEXT:  .LBB18_3: ; %atomicrmw.start
6511; GFX7-NEXT:    ; =>This Loop Header: Depth=1
6512; GFX7-NEXT:    ; Child Loop BB18_4 Depth 2
6513; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
6514; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
6515; GFX7-NEXT:    s_mov_b64 s[12:13], exec
6516; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v5
6517; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v4
6518; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
6519; GFX7-NEXT:    v_min_f32_e32 v6, v6, v10
6520; GFX7-NEXT:    v_min_f32_e32 v7, v7, v11
6521; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v6
6522; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
6523; GFX7-NEXT:    v_or_b32_e32 v6, v4, v5
6524; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
6525; GFX7-NEXT:    v_or_b32_e32 v5, v7, v4
6526; GFX7-NEXT:    v_mov_b32_e32 v8, v6
6527; GFX7-NEXT:    v_mov_b32_e32 v7, v5
6528; GFX7-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
6529; GFX7-NEXT:    ; => This Inner Loop Header: Depth=2
6530; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
6531; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
6532; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
6533; GFX7-NEXT:    v_readfirstlane_b32 s11, v3
6534; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
6535; GFX7-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
6536; GFX7-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
6537; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
6538; GFX7-NEXT:    s_waitcnt vmcnt(0)
6539; GFX7-NEXT:    buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc
6540; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
6541; GFX7-NEXT:    s_cbranch_execnz .LBB18_4
6542; GFX7-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
6543; GFX7-NEXT:    s_mov_b64 exec, s[12:13]
6544; GFX7-NEXT:    s_waitcnt vmcnt(0)
6545; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
6546; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v7
6547; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
6548; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
6549; GFX7-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
6550; GFX7-NEXT:    buffer_wbinvl1
6551; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
6552; GFX7-NEXT:    s_cbranch_execnz .LBB18_3
6553; GFX7-NEXT:  ; %bb.6: ; %atomicrmw.end
6554; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
6555; GFX7-NEXT:    v_mov_b32_e32 v0, v4
6556; GFX7-NEXT:    v_mov_b32_e32 v1, v5
6557; GFX7-NEXT:    s_setpc_b64 s[30:31]
6558;
6559; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
6560; GFX6:       ; %bb.0:
6561; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6562; GFX6-NEXT:    v_add_i32_e32 v9, vcc, 0x400, v4
6563; GFX6-NEXT:    s_mov_b64 s[6:7], exec
6564; GFX6-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
6565; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
6566; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
6567; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
6568; GFX6-NEXT:    v_readfirstlane_b32 s11, v3
6569; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
6570; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
6571; GFX6-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
6572; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
6573; GFX6-NEXT:    buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
6574; GFX6-NEXT:    ; implicit-def: $vgpr4
6575; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
6576; GFX6-NEXT:    s_cbranch_execnz .LBB18_1
6577; GFX6-NEXT:  ; %bb.2:
6578; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
6579; GFX6-NEXT:    v_cvt_f16_f32_e32 v6, v6
6580; GFX6-NEXT:    v_cvt_f16_f32_e32 v8, v5
6581; GFX6-NEXT:    s_waitcnt vmcnt(0)
6582; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
6583; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v7
6584; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
6585; GFX6-NEXT:    v_cvt_f32_f16_e32 v10, v6
6586; GFX6-NEXT:    v_cvt_f32_f16_e32 v11, v8
6587; GFX6-NEXT:    s_mov_b64 s[6:7], 0
6588; GFX6-NEXT:  .LBB18_3: ; %atomicrmw.start
6589; GFX6-NEXT:    ; =>This Loop Header: Depth=1
6590; GFX6-NEXT:    ; Child Loop BB18_4 Depth 2
6591; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v5
6592; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
6593; GFX6-NEXT:    s_mov_b64 s[12:13], exec
6594; GFX6-NEXT:    v_cvt_f32_f16_e32 v6, v5
6595; GFX6-NEXT:    s_waitcnt expcnt(0)
6596; GFX6-NEXT:    v_cvt_f32_f16_e32 v7, v4
6597; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
6598; GFX6-NEXT:    v_min_f32_e32 v6, v6, v10
6599; GFX6-NEXT:    v_min_f32_e32 v7, v7, v11
6600; GFX6-NEXT:    v_cvt_f16_f32_e32 v8, v6
6601; GFX6-NEXT:    v_cvt_f16_f32_e32 v7, v7
6602; GFX6-NEXT:    v_or_b32_e32 v6, v4, v5
6603; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
6604; GFX6-NEXT:    v_or_b32_e32 v5, v7, v4
6605; GFX6-NEXT:    v_mov_b32_e32 v8, v6
6606; GFX6-NEXT:    v_mov_b32_e32 v7, v5
6607; GFX6-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
6608; GFX6-NEXT:    ; => This Inner Loop Header: Depth=2
6609; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
6610; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
6611; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
6612; GFX6-NEXT:    v_readfirstlane_b32 s11, v3
6613; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
6614; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
6615; GFX6-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
6616; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
6617; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
6618; GFX6-NEXT:    buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc
6619; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
6620; GFX6-NEXT:    s_cbranch_execnz .LBB18_4
6621; GFX6-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
6622; GFX6-NEXT:    s_mov_b64 exec, s[12:13]
6623; GFX6-NEXT:    s_waitcnt vmcnt(0)
6624; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
6625; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v7
6626; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
6627; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
6628; GFX6-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
6629; GFX6-NEXT:    buffer_wbinvl1
6630; GFX6-NEXT:    s_andn2_b64 exec, exec, s[6:7]
6631; GFX6-NEXT:    s_cbranch_execnz .LBB18_3
6632; GFX6-NEXT:  ; %bb.6: ; %atomicrmw.end
6633; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
6634; GFX6-NEXT:    v_mov_b32_e32 v0, v4
6635; GFX6-NEXT:    v_mov_b32_e32 v1, v5
6636; GFX6-NEXT:    s_waitcnt expcnt(0)
6637; GFX6-NEXT:    s_setpc_b64 s[30:31]
6638  %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256
6639  %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
6640  ret <2 x half> %result
6641}
6642
6643; --------------------------------------------------------------------
6644; <2 x bfloat>
6645; --------------------------------------------------------------------
6646
; Value-returning variant: atomicrmw fmin of a <2 x bfloat> through a buffer fat
; pointer (ptr addrspace(7)), syncscope("agent"), seq_cst, tagged with
; !amdgpu.no.fine.grained.memory. The result of the atomicrmw is returned.
; In the expected output below, every checked target performs the operation as
; a buffer_atomic_cmpswap retry loop (block %atomicrmw.start): both bf16 lanes
; are moved into the high half of a 32-bit value, combined with a 32-bit float
; min, repacked into one dword, and the loop repeats until the value returned
; by the cmpswap equals the value the min was computed from.
6647define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 {
6648; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
6649; GFX12:       ; %bb.0:
6650; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
6651; GFX12-NEXT:    s_wait_expcnt 0x0
6652; GFX12-NEXT:    s_wait_samplecnt 0x0
6653; GFX12-NEXT:    s_wait_bvhcnt 0x0
6654; GFX12-NEXT:    s_wait_kmcnt 0x0
6655; GFX12-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
6656; GFX12-NEXT:    s_add_co_i32 s4, s16, 0x400
6657; GFX12-NEXT:    s_mov_b32 s5, 0
6658; GFX12-NEXT:    s_wait_alu 0xfffe
6659; GFX12-NEXT:    v_mov_b32_e32 v4, s4
6660; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
6661; GFX12-NEXT:    buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
6662; GFX12-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
6663; GFX12-NEXT:  .LBB19_1: ; %atomicrmw.start
6664; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
6665; GFX12-NEXT:    s_wait_loadcnt 0x0
6666; GFX12-NEXT:    v_mov_b32_e32 v6, v0
6667; GFX12-NEXT:    s_wait_storecnt 0x0
6668; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6669; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff0000, v6
6670; GFX12-NEXT:    v_min_num_f32_e32 v1, v1, v3
6671; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
6672; GFX12-NEXT:    v_bfe_u32 v7, v1, 16, 1
6673; GFX12-NEXT:    v_or_b32_e32 v9, 0x400000, v1
6674; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
6675; GFX12-NEXT:    v_add3_u32 v7, v7, v1, 0x7fff
6676; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6677; GFX12-NEXT:    v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6
6678; GFX12-NEXT:    v_min_num_f32_e32 v0, v0, v2
6679; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
6680; GFX12-NEXT:    v_bfe_u32 v5, v0, 16, 1
6681; GFX12-NEXT:    v_or_b32_e32 v8, 0x400000, v0
6682; GFX12-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
6683; GFX12-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
6684; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6685; GFX12-NEXT:    v_cndmask_b32_e64 v0, v5, v8, s4
6686; GFX12-NEXT:    v_perm_b32 v5, v1, v0, 0x7060302
6687; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6688; GFX12-NEXT:    v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
6689; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
6690; GFX12-NEXT:    s_wait_loadcnt 0x0
6691; GFX12-NEXT:    global_inv scope:SCOPE_DEV
6692; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
6693; GFX12-NEXT:    s_wait_alu 0xfffe
6694; GFX12-NEXT:    s_or_b32 s5, vcc_lo, s5
6695; GFX12-NEXT:    s_wait_alu 0xfffe
6696; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
6697; GFX12-NEXT:    s_cbranch_execnz .LBB19_1
6698; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
6699; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s5
6700; GFX12-NEXT:    s_wait_alu 0xfffe
6701; GFX12-NEXT:    s_setpc_b64 s[30:31]
6702;
6703; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
6704; GFX940:       ; %bb.0:
6705; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6706; GFX940-NEXT:    v_mov_b32_e32 v1, v0
6707; GFX940-NEXT:    v_mov_b32_e32 v0, s16
6708; GFX940-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
6709; GFX940-NEXT:    s_add_i32 s4, s16, 0x400
6710; GFX940-NEXT:    s_mov_b64 s[6:7], 0
6711; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
6712; GFX940-NEXT:    s_movk_i32 s8, 0x7fff
6713; GFX940-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
6714; GFX940-NEXT:    s_mov_b32 s9, 0x7060302
6715; GFX940-NEXT:    v_mov_b32_e32 v4, s4
6716; GFX940-NEXT:  .LBB19_1: ; %atomicrmw.start
6717; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
6718; GFX940-NEXT:    s_waitcnt vmcnt(0)
6719; GFX940-NEXT:    v_mov_b32_e32 v7, v0
6720; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 16, v7
6721; GFX940-NEXT:    v_and_b32_e32 v1, 0xffff0000, v7
6722; GFX940-NEXT:    v_min_f32_e32 v0, v0, v2
6723; GFX940-NEXT:    v_min_f32_e32 v1, v1, v3
6724; GFX940-NEXT:    v_bfe_u32 v5, v0, 16, 1
6725; GFX940-NEXT:    v_bfe_u32 v8, v1, 16, 1
6726; GFX940-NEXT:    v_or_b32_e32 v6, 0x400000, v0
6727; GFX940-NEXT:    v_or_b32_e32 v9, 0x400000, v1
6728; GFX940-NEXT:    v_add3_u32 v5, v5, v0, s8
6729; GFX940-NEXT:    v_add3_u32 v8, v8, v1, s8
6730; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
6731; GFX940-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
6732; GFX940-NEXT:    buffer_wbl2 sc1
6733; GFX940-NEXT:    v_cndmask_b32_e32 v1, v8, v9, vcc
6734; GFX940-NEXT:    v_cndmask_b32_e64 v0, v5, v6, s[4:5]
6735; GFX940-NEXT:    v_perm_b32 v6, v1, v0, s9
6736; GFX940-NEXT:    v_mov_b64_e32 v[0:1], v[6:7]
6737; GFX940-NEXT:    buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0
6738; GFX940-NEXT:    s_waitcnt vmcnt(0)
6739; GFX940-NEXT:    buffer_inv sc1
6740; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v7
6741; GFX940-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
6742; GFX940-NEXT:    s_andn2_b64 exec, exec, s[6:7]
6743; GFX940-NEXT:    s_cbranch_execnz .LBB19_1
6744; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
6745; GFX940-NEXT:    s_or_b64 exec, exec, s[6:7]
6746; GFX940-NEXT:    s_setpc_b64 s[30:31]
6747;
6748; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
6749; GFX11:       ; %bb.0:
6750; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6751; GFX11-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
6752; GFX11-NEXT:    s_add_i32 s4, s16, 0x400
6753; GFX11-NEXT:    s_mov_b32 s5, 0
6754; GFX11-NEXT:    v_mov_b32_e32 v4, s4
6755; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
6756; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
6757; GFX11-NEXT:    buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
6758; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
6759; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
6760; GFX11-NEXT:    .p2align 6
6761; GFX11-NEXT:  .LBB19_1: ; %atomicrmw.start
6762; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
6763; GFX11-NEXT:    s_waitcnt vmcnt(0)
6764; GFX11-NEXT:    v_mov_b32_e32 v6, v0
6765; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
6766; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6767; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v6
6768; GFX11-NEXT:    v_min_f32_e32 v1, v1, v3
6769; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
6770; GFX11-NEXT:    v_bfe_u32 v7, v1, 16, 1
6771; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v1
6772; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
6773; GFX11-NEXT:    v_add3_u32 v7, v7, v1, 0x7fff
6774; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6775; GFX11-NEXT:    v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6
6776; GFX11-NEXT:    v_min_f32_e32 v0, v0, v2
6777; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
6778; GFX11-NEXT:    v_bfe_u32 v5, v0, 16, 1
6779; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v0
6780; GFX11-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
6781; GFX11-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
6782; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6783; GFX11-NEXT:    v_cndmask_b32_e64 v0, v5, v8, s4
6784; GFX11-NEXT:    v_perm_b32 v5, v1, v0, 0x7060302
6785; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6786; GFX11-NEXT:    v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
6787; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
6788; GFX11-NEXT:    s_waitcnt vmcnt(0)
6789; GFX11-NEXT:    buffer_gl1_inv
6790; GFX11-NEXT:    buffer_gl0_inv
6791; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
6792; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
6793; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
6794; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
6795; GFX11-NEXT:    s_cbranch_execnz .LBB19_1
6796; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
6797; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
6798; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s5
6799; GFX11-NEXT:    s_setpc_b64 s[30:31]
6800;
6801; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
6802; GFX10:       ; %bb.0:
6803; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6804; GFX10-NEXT:    v_mov_b32_e32 v1, v0
6805; GFX10-NEXT:    v_mov_b32_e32 v0, s20
6806; GFX10-NEXT:    s_add_i32 s4, s20, 0x400
6807; GFX10-NEXT:    s_mov_b32 s5, 0
6808; GFX10-NEXT:    v_mov_b32_e32 v4, s4
6809; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
6810; GFX10-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
6811; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
6812; GFX10-NEXT:  .LBB19_1: ; %atomicrmw.start
6813; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
6814; GFX10-NEXT:    s_waitcnt vmcnt(0)
6815; GFX10-NEXT:    v_mov_b32_e32 v6, v0
6816; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
6817; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
6818; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v6
6819; GFX10-NEXT:    v_min_f32_e32 v0, v0, v2
6820; GFX10-NEXT:    v_min_f32_e32 v1, v1, v3
6821; GFX10-NEXT:    v_bfe_u32 v5, v0, 16, 1
6822; GFX10-NEXT:    v_bfe_u32 v7, v1, 16, 1
6823; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
6824; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v1
6825; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
6826; GFX10-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
6827; GFX10-NEXT:    v_add3_u32 v7, v7, v1, 0x7fff
6828; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
6829; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v9, vcc_lo
6830; GFX10-NEXT:    v_cndmask_b32_e64 v0, v5, v8, s4
6831; GFX10-NEXT:    v_perm_b32 v5, v1, v0, 0x7060302
6832; GFX10-NEXT:    v_mov_b32_e32 v0, v5
6833; GFX10-NEXT:    v_mov_b32_e32 v1, v6
6834; GFX10-NEXT:    buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
6835; GFX10-NEXT:    s_waitcnt vmcnt(0)
6836; GFX10-NEXT:    buffer_gl1_inv
6837; GFX10-NEXT:    buffer_gl0_inv
6838; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
6839; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
6840; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
6841; GFX10-NEXT:    s_cbranch_execnz .LBB19_1
6842; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
6843; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
6844; GFX10-NEXT:    s_setpc_b64 s[30:31]
6845;
6846; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
6847; GFX90A:       ; %bb.0:
6848; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6849; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
6850; GFX90A-NEXT:    v_mov_b32_e32 v0, s20
6851; GFX90A-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
6852; GFX90A-NEXT:    s_add_i32 s4, s20, 0x400
6853; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
6854; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
6855; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
6856; GFX90A-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
6857; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
6858; GFX90A-NEXT:    v_mov_b32_e32 v4, s4
6859; GFX90A-NEXT:  .LBB19_1: ; %atomicrmw.start
6860; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
6861; GFX90A-NEXT:    s_waitcnt vmcnt(0)
6862; GFX90A-NEXT:    v_mov_b32_e32 v7, v0
6863; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v7
6864; GFX90A-NEXT:    v_and_b32_e32 v1, 0xffff0000, v7
6865; GFX90A-NEXT:    v_min_f32_e32 v0, v0, v2
6866; GFX90A-NEXT:    v_min_f32_e32 v1, v1, v3
6867; GFX90A-NEXT:    v_bfe_u32 v5, v0, 16, 1
6868; GFX90A-NEXT:    v_bfe_u32 v8, v1, 16, 1
6869; GFX90A-NEXT:    v_or_b32_e32 v6, 0x400000, v0
6870; GFX90A-NEXT:    v_or_b32_e32 v9, 0x400000, v1
6871; GFX90A-NEXT:    v_add3_u32 v5, v5, v0, s8
6872; GFX90A-NEXT:    v_add3_u32 v8, v8, v1, s8
6873; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
6874; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
6875; GFX90A-NEXT:    v_cndmask_b32_e64 v0, v5, v6, s[4:5]
6876; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v8, v9, vcc
6877; GFX90A-NEXT:    v_perm_b32 v6, v1, v0, s9
6878; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1]
6879; GFX90A-NEXT:    buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
6880; GFX90A-NEXT:    s_waitcnt vmcnt(0)
6881; GFX90A-NEXT:    buffer_wbinvl1
6882; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v7
6883; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
6884; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
6885; GFX90A-NEXT:    s_cbranch_execnz .LBB19_1
6886; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
6887; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
6888; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6889;
6890; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
6891; GFX908:       ; %bb.0:
6892; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6893; GFX908-NEXT:    v_mov_b32_e32 v1, v0
6894; GFX908-NEXT:    v_mov_b32_e32 v0, s20
6895; GFX908-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
6896; GFX908-NEXT:    s_add_i32 s4, s20, 0x400
6897; GFX908-NEXT:    s_mov_b64 s[6:7], 0
6898; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
6899; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
6900; GFX908-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
6901; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
6902; GFX908-NEXT:    v_mov_b32_e32 v4, s4
6903; GFX908-NEXT:  .LBB19_1: ; %atomicrmw.start
6904; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
6905; GFX908-NEXT:    s_waitcnt vmcnt(0)
6906; GFX908-NEXT:    v_mov_b32_e32 v6, v0
6907; GFX908-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
6908; GFX908-NEXT:    v_and_b32_e32 v1, 0xffff0000, v6
6909; GFX908-NEXT:    v_min_f32_e32 v0, v0, v2
6910; GFX908-NEXT:    v_min_f32_e32 v1, v1, v3
6911; GFX908-NEXT:    v_bfe_u32 v5, v0, 16, 1
6912; GFX908-NEXT:    v_bfe_u32 v8, v1, 16, 1
6913; GFX908-NEXT:    v_or_b32_e32 v7, 0x400000, v0
6914; GFX908-NEXT:    v_or_b32_e32 v9, 0x400000, v1
6915; GFX908-NEXT:    v_add3_u32 v5, v5, v0, s8
6916; GFX908-NEXT:    v_add3_u32 v8, v8, v1, s8
6917; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
6918; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
6919; GFX908-NEXT:    v_cndmask_b32_e64 v0, v5, v7, s[4:5]
6920; GFX908-NEXT:    v_cndmask_b32_e32 v1, v8, v9, vcc
6921; GFX908-NEXT:    v_perm_b32 v5, v1, v0, s9
6922; GFX908-NEXT:    v_mov_b32_e32 v0, v5
6923; GFX908-NEXT:    v_mov_b32_e32 v1, v6
6924; GFX908-NEXT:    buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
6925; GFX908-NEXT:    s_waitcnt vmcnt(0)
6926; GFX908-NEXT:    buffer_wbinvl1
6927; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
6928; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
6929; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
6930; GFX908-NEXT:    s_cbranch_execnz .LBB19_1
6931; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
6932; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
6933; GFX908-NEXT:    s_setpc_b64 s[30:31]
6934;
6935; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
6936; GFX8:       ; %bb.0:
6937; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6938; GFX8-NEXT:    v_mov_b32_e32 v1, v0
6939; GFX8-NEXT:    v_mov_b32_e32 v0, s20
6940; GFX8-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
6941; GFX8-NEXT:    s_add_i32 s4, s20, 0x400
6942; GFX8-NEXT:    s_mov_b64 s[6:7], 0
6943; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
6944; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
6945; GFX8-NEXT:    v_mov_b32_e32 v4, s4
6946; GFX8-NEXT:  .LBB19_1: ; %atomicrmw.start
6947; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
6948; GFX8-NEXT:    s_waitcnt vmcnt(0)
6949; GFX8-NEXT:    v_mov_b32_e32 v6, v0
6950; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
6951; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v6
6952; GFX8-NEXT:    v_min_f32_e32 v0, v0, v2
6953; GFX8-NEXT:    v_min_f32_e32 v1, v1, v3
6954; GFX8-NEXT:    v_bfe_u32 v5, v0, 16, 1
6955; GFX8-NEXT:    v_bfe_u32 v8, v1, 16, 1
6956; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v0
6957; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v1
6958; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
6959; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x7fff, v8
6960; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v1
6961; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
6962; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v0
6963; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
6964; GFX8-NEXT:    v_cndmask_b32_e32 v1, v8, v9, vcc
6965; GFX8-NEXT:    v_cndmask_b32_e64 v0, v5, v7, s[4:5]
6966; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
6967; GFX8-NEXT:    v_alignbit_b32 v5, v1, v0, 16
6968; GFX8-NEXT:    v_mov_b32_e32 v0, v5
6969; GFX8-NEXT:    v_mov_b32_e32 v1, v6
6970; GFX8-NEXT:    buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
6971; GFX8-NEXT:    s_waitcnt vmcnt(0)
6972; GFX8-NEXT:    buffer_wbinvl1
6973; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
6974; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
6975; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
6976; GFX8-NEXT:    s_cbranch_execnz .LBB19_1
6977; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
6978; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
6979; GFX8-NEXT:    s_setpc_b64 s[30:31]
6980;
6981; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
6982; GFX7:       ; %bb.0:
6983; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6984; GFX7-NEXT:    v_mov_b32_e32 v2, s20
6985; GFX7-NEXT:    buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024
6986; GFX7-NEXT:    s_add_i32 s6, s20, 0x400
6987; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
6988; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
6989; GFX7-NEXT:    s_mov_b64 s[4:5], 0
6990; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
6991; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
6992; GFX7-NEXT:    s_waitcnt vmcnt(0)
6993; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
6994; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
6995; GFX7-NEXT:    v_mov_b32_e32 v4, s6
6996; GFX7-NEXT:  .LBB19_1: ; %atomicrmw.start
6997; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
6998; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
6999; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
7000; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
7001; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
7002; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
7003; GFX7-NEXT:    v_min_f32_e32 v5, v5, v2
7004; GFX7-NEXT:    v_min_f32_e32 v6, v6, v3
7005; GFX7-NEXT:    v_alignbit_b32 v1, v1, v0, 16
7006; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
7007; GFX7-NEXT:    v_alignbit_b32 v0, v0, v6, 16
7008; GFX7-NEXT:    v_mov_b32_e32 v6, v1
7009; GFX7-NEXT:    v_mov_b32_e32 v5, v0
7010; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
7011; GFX7-NEXT:    s_waitcnt vmcnt(0)
7012; GFX7-NEXT:    buffer_wbinvl1
7013; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v1
7014; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v5
7015; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7016; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
7017; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7018; GFX7-NEXT:    s_cbranch_execnz .LBB19_1
7019; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
7020; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
7021; GFX7-NEXT:    s_setpc_b64 s[30:31]
7022;
7023; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
7024; GFX6:       ; %bb.0:
7025; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7026; GFX6-NEXT:    v_mov_b32_e32 v2, s20
7027; GFX6-NEXT:    buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024
7028; GFX6-NEXT:    s_add_i32 s6, s20, 0x400
7029; GFX6-NEXT:    v_mul_f32_e32 v1, 1.0, v1
7030; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v0
7031; GFX6-NEXT:    s_mov_b64 s[4:5], 0
7032; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
7033; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
7034; GFX6-NEXT:    s_waitcnt vmcnt(0)
7035; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
7036; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
7037; GFX6-NEXT:    v_mov_b32_e32 v4, s6
7038; GFX6-NEXT:  .LBB19_1: ; %atomicrmw.start
7039; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
7040; GFX6-NEXT:    v_mul_f32_e32 v1, 1.0, v1
7041; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v0
7042; GFX6-NEXT:    s_waitcnt expcnt(0)
7043; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
7044; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
7045; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
7046; GFX6-NEXT:    v_min_f32_e32 v5, v5, v2
7047; GFX6-NEXT:    v_min_f32_e32 v6, v6, v3
7048; GFX6-NEXT:    v_alignbit_b32 v1, v1, v0, 16
7049; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
7050; GFX6-NEXT:    v_alignbit_b32 v0, v0, v6, 16
7051; GFX6-NEXT:    v_mov_b32_e32 v6, v1
7052; GFX6-NEXT:    v_mov_b32_e32 v5, v0
7053; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
7054; GFX6-NEXT:    s_waitcnt vmcnt(0)
7055; GFX6-NEXT:    buffer_wbinvl1
7056; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v1
7057; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff0000, v5
7058; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7059; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
7060; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7061; GFX6-NEXT:    s_cbranch_execnz .LBB19_1
7062; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
7063; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
7064; GFX6-NEXT:    s_waitcnt expcnt(0)
7065; GFX6-NEXT:    s_setpc_b64 s[30:31]
  ; Element index 256 of a <2 x bfloat> (two 16-bit lanes, 4 bytes) is a
  ; 1024-byte displacement; it appears in the checks above as "offset:1024" on
  ; the initial load and as the 0x400 added to form the cmpswap address.
7066  %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256
7067  %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
7068  ret <2 x bfloat> %result
7069}
7070
; No-return variant: a <2 x bfloat> atomicrmw fmin at syncscope("agent"),
; seq_cst, through a buffer fat pointer (addrspace 7) passed inreg, at a
; constant element offset and tagged with !amdgpu.no.fine.grained.memory.
; The old value returned by the atomic is never used and the function
; returns void. In the autogenerated checks below, every tested target
; expands the operation into a buffer_atomic_cmpswap retry loop
; (the %atomicrmw.start block).
7071define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 {
7072; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
7073; GFX12:       ; %bb.0:
7074; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
7075; GFX12-NEXT:    s_wait_expcnt 0x0
7076; GFX12-NEXT:    s_wait_samplecnt 0x0
7077; GFX12-NEXT:    s_wait_bvhcnt 0x0
7078; GFX12-NEXT:    s_wait_kmcnt 0x0
7079; GFX12-NEXT:    v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
7080; GFX12-NEXT:    s_add_co_i32 s4, s16, 0x400
7081; GFX12-NEXT:    s_wait_alu 0xfffe
7082; GFX12-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
7083; GFX12-NEXT:    buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
7084; GFX12-NEXT:    s_mov_b32 s5, 0
7085; GFX12-NEXT:  .LBB20_1: ; %atomicrmw.start
7086; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
7087; GFX12-NEXT:    s_wait_loadcnt 0x0
7088; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
7089; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
7090; GFX12-NEXT:    s_wait_storecnt 0x0
7091; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7092; GFX12-NEXT:    v_dual_min_num_f32 v5, v5, v3 :: v_dual_min_num_f32 v0, v0, v2
7093; GFX12-NEXT:    v_bfe_u32 v7, v5, 16, 1
7094; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
7095; GFX12-NEXT:    v_bfe_u32 v6, v0, 16, 1
7096; GFX12-NEXT:    v_or_b32_e32 v8, 0x400000, v0
7097; GFX12-NEXT:    v_or_b32_e32 v9, 0x400000, v5
7098; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
7099; GFX12-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
7100; GFX12-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
7101; GFX12-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
7102; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
7103; GFX12-NEXT:    v_cndmask_b32_e32 v5, v7, v9, vcc_lo
7104; GFX12-NEXT:    v_cndmask_b32_e64 v0, v6, v8, s4
7105; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7106; GFX12-NEXT:    v_perm_b32 v0, v5, v0, 0x7060302
7107; GFX12-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
7108; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
7109; GFX12-NEXT:    s_wait_loadcnt 0x0
7110; GFX12-NEXT:    global_inv scope:SCOPE_DEV
7111; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v1
7112; GFX12-NEXT:    v_mov_b32_e32 v1, v5
7113; GFX12-NEXT:    s_wait_alu 0xfffe
7114; GFX12-NEXT:    s_or_b32 s5, vcc_lo, s5
7115; GFX12-NEXT:    s_wait_alu 0xfffe
7116; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
7117; GFX12-NEXT:    s_cbranch_execnz .LBB20_1
7118; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
7119; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s5
7120; GFX12-NEXT:    s_wait_alu 0xfffe
7121; GFX12-NEXT:    s_setpc_b64 s[30:31]
7122;
7123; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
7124; GFX940:       ; %bb.0:
7125; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7126; GFX940-NEXT:    v_mov_b32_e32 v1, s16
7127; GFX940-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
7128; GFX940-NEXT:    s_add_i32 s4, s16, 0x400
7129; GFX940-NEXT:    s_mov_b64 s[6:7], 0
7130; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
7131; GFX940-NEXT:    s_movk_i32 s8, 0x7fff
7132; GFX940-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
7133; GFX940-NEXT:    s_mov_b32 s9, 0x7060302
7134; GFX940-NEXT:    v_mov_b32_e32 v4, s4
7135; GFX940-NEXT:  .LBB20_1: ; %atomicrmw.start
7136; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
7137; GFX940-NEXT:    s_waitcnt vmcnt(0)
7138; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
7139; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
7140; GFX940-NEXT:    v_min_f32_e32 v0, v0, v2
7141; GFX940-NEXT:    v_min_f32_e32 v5, v5, v3
7142; GFX940-NEXT:    v_bfe_u32 v6, v0, 16, 1
7143; GFX940-NEXT:    v_bfe_u32 v8, v5, 16, 1
7144; GFX940-NEXT:    v_or_b32_e32 v7, 0x400000, v0
7145; GFX940-NEXT:    v_or_b32_e32 v9, 0x400000, v5
7146; GFX940-NEXT:    v_add3_u32 v6, v6, v0, s8
7147; GFX940-NEXT:    v_add3_u32 v8, v8, v5, s8
7148; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
7149; GFX940-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
7150; GFX940-NEXT:    buffer_wbl2 sc1
7151; GFX940-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
7152; GFX940-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
7153; GFX940-NEXT:    v_perm_b32 v0, v5, v0, s9
7154; GFX940-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
7155; GFX940-NEXT:    buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0
7156; GFX940-NEXT:    s_waitcnt vmcnt(0)
7157; GFX940-NEXT:    buffer_inv sc1
7158; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v1
7159; GFX940-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
7160; GFX940-NEXT:    v_mov_b32_e32 v1, v6
7161; GFX940-NEXT:    s_andn2_b64 exec, exec, s[6:7]
7162; GFX940-NEXT:    s_cbranch_execnz .LBB20_1
7163; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
7164; GFX940-NEXT:    s_or_b64 exec, exec, s[6:7]
7165; GFX940-NEXT:    s_setpc_b64 s[30:31]
7166;
7167; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
7168; GFX11:       ; %bb.0:
7169; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7170; GFX11-NEXT:    v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
7171; GFX11-NEXT:    s_add_i32 s4, s16, 0x400
7172; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
7173; GFX11-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
7174; GFX11-NEXT:    buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
7175; GFX11-NEXT:    s_mov_b32 s5, 0
7176; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
7177; GFX11-NEXT:    .p2align 6
7178; GFX11-NEXT:  .LBB20_1: ; %atomicrmw.start
7179; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
7180; GFX11-NEXT:    s_waitcnt vmcnt(0)
7181; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
7182; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
7183; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
7184; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7185; GFX11-NEXT:    v_dual_min_f32 v5, v5, v3 :: v_dual_min_f32 v0, v0, v2
7186; GFX11-NEXT:    v_bfe_u32 v7, v5, 16, 1
7187; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
7188; GFX11-NEXT:    v_bfe_u32 v6, v0, 16, 1
7189; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v0
7190; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v5
7191; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
7192; GFX11-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
7193; GFX11-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
7194; GFX11-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
7195; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
7196; GFX11-NEXT:    v_cndmask_b32_e32 v5, v7, v9, vcc_lo
7197; GFX11-NEXT:    v_cndmask_b32_e64 v0, v6, v8, s4
7198; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7199; GFX11-NEXT:    v_perm_b32 v0, v5, v0, 0x7060302
7200; GFX11-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
7201; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
7202; GFX11-NEXT:    s_waitcnt vmcnt(0)
7203; GFX11-NEXT:    buffer_gl1_inv
7204; GFX11-NEXT:    buffer_gl0_inv
7205; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v1
7206; GFX11-NEXT:    v_mov_b32_e32 v1, v5
7207; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
7208; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
7209; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
7210; GFX11-NEXT:    s_cbranch_execnz .LBB20_1
7211; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
7212; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
7213; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s5
7214; GFX11-NEXT:    s_setpc_b64 s[30:31]
7215;
7216; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
7217; GFX10:       ; %bb.0:
7218; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7219; GFX10-NEXT:    v_mov_b32_e32 v1, s20
7220; GFX10-NEXT:    s_add_i32 s4, s20, 0x400
7221; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
7222; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
7223; GFX10-NEXT:    v_mov_b32_e32 v4, s4
7224; GFX10-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
7225; GFX10-NEXT:    s_mov_b32 s5, 0
7226; GFX10-NEXT:  .LBB20_1: ; %atomicrmw.start
7227; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
7228; GFX10-NEXT:    s_waitcnt vmcnt(0)
7229; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
7230; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
7231; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
7232; GFX10-NEXT:    v_min_f32_e32 v0, v0, v2
7233; GFX10-NEXT:    v_min_f32_e32 v5, v5, v3
7234; GFX10-NEXT:    v_bfe_u32 v6, v0, 16, 1
7235; GFX10-NEXT:    v_bfe_u32 v7, v5, 16, 1
7236; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
7237; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v5
7238; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
7239; GFX10-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
7240; GFX10-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
7241; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
7242; GFX10-NEXT:    v_cndmask_b32_e32 v5, v7, v9, vcc_lo
7243; GFX10-NEXT:    v_cndmask_b32_e64 v0, v6, v8, s4
7244; GFX10-NEXT:    v_perm_b32 v0, v5, v0, 0x7060302
7245; GFX10-NEXT:    v_mov_b32_e32 v6, v1
7246; GFX10-NEXT:    v_mov_b32_e32 v5, v0
7247; GFX10-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
7248; GFX10-NEXT:    s_waitcnt vmcnt(0)
7249; GFX10-NEXT:    buffer_gl1_inv
7250; GFX10-NEXT:    buffer_gl0_inv
7251; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v1
7252; GFX10-NEXT:    v_mov_b32_e32 v1, v5
7253; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
7254; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
7255; GFX10-NEXT:    s_cbranch_execnz .LBB20_1
7256; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
7257; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
7258; GFX10-NEXT:    s_setpc_b64 s[30:31]
7259;
7260; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
7261; GFX90A:       ; %bb.0:
7262; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7263; GFX90A-NEXT:    v_mov_b32_e32 v1, s20
7264; GFX90A-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
7265; GFX90A-NEXT:    s_add_i32 s4, s20, 0x400
7266; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
7267; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
7268; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
7269; GFX90A-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
7270; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
7271; GFX90A-NEXT:    v_mov_b32_e32 v4, s4
7272; GFX90A-NEXT:  .LBB20_1: ; %atomicrmw.start
7273; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
7274; GFX90A-NEXT:    s_waitcnt vmcnt(0)
7275; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
7276; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
7277; GFX90A-NEXT:    v_min_f32_e32 v0, v0, v2
7278; GFX90A-NEXT:    v_min_f32_e32 v5, v5, v3
7279; GFX90A-NEXT:    v_bfe_u32 v6, v0, 16, 1
7280; GFX90A-NEXT:    v_bfe_u32 v8, v5, 16, 1
7281; GFX90A-NEXT:    v_or_b32_e32 v7, 0x400000, v0
7282; GFX90A-NEXT:    v_or_b32_e32 v9, 0x400000, v5
7283; GFX90A-NEXT:    v_add3_u32 v6, v6, v0, s8
7284; GFX90A-NEXT:    v_add3_u32 v8, v8, v5, s8
7285; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
7286; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
7287; GFX90A-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
7288; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
7289; GFX90A-NEXT:    v_perm_b32 v0, v5, v0, s9
7290; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
7291; GFX90A-NEXT:    buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc
7292; GFX90A-NEXT:    s_waitcnt vmcnt(0)
7293; GFX90A-NEXT:    buffer_wbinvl1
7294; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v1
7295; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
7296; GFX90A-NEXT:    v_mov_b32_e32 v1, v6
7297; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
7298; GFX90A-NEXT:    s_cbranch_execnz .LBB20_1
7299; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
7300; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
7301; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7302;
7303; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
7304; GFX908:       ; %bb.0:
7305; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7306; GFX908-NEXT:    v_mov_b32_e32 v1, s20
7307; GFX908-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
7308; GFX908-NEXT:    s_add_i32 s4, s20, 0x400
7309; GFX908-NEXT:    s_mov_b64 s[6:7], 0
7310; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
7311; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
7312; GFX908-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
7313; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
7314; GFX908-NEXT:    v_mov_b32_e32 v4, s4
7315; GFX908-NEXT:  .LBB20_1: ; %atomicrmw.start
7316; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
7317; GFX908-NEXT:    s_waitcnt vmcnt(0)
7318; GFX908-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
7319; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
7320; GFX908-NEXT:    v_min_f32_e32 v0, v0, v2
7321; GFX908-NEXT:    v_min_f32_e32 v5, v5, v3
7322; GFX908-NEXT:    v_bfe_u32 v6, v0, 16, 1
7323; GFX908-NEXT:    v_bfe_u32 v8, v5, 16, 1
7324; GFX908-NEXT:    v_or_b32_e32 v7, 0x400000, v0
7325; GFX908-NEXT:    v_or_b32_e32 v9, 0x400000, v5
7326; GFX908-NEXT:    v_add3_u32 v6, v6, v0, s8
7327; GFX908-NEXT:    v_add3_u32 v8, v8, v5, s8
7328; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
7329; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
7330; GFX908-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
7331; GFX908-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
7332; GFX908-NEXT:    v_perm_b32 v0, v5, v0, s9
7333; GFX908-NEXT:    v_mov_b32_e32 v6, v1
7334; GFX908-NEXT:    v_mov_b32_e32 v5, v0
7335; GFX908-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
7336; GFX908-NEXT:    s_waitcnt vmcnt(0)
7337; GFX908-NEXT:    buffer_wbinvl1
7338; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v1
7339; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
7340; GFX908-NEXT:    v_mov_b32_e32 v1, v5
7341; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
7342; GFX908-NEXT:    s_cbranch_execnz .LBB20_1
7343; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
7344; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
7345; GFX908-NEXT:    s_setpc_b64 s[30:31]
7346;
7347; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
7348; GFX8:       ; %bb.0:
7349; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7350; GFX8-NEXT:    v_mov_b32_e32 v1, s20
7351; GFX8-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
7352; GFX8-NEXT:    s_add_i32 s4, s20, 0x400
7353; GFX8-NEXT:    s_mov_b64 s[6:7], 0
7354; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
7355; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
7356; GFX8-NEXT:    v_mov_b32_e32 v4, s4
7357; GFX8-NEXT:  .LBB20_1: ; %atomicrmw.start
7358; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
7359; GFX8-NEXT:    s_waitcnt vmcnt(0)
7360; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
7361; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
7362; GFX8-NEXT:    v_min_f32_e32 v0, v0, v2
7363; GFX8-NEXT:    v_min_f32_e32 v5, v5, v3
7364; GFX8-NEXT:    v_bfe_u32 v6, v0, 16, 1
7365; GFX8-NEXT:    v_bfe_u32 v8, v5, 16, 1
7366; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v0
7367; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v5
7368; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x7fff, v6
7369; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x7fff, v8
7370; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v5
7371; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
7372; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v0
7373; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
7374; GFX8-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
7375; GFX8-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
7376; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
7377; GFX8-NEXT:    v_alignbit_b32 v0, v5, v0, 16
7378; GFX8-NEXT:    v_mov_b32_e32 v6, v1
7379; GFX8-NEXT:    v_mov_b32_e32 v5, v0
7380; GFX8-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
7381; GFX8-NEXT:    s_waitcnt vmcnt(0)
7382; GFX8-NEXT:    buffer_wbinvl1
7383; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v1
7384; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
7385; GFX8-NEXT:    v_mov_b32_e32 v1, v5
7386; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
7387; GFX8-NEXT:    s_cbranch_execnz .LBB20_1
7388; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
7389; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
7390; GFX8-NEXT:    s_setpc_b64 s[30:31]
7391;
7392; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
7393; GFX7:       ; %bb.0:
7394; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7395; GFX7-NEXT:    v_mov_b32_e32 v2, s20
7396; GFX7-NEXT:    buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
7397; GFX7-NEXT:    s_add_i32 s6, s20, 0x400
7398; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
7399; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v0
7400; GFX7-NEXT:    s_mov_b64 s[4:5], 0
7401; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v1
7402; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v3
7403; GFX7-NEXT:    s_waitcnt vmcnt(0)
7404; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
7405; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
7406; GFX7-NEXT:    v_mov_b32_e32 v2, s6
7407; GFX7-NEXT:  .LBB20_1: ; %atomicrmw.start
7408; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
7409; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
7410; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
7411; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
7412; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
7413; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
7414; GFX7-NEXT:    v_min_f32_e32 v5, v5, v0
7415; GFX7-NEXT:    v_min_f32_e32 v6, v6, v1
7416; GFX7-NEXT:    v_alignbit_b32 v4, v4, v3, 16
7417; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
7418; GFX7-NEXT:    v_alignbit_b32 v3, v3, v6, 16
7419; GFX7-NEXT:    v_mov_b32_e32 v6, v4
7420; GFX7-NEXT:    v_mov_b32_e32 v5, v3
7421; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
7422; GFX7-NEXT:    s_waitcnt vmcnt(0)
7423; GFX7-NEXT:    buffer_wbinvl1
7424; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
7425; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
7426; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7427; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
7428; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7429; GFX7-NEXT:    s_cbranch_execnz .LBB20_1
7430; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
7431; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
7432; GFX7-NEXT:    s_setpc_b64 s[30:31]
7433;
7434; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
7435; GFX6:       ; %bb.0:
7436; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7437; GFX6-NEXT:    v_mov_b32_e32 v2, s20
7438; GFX6-NEXT:    buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
7439; GFX6-NEXT:    s_add_i32 s6, s20, 0x400
7440; GFX6-NEXT:    v_mul_f32_e32 v1, 1.0, v1
7441; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v0
7442; GFX6-NEXT:    s_mov_b64 s[4:5], 0
7443; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff0000, v1
7444; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff0000, v3
7445; GFX6-NEXT:    s_waitcnt vmcnt(0)
7446; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
7447; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
7448; GFX6-NEXT:    v_mov_b32_e32 v2, s6
7449; GFX6-NEXT:  .LBB20_1: ; %atomicrmw.start
7450; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
7451; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v4
7452; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
7453; GFX6-NEXT:    s_waitcnt expcnt(0)
7454; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
7455; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
7456; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
7457; GFX6-NEXT:    v_min_f32_e32 v5, v5, v0
7458; GFX6-NEXT:    v_min_f32_e32 v6, v6, v1
7459; GFX6-NEXT:    v_alignbit_b32 v4, v4, v3, 16
7460; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
7461; GFX6-NEXT:    v_alignbit_b32 v3, v3, v6, 16
7462; GFX6-NEXT:    v_mov_b32_e32 v6, v4
7463; GFX6-NEXT:    v_mov_b32_e32 v5, v3
7464; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
7465; GFX6-NEXT:    s_waitcnt vmcnt(0)
7466; GFX6-NEXT:    buffer_wbinvl1
7467; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
7468; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
7469; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7470; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
7471; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7472; GFX6-NEXT:    s_cbranch_execnz .LBB20_1
7473; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
7474; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
7475; GFX6-NEXT:    s_waitcnt expcnt(0)
7476; GFX6-NEXT:    s_setpc_b64 s[30:31]
  ; Address of element 256 of <2 x bfloat>; the checks above show this as a
  ; 1024-byte (0x400) offset on the buffer access.
7477  %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256
  ; The old value produced by the atomic is bound to %unused and never read.
7478  %unused = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
7479  ret void
7480}
7481
7482define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, <2 x bfloat> %val) #0 {
7483; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
7484; GFX12:       ; %bb.0:
7485; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
7486; GFX12-NEXT:    s_wait_expcnt 0x0
7487; GFX12-NEXT:    s_wait_samplecnt 0x0
7488; GFX12-NEXT:    s_wait_bvhcnt 0x0
7489; GFX12-NEXT:    s_wait_kmcnt 0x0
7490; GFX12-NEXT:    v_add_nc_u32_e32 v7, 0x400, v4
7491; GFX12-NEXT:    s_mov_b32 s1, exec_lo
7492; GFX12-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
7493; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
7494; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
7495; GFX12-NEXT:    v_readfirstlane_b32 s6, v2
7496; GFX12-NEXT:    v_readfirstlane_b32 s7, v3
7497; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
7498; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
7499; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
7500; GFX12-NEXT:    s_wait_alu 0xfffe
7501; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
7502; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
7503; GFX12-NEXT:    s_wait_alu 0xfffe
7504; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
7505; GFX12-NEXT:    s_wait_loadcnt 0x0
7506; GFX12-NEXT:    buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
7507; GFX12-NEXT:    ; implicit-def: $vgpr4
7508; GFX12-NEXT:    s_wait_alu 0xfffe
7509; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
7510; GFX12-NEXT:    s_cbranch_execnz .LBB21_1
7511; GFX12-NEXT:  ; %bb.2:
7512; GFX12-NEXT:    s_mov_b32 exec_lo, s1
7513; GFX12-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
7514; GFX12-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
7515; GFX12-NEXT:    s_mov_b32 s1, 0
7516; GFX12-NEXT:  .LBB21_3: ; %atomicrmw.start
7517; GFX12-NEXT:    ; =>This Loop Header: Depth=1
7518; GFX12-NEXT:    ; Child Loop BB21_4 Depth 2
7519; GFX12-NEXT:    s_wait_loadcnt 0x0
7520; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
7521; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
7522; GFX12-NEXT:    s_mov_b32 s2, exec_lo
7523; GFX12-NEXT:    s_wait_storecnt 0x0
7524; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7525; GFX12-NEXT:    v_dual_min_num_f32 v5, v5, v9 :: v_dual_min_num_f32 v4, v4, v8
7526; GFX12-NEXT:    v_bfe_u32 v11, v5, 16, 1
7527; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
7528; GFX12-NEXT:    v_bfe_u32 v10, v4, 16, 1
7529; GFX12-NEXT:    v_or_b32_e32 v12, 0x400000, v4
7530; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
7531; GFX12-NEXT:    v_or_b32_e32 v13, 0x400000, v5
7532; GFX12-NEXT:    v_add3_u32 v11, v11, v5, 0x7fff
7533; GFX12-NEXT:    v_add3_u32 v10, v10, v4, 0x7fff
7534; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
7535; GFX12-NEXT:    v_cndmask_b32_e32 v4, v10, v12, vcc_lo
7536; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
7537; GFX12-NEXT:    v_cndmask_b32_e32 v5, v11, v13, vcc_lo
7538; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7539; GFX12-NEXT:    v_perm_b32 v5, v5, v4, 0x7060302
7540; GFX12-NEXT:    v_mov_b32_e32 v4, v5
7541; GFX12-NEXT:    v_mov_b32_e32 v5, v6
7542; GFX12-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
7543; GFX12-NEXT:    ; => This Inner Loop Header: Depth=2
7544; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
7545; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
7546; GFX12-NEXT:    v_readfirstlane_b32 s6, v2
7547; GFX12-NEXT:    v_readfirstlane_b32 s7, v3
7548; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
7549; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
7550; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
7551; GFX12-NEXT:    s_wait_alu 0xfffe
7552; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
7553; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
7554; GFX12-NEXT:    s_wait_alu 0xfffe
7555; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
7556; GFX12-NEXT:    s_wait_loadcnt 0x0
7557; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
7558; GFX12-NEXT:    s_wait_alu 0xfffe
7559; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
7560; GFX12-NEXT:    s_cbranch_execnz .LBB21_4
7561; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
7562; GFX12-NEXT:    s_mov_b32 exec_lo, s2
7563; GFX12-NEXT:    s_wait_loadcnt 0x0
7564; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
7565; GFX12-NEXT:    v_mov_b32_e32 v6, v4
7566; GFX12-NEXT:    global_inv scope:SCOPE_DEV
7567; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
7568; GFX12-NEXT:    s_wait_alu 0xfffe
7569; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
7570; GFX12-NEXT:    s_cbranch_execnz .LBB21_3
7571; GFX12-NEXT:  ; %bb.6: ; %atomicrmw.end
7572; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
7573; GFX12-NEXT:    v_mov_b32_e32 v0, v4
7574; GFX12-NEXT:    s_wait_alu 0xfffe
7575; GFX12-NEXT:    s_setpc_b64 s[30:31]
7576;
7577; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
7578; GFX940:       ; %bb.0:
7579; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7580; GFX940-NEXT:    v_add_u32_e32 v8, 0x400, v4
7581; GFX940-NEXT:    s_mov_b64 s[2:3], exec
7582; GFX940-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
7583; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
7584; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
7585; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
7586; GFX940-NEXT:    v_readfirstlane_b32 s7, v3
7587; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
7588; GFX940-NEXT:    s_nop 0
7589; GFX940-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
7590; GFX940-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
7591; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
7592; GFX940-NEXT:    buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024
7593; GFX940-NEXT:    ; implicit-def: $vgpr4
7594; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
7595; GFX940-NEXT:    s_cbranch_execnz .LBB21_1
7596; GFX940-NEXT:  ; %bb.2:
7597; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
7598; GFX940-NEXT:    s_mov_b64 s[2:3], 0
7599; GFX940-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
7600; GFX940-NEXT:    s_movk_i32 s10, 0x7fff
7601; GFX940-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
7602; GFX940-NEXT:    s_mov_b32 s11, 0x7060302
7603; GFX940-NEXT:  .LBB21_3: ; %atomicrmw.start
7604; GFX940-NEXT:    ; =>This Loop Header: Depth=1
7605; GFX940-NEXT:    ; Child Loop BB21_4 Depth 2
7606; GFX940-NEXT:    s_waitcnt vmcnt(0)
7607; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
7608; GFX940-NEXT:    v_min_f32_e32 v4, v4, v9
7609; GFX940-NEXT:    v_bfe_u32 v5, v4, 16, 1
7610; GFX940-NEXT:    v_add3_u32 v5, v5, v4, s10
7611; GFX940-NEXT:    v_or_b32_e32 v6, 0x400000, v4
7612; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
7613; GFX940-NEXT:    s_mov_b64 s[8:9], exec
7614; GFX940-NEXT:    buffer_wbl2 sc1
7615; GFX940-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
7616; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
7617; GFX940-NEXT:    v_min_f32_e32 v5, v5, v10
7618; GFX940-NEXT:    v_bfe_u32 v6, v5, 16, 1
7619; GFX940-NEXT:    v_add3_u32 v6, v6, v5, s10
7620; GFX940-NEXT:    v_or_b32_e32 v11, 0x400000, v5
7621; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
7622; GFX940-NEXT:    s_nop 1
7623; GFX940-NEXT:    v_cndmask_b32_e32 v5, v6, v11, vcc
7624; GFX940-NEXT:    v_perm_b32 v6, v5, v4, s11
7625; GFX940-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
7626; GFX940-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
7627; GFX940-NEXT:    ; => This Inner Loop Header: Depth=2
7628; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
7629; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
7630; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
7631; GFX940-NEXT:    v_readfirstlane_b32 s7, v3
7632; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
7633; GFX940-NEXT:    s_nop 0
7634; GFX940-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
7635; GFX940-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
7636; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
7637; GFX940-NEXT:    s_waitcnt vmcnt(0)
7638; GFX940-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
7639; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
7640; GFX940-NEXT:    s_cbranch_execnz .LBB21_4
7641; GFX940-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
7642; GFX940-NEXT:    s_mov_b64 exec, s[8:9]
7643; GFX940-NEXT:    s_waitcnt vmcnt(0)
7644; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
7645; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
7646; GFX940-NEXT:    v_mov_b32_e32 v7, v4
7647; GFX940-NEXT:    buffer_inv sc1
7648; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
7649; GFX940-NEXT:    s_cbranch_execnz .LBB21_3
7650; GFX940-NEXT:  ; %bb.6: ; %atomicrmw.end
7651; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
7652; GFX940-NEXT:    v_mov_b32_e32 v0, v4
7653; GFX940-NEXT:    s_setpc_b64 s[30:31]
7654;
7655; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
7656; GFX11:       ; %bb.0:
7657; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7658; GFX11-NEXT:    v_add_nc_u32_e32 v7, 0x400, v4
7659; GFX11-NEXT:    s_mov_b32 s1, 0
7660; GFX11-NEXT:    s_mov_b32 s2, exec_lo
7661; GFX11-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
7662; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
7663; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
7664; GFX11-NEXT:    v_readfirstlane_b32 s6, v2
7665; GFX11-NEXT:    v_readfirstlane_b32 s7, v3
7666; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
7667; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
7668; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
7669; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
7670; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
7671; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
7672; GFX11-NEXT:    buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
7673; GFX11-NEXT:    ; implicit-def: $vgpr4
7674; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
7675; GFX11-NEXT:    s_cbranch_execnz .LBB21_1
7676; GFX11-NEXT:  ; %bb.2:
7677; GFX11-NEXT:    s_mov_b32 exec_lo, s2
7678; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
7679; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
7680; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
7681; GFX11-NEXT:    .p2align 6
7682; GFX11-NEXT:  .LBB21_3: ; %atomicrmw.start
7683; GFX11-NEXT:    ; =>This Loop Header: Depth=1
7684; GFX11-NEXT:    ; Child Loop BB21_4 Depth 2
7685; GFX11-NEXT:    s_waitcnt vmcnt(0)
7686; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
7687; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
7688; GFX11-NEXT:    s_mov_b32 s2, exec_lo
7689; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
7690; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7691; GFX11-NEXT:    v_dual_min_f32 v5, v5, v9 :: v_dual_min_f32 v4, v4, v8
7692; GFX11-NEXT:    v_bfe_u32 v11, v5, 16, 1
7693; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
7694; GFX11-NEXT:    v_bfe_u32 v10, v4, 16, 1
7695; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v4
7696; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
7697; GFX11-NEXT:    v_or_b32_e32 v13, 0x400000, v5
7698; GFX11-NEXT:    v_add3_u32 v11, v11, v5, 0x7fff
7699; GFX11-NEXT:    v_add3_u32 v10, v10, v4, 0x7fff
7700; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
7701; GFX11-NEXT:    v_cndmask_b32_e32 v4, v10, v12, vcc_lo
7702; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
7703; GFX11-NEXT:    v_cndmask_b32_e32 v5, v11, v13, vcc_lo
7704; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7705; GFX11-NEXT:    v_perm_b32 v5, v5, v4, 0x7060302
7706; GFX11-NEXT:    v_mov_b32_e32 v4, v5
7707; GFX11-NEXT:    v_mov_b32_e32 v5, v6
7708; GFX11-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
7709; GFX11-NEXT:    ; => This Inner Loop Header: Depth=2
7710; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
7711; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
7712; GFX11-NEXT:    v_readfirstlane_b32 s6, v2
7713; GFX11-NEXT:    v_readfirstlane_b32 s7, v3
7714; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
7715; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
7716; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
7717; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
7718; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
7719; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
7720; GFX11-NEXT:    s_waitcnt vmcnt(0)
7721; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
7722; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
7723; GFX11-NEXT:    s_cbranch_execnz .LBB21_4
7724; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
7725; GFX11-NEXT:    s_mov_b32 exec_lo, s2
7726; GFX11-NEXT:    s_waitcnt vmcnt(0)
7727; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
7728; GFX11-NEXT:    v_mov_b32_e32 v6, v4
7729; GFX11-NEXT:    buffer_gl1_inv
7730; GFX11-NEXT:    buffer_gl0_inv
7731; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
7732; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
7733; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
7734; GFX11-NEXT:    s_cbranch_execnz .LBB21_3
7735; GFX11-NEXT:  ; %bb.6: ; %atomicrmw.end
7736; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
7737; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
7738; GFX11-NEXT:    v_mov_b32_e32 v0, v4
7739; GFX11-NEXT:    s_setpc_b64 s[30:31]
7740;
7741; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
7742; GFX10:       ; %bb.0:
7743; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7744; GFX10-NEXT:    v_add_nc_u32_e32 v7, 0x400, v4
7745; GFX10-NEXT:    s_mov_b32 s5, 0
7746; GFX10-NEXT:    s_mov_b32 s6, exec_lo
7747; GFX10-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
7748; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
7749; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
7750; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
7751; GFX10-NEXT:    v_readfirstlane_b32 s11, v3
7752; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
7753; GFX10-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
7754; GFX10-NEXT:    s_and_b32 s4, vcc_lo, s4
7755; GFX10-NEXT:    s_and_saveexec_b32 s4, s4
7756; GFX10-NEXT:    buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
7757; GFX10-NEXT:    ; implicit-def: $vgpr4
7758; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
7759; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
7760; GFX10-NEXT:    s_cbranch_execnz .LBB21_1
7761; GFX10-NEXT:  ; %bb.2:
7762; GFX10-NEXT:    s_mov_b32 exec_lo, s6
7763; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
7764; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
7765; GFX10-NEXT:  .LBB21_3: ; %atomicrmw.start
7766; GFX10-NEXT:    ; =>This Loop Header: Depth=1
7767; GFX10-NEXT:    ; Child Loop BB21_4 Depth 2
7768; GFX10-NEXT:    s_waitcnt vmcnt(0)
7769; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
7770; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
7771; GFX10-NEXT:    s_mov_b32 s6, exec_lo
7772; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
7773; GFX10-NEXT:    v_min_f32_e32 v4, v4, v8
7774; GFX10-NEXT:    v_min_f32_e32 v5, v5, v9
7775; GFX10-NEXT:    v_bfe_u32 v10, v4, 16, 1
7776; GFX10-NEXT:    v_bfe_u32 v11, v5, 16, 1
7777; GFX10-NEXT:    v_or_b32_e32 v12, 0x400000, v4
7778; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
7779; GFX10-NEXT:    v_or_b32_e32 v13, 0x400000, v5
7780; GFX10-NEXT:    v_add3_u32 v10, v10, v4, 0x7fff
7781; GFX10-NEXT:    v_add3_u32 v11, v11, v5, 0x7fff
7782; GFX10-NEXT:    v_cndmask_b32_e32 v4, v10, v12, vcc_lo
7783; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
7784; GFX10-NEXT:    v_cndmask_b32_e32 v5, v11, v13, vcc_lo
7785; GFX10-NEXT:    v_perm_b32 v5, v5, v4, 0x7060302
7786; GFX10-NEXT:    v_mov_b32_e32 v4, v5
7787; GFX10-NEXT:    v_mov_b32_e32 v5, v6
7788; GFX10-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
7789; GFX10-NEXT:    ; => This Inner Loop Header: Depth=2
7790; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
7791; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
7792; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
7793; GFX10-NEXT:    v_readfirstlane_b32 s11, v3
7794; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
7795; GFX10-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
7796; GFX10-NEXT:    s_and_b32 s4, vcc_lo, s4
7797; GFX10-NEXT:    s_and_saveexec_b32 s4, s4
7798; GFX10-NEXT:    s_waitcnt vmcnt(0)
7799; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
7800; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
7801; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
7802; GFX10-NEXT:    s_cbranch_execnz .LBB21_4
7803; GFX10-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
7804; GFX10-NEXT:    s_mov_b32 exec_lo, s6
7805; GFX10-NEXT:    s_waitcnt vmcnt(0)
7806; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
7807; GFX10-NEXT:    v_mov_b32_e32 v6, v4
7808; GFX10-NEXT:    buffer_gl1_inv
7809; GFX10-NEXT:    buffer_gl0_inv
7810; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
7811; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
7812; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
7813; GFX10-NEXT:    s_cbranch_execnz .LBB21_3
7814; GFX10-NEXT:  ; %bb.6: ; %atomicrmw.end
7815; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
7816; GFX10-NEXT:    v_mov_b32_e32 v0, v4
7817; GFX10-NEXT:    s_setpc_b64 s[30:31]
7818;
7819; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
7820; GFX90A:       ; %bb.0:
7821; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7822; GFX90A-NEXT:    v_add_u32_e32 v8, 0x400, v4
7823; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
7824; GFX90A-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
7825; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
7826; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
7827; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
7828; GFX90A-NEXT:    v_readfirstlane_b32 s11, v3
7829; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
7830; GFX90A-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
7831; GFX90A-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
7832; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
7833; GFX90A-NEXT:    s_nop 0
7834; GFX90A-NEXT:    buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
7835; GFX90A-NEXT:    ; implicit-def: $vgpr4
7836; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
7837; GFX90A-NEXT:    s_cbranch_execnz .LBB21_1
7838; GFX90A-NEXT:  ; %bb.2:
7839; GFX90A-NEXT:    s_mov_b64 exec, s[6:7]
7840; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
7841; GFX90A-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
7842; GFX90A-NEXT:    s_movk_i32 s14, 0x7fff
7843; GFX90A-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
7844; GFX90A-NEXT:    s_mov_b32 s15, 0x7060302
7845; GFX90A-NEXT:  .LBB21_3: ; %atomicrmw.start
7846; GFX90A-NEXT:    ; =>This Loop Header: Depth=1
7847; GFX90A-NEXT:    ; Child Loop BB21_4 Depth 2
7848; GFX90A-NEXT:    s_waitcnt vmcnt(0)
7849; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
7850; GFX90A-NEXT:    v_min_f32_e32 v4, v4, v9
7851; GFX90A-NEXT:    v_bfe_u32 v5, v4, 16, 1
7852; GFX90A-NEXT:    v_add3_u32 v5, v5, v4, s14
7853; GFX90A-NEXT:    v_or_b32_e32 v6, 0x400000, v4
7854; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
7855; GFX90A-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
7856; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
7857; GFX90A-NEXT:    v_min_f32_e32 v5, v5, v10
7858; GFX90A-NEXT:    v_bfe_u32 v6, v5, 16, 1
7859; GFX90A-NEXT:    v_add3_u32 v6, v6, v5, s14
7860; GFX90A-NEXT:    v_or_b32_e32 v11, 0x400000, v5
7861; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
7862; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v6, v11, vcc
7863; GFX90A-NEXT:    v_perm_b32 v6, v5, v4, s15
7864; GFX90A-NEXT:    s_mov_b64 s[12:13], exec
7865; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
7866; GFX90A-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
7867; GFX90A-NEXT:    ; => This Inner Loop Header: Depth=2
7868; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
7869; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
7870; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
7871; GFX90A-NEXT:    v_readfirstlane_b32 s11, v3
7872; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
7873; GFX90A-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
7874; GFX90A-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
7875; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
7876; GFX90A-NEXT:    s_waitcnt vmcnt(0)
7877; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
7878; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
7879; GFX90A-NEXT:    s_cbranch_execnz .LBB21_4
7880; GFX90A-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
7881; GFX90A-NEXT:    s_mov_b64 exec, s[12:13]
7882; GFX90A-NEXT:    s_waitcnt vmcnt(0)
7883; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
7884; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
7885; GFX90A-NEXT:    v_mov_b32_e32 v7, v4
7886; GFX90A-NEXT:    buffer_wbinvl1
7887; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
7888; GFX90A-NEXT:    s_cbranch_execnz .LBB21_3
7889; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.end
7890; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
7891; GFX90A-NEXT:    v_mov_b32_e32 v0, v4
7892; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7893;
7894; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
7895; GFX908:       ; %bb.0:
7896; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7897; GFX908-NEXT:    v_add_u32_e32 v7, 0x400, v4
7898; GFX908-NEXT:    s_mov_b64 s[6:7], exec
7899; GFX908-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
7900; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
7901; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
7902; GFX908-NEXT:    v_readfirstlane_b32 s10, v2
7903; GFX908-NEXT:    v_readfirstlane_b32 s11, v3
7904; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
7905; GFX908-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
7906; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
7907; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
7908; GFX908-NEXT:    s_nop 0
7909; GFX908-NEXT:    buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
7910; GFX908-NEXT:    ; implicit-def: $vgpr4
7911; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
7912; GFX908-NEXT:    s_cbranch_execnz .LBB21_1
7913; GFX908-NEXT:  ; %bb.2:
7914; GFX908-NEXT:    s_mov_b64 exec, s[6:7]
7915; GFX908-NEXT:    s_mov_b64 s[6:7], 0
7916; GFX908-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
7917; GFX908-NEXT:    s_movk_i32 s14, 0x7fff
7918; GFX908-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
7919; GFX908-NEXT:    s_mov_b32 s15, 0x7060302
7920; GFX908-NEXT:  .LBB21_3: ; %atomicrmw.start
7921; GFX908-NEXT:    ; =>This Loop Header: Depth=1
7922; GFX908-NEXT:    ; Child Loop BB21_4 Depth 2
7923; GFX908-NEXT:    s_waitcnt vmcnt(0)
7924; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
7925; GFX908-NEXT:    v_min_f32_e32 v4, v4, v8
7926; GFX908-NEXT:    v_bfe_u32 v5, v4, 16, 1
7927; GFX908-NEXT:    v_add3_u32 v5, v5, v4, s14
7928; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v4
7929; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
7930; GFX908-NEXT:    v_cndmask_b32_e32 v4, v5, v10, vcc
7931; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
7932; GFX908-NEXT:    v_min_f32_e32 v5, v5, v9
7933; GFX908-NEXT:    v_bfe_u32 v10, v5, 16, 1
7934; GFX908-NEXT:    v_add3_u32 v10, v10, v5, s14
7935; GFX908-NEXT:    v_or_b32_e32 v11, 0x400000, v5
7936; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
7937; GFX908-NEXT:    v_cndmask_b32_e32 v5, v10, v11, vcc
7938; GFX908-NEXT:    v_perm_b32 v5, v5, v4, s15
7939; GFX908-NEXT:    v_mov_b32_e32 v4, v5
7940; GFX908-NEXT:    s_mov_b64 s[12:13], exec
7941; GFX908-NEXT:    v_mov_b32_e32 v5, v6
7942; GFX908-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
7943; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
7944; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
7945; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
7946; GFX908-NEXT:    v_readfirstlane_b32 s10, v2
7947; GFX908-NEXT:    v_readfirstlane_b32 s11, v3
7948; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
7949; GFX908-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
7950; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
7951; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
7952; GFX908-NEXT:    s_waitcnt vmcnt(0)
7953; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
7954; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
7955; GFX908-NEXT:    s_cbranch_execnz .LBB21_4
7956; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
7957; GFX908-NEXT:    s_mov_b64 exec, s[12:13]
7958; GFX908-NEXT:    s_waitcnt vmcnt(0)
7959; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
7960; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
7961; GFX908-NEXT:    v_mov_b32_e32 v6, v4
7962; GFX908-NEXT:    buffer_wbinvl1
7963; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
7964; GFX908-NEXT:    s_cbranch_execnz .LBB21_3
7965; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.end
7966; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
7967; GFX908-NEXT:    v_mov_b32_e32 v0, v4
7968; GFX908-NEXT:    s_setpc_b64 s[30:31]
7969;
7970; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
7971; GFX8:       ; %bb.0:
7972; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7973; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x400, v4
7974; GFX8-NEXT:    s_mov_b64 s[6:7], exec
7975; GFX8-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
7976; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
7977; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
7978; GFX8-NEXT:    v_readfirstlane_b32 s10, v2
7979; GFX8-NEXT:    v_readfirstlane_b32 s11, v3
7980; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
7981; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
7982; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
7983; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
7984; GFX8-NEXT:    s_nop 0
7985; GFX8-NEXT:    buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
7986; GFX8-NEXT:    ; implicit-def: $vgpr4
7987; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
7988; GFX8-NEXT:    s_cbranch_execnz .LBB21_1
7989; GFX8-NEXT:  ; %bb.2:
7990; GFX8-NEXT:    s_mov_b64 exec, s[6:7]
7991; GFX8-NEXT:    s_mov_b64 s[6:7], 0
7992; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
7993; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
7994; GFX8-NEXT:  .LBB21_3: ; %atomicrmw.start
7995; GFX8-NEXT:    ; =>This Loop Header: Depth=1
7996; GFX8-NEXT:    ; Child Loop BB21_4 Depth 2
7997; GFX8-NEXT:    s_waitcnt vmcnt(0)
7998; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
7999; GFX8-NEXT:    v_min_f32_e32 v4, v4, v8
8000; GFX8-NEXT:    v_bfe_u32 v5, v4, 16, 1
8001; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v4
8002; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
8003; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v4
8004; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
8005; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v10, vcc
8006; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
8007; GFX8-NEXT:    v_min_f32_e32 v5, v5, v9
8008; GFX8-NEXT:    v_bfe_u32 v10, v5, 16, 1
8009; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v10, v5
8010; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 0x7fff, v10
8011; GFX8-NEXT:    v_or_b32_e32 v11, 0x400000, v5
8012; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
8013; GFX8-NEXT:    v_cndmask_b32_e32 v5, v10, v11, vcc
8014; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
8015; GFX8-NEXT:    v_alignbit_b32 v5, v5, v4, 16
8016; GFX8-NEXT:    v_mov_b32_e32 v4, v5
8017; GFX8-NEXT:    s_mov_b64 s[12:13], exec
8018; GFX8-NEXT:    v_mov_b32_e32 v5, v6
8019; GFX8-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
8020; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
8021; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
8022; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
8023; GFX8-NEXT:    v_readfirstlane_b32 s10, v2
8024; GFX8-NEXT:    v_readfirstlane_b32 s11, v3
8025; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
8026; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
8027; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
8028; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
8029; GFX8-NEXT:    s_waitcnt vmcnt(0)
8030; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
8031; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
8032; GFX8-NEXT:    s_cbranch_execnz .LBB21_4
8033; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
8034; GFX8-NEXT:    s_mov_b64 exec, s[12:13]
8035; GFX8-NEXT:    s_waitcnt vmcnt(0)
8036; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
8037; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
8038; GFX8-NEXT:    v_mov_b32_e32 v6, v4
8039; GFX8-NEXT:    buffer_wbinvl1
8040; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
8041; GFX8-NEXT:    s_cbranch_execnz .LBB21_3
8042; GFX8-NEXT:  ; %bb.6: ; %atomicrmw.end
8043; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
8044; GFX8-NEXT:    v_mov_b32_e32 v0, v4
8045; GFX8-NEXT:    s_setpc_b64 s[30:31]
8046;
8047; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
8048; GFX7:       ; %bb.0:
8049; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8050; GFX7-NEXT:    v_add_i32_e32 v8, vcc, 0x400, v4
8051; GFX7-NEXT:    s_mov_b64 s[6:7], exec
8052; GFX7-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
8053; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
8054; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
8055; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
8056; GFX7-NEXT:    v_readfirstlane_b32 s11, v3
8057; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
8058; GFX7-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
8059; GFX7-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
8060; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
8061; GFX7-NEXT:    buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
8062; GFX7-NEXT:    ; implicit-def: $vgpr4
8063; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
8064; GFX7-NEXT:    s_cbranch_execnz .LBB21_1
8065; GFX7-NEXT:  ; %bb.2:
8066; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
8067; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
8068; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
8069; GFX7-NEXT:    s_waitcnt vmcnt(0)
8070; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v7
8071; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
8072; GFX7-NEXT:    s_mov_b64 s[6:7], 0
8073; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v6
8074; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
8075; GFX7-NEXT:  .LBB21_3: ; %atomicrmw.start
8076; GFX7-NEXT:    ; =>This Loop Header: Depth=1
8077; GFX7-NEXT:    ; Child Loop BB21_4 Depth 2
8078; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v4
8079; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
8080; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v7
8081; GFX7-NEXT:    v_min_f32_e32 v4, v4, v9
8082; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v6
8083; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
8084; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
8085; GFX7-NEXT:    v_min_f32_e32 v7, v7, v10
8086; GFX7-NEXT:    v_alignbit_b32 v5, v5, v6, 16
8087; GFX7-NEXT:    v_alignbit_b32 v4, v4, v7, 16
8088; GFX7-NEXT:    v_mov_b32_e32 v7, v5
8089; GFX7-NEXT:    s_mov_b64 s[12:13], exec
8090; GFX7-NEXT:    v_mov_b32_e32 v6, v4
8091; GFX7-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
8092; GFX7-NEXT:    ; => This Inner Loop Header: Depth=2
8093; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
8094; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
8095; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
8096; GFX7-NEXT:    v_readfirstlane_b32 s11, v3
8097; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
8098; GFX7-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
8099; GFX7-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
8100; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
8101; GFX7-NEXT:    s_waitcnt vmcnt(0)
8102; GFX7-NEXT:    buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc
8103; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
8104; GFX7-NEXT:    s_cbranch_execnz .LBB21_4
8105; GFX7-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
8106; GFX7-NEXT:    s_mov_b64 exec, s[12:13]
8107; GFX7-NEXT:    s_waitcnt vmcnt(0)
8108; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
8109; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v6
8110; GFX7-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
8111; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
8112; GFX7-NEXT:    buffer_wbinvl1
8113; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
8114; GFX7-NEXT:    s_cbranch_execnz .LBB21_3
8115; GFX7-NEXT:  ; %bb.6: ; %atomicrmw.end
8116; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
8117; GFX7-NEXT:    v_mov_b32_e32 v0, v7
8118; GFX7-NEXT:    v_mov_b32_e32 v1, v4
8119; GFX7-NEXT:    s_setpc_b64 s[30:31]
8120;
8121; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
8122; GFX6:       ; %bb.0:
8123; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8124; GFX6-NEXT:    v_add_i32_e32 v8, vcc, 0x400, v4
8125; GFX6-NEXT:    s_mov_b64 s[6:7], exec
8126; GFX6-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
8127; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
8128; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
8129; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
8130; GFX6-NEXT:    v_readfirstlane_b32 s11, v3
8131; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
8132; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
8133; GFX6-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
8134; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
8135; GFX6-NEXT:    buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
8136; GFX6-NEXT:    ; implicit-def: $vgpr4
8137; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
8138; GFX6-NEXT:    s_cbranch_execnz .LBB21_1
8139; GFX6-NEXT:  ; %bb.2:
8140; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
8141; GFX6-NEXT:    v_mul_f32_e32 v6, 1.0, v6
8142; GFX6-NEXT:    v_mul_f32_e32 v5, 1.0, v5
8143; GFX6-NEXT:    s_waitcnt vmcnt(0)
8144; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v7
8145; GFX6-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
8146; GFX6-NEXT:    s_mov_b64 s[6:7], 0
8147; GFX6-NEXT:    v_and_b32_e32 v9, 0xffff0000, v6
8148; GFX6-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
8149; GFX6-NEXT:  .LBB21_3: ; %atomicrmw.start
8150; GFX6-NEXT:    ; =>This Loop Header: Depth=1
8151; GFX6-NEXT:    ; Child Loop BB21_4 Depth 2
8152; GFX6-NEXT:    v_mul_f32_e32 v5, 1.0, v4
8153; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
8154; GFX6-NEXT:    v_mul_f32_e32 v6, 1.0, v7
8155; GFX6-NEXT:    v_min_f32_e32 v4, v4, v9
8156; GFX6-NEXT:    v_and_b32_e32 v7, 0xffff0000, v6
8157; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
8158; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
8159; GFX6-NEXT:    v_min_f32_e32 v7, v7, v10
8160; GFX6-NEXT:    v_alignbit_b32 v5, v5, v6, 16
8161; GFX6-NEXT:    v_alignbit_b32 v4, v4, v7, 16
8162; GFX6-NEXT:    v_mov_b32_e32 v7, v5
8163; GFX6-NEXT:    s_mov_b64 s[12:13], exec
8164; GFX6-NEXT:    v_mov_b32_e32 v6, v4
8165; GFX6-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
8166; GFX6-NEXT:    ; => This Inner Loop Header: Depth=2
8167; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
8168; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
8169; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
8170; GFX6-NEXT:    v_readfirstlane_b32 s11, v3
8171; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
8172; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
8173; GFX6-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
8174; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
8175; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
8176; GFX6-NEXT:    buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc
8177; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
8178; GFX6-NEXT:    s_cbranch_execnz .LBB21_4
8179; GFX6-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
8180; GFX6-NEXT:    s_mov_b64 exec, s[12:13]
8181; GFX6-NEXT:    s_waitcnt vmcnt(0)
8182; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
8183; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v6
8184; GFX6-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
8185; GFX6-NEXT:    s_waitcnt expcnt(0)
8186; GFX6-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
8187; GFX6-NEXT:    buffer_wbinvl1
8188; GFX6-NEXT:    s_andn2_b64 exec, exec, s[6:7]
8189; GFX6-NEXT:    s_cbranch_execnz .LBB21_3
8190; GFX6-NEXT:  ; %bb.6: ; %atomicrmw.end
8191; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
8192; GFX6-NEXT:    v_mov_b32_e32 v0, v7
8193; GFX6-NEXT:    v_mov_b32_e32 v1, v4
8194; GFX6-NEXT:    s_setpc_b64 s[30:31]
8195  %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256
8196  %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
8197  ret <2 x bfloat> %result
8198}
8199
8200; --------------------------------------------------------------------
8201; misc
8202; --------------------------------------------------------------------
8203
8204define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 {
8205; GFX12-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
8206; GFX12:       ; %bb.0:
8207; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
8208; GFX12-NEXT:    s_wait_expcnt 0x0
8209; GFX12-NEXT:    s_wait_samplecnt 0x0
8210; GFX12-NEXT:    s_wait_bvhcnt 0x0
8211; GFX12-NEXT:    s_wait_kmcnt 0x0
8212; GFX12-NEXT:    v_mov_b32_e32 v1, s16
8213; GFX12-NEXT:    global_wb scope:SCOPE_SYS
8214; GFX12-NEXT:    s_wait_storecnt 0x0
8215; GFX12-NEXT:    buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
8216; GFX12-NEXT:    s_wait_loadcnt 0x0
8217; GFX12-NEXT:    global_inv scope:SCOPE_SYS
8218; GFX12-NEXT:    s_setpc_b64 s[30:31]
8219;
8220; GFX940-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
8221; GFX940:       ; %bb.0:
8222; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8223; GFX940-NEXT:    v_mov_b32_e32 v1, v0
8224; GFX940-NEXT:    v_mov_b32_e32 v0, s16
8225; GFX940-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
8226; GFX940-NEXT:    s_add_i32 s6, s16, 0x400
8227; GFX940-NEXT:    s_mov_b64 s[4:5], 0
8228; GFX940-NEXT:    v_max_f32_e32 v2, v1, v1
8229; GFX940-NEXT:    v_mov_b32_e32 v3, s6
8230; GFX940-NEXT:  .LBB22_1: ; %atomicrmw.start
8231; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
8232; GFX940-NEXT:    s_waitcnt vmcnt(0)
8233; GFX940-NEXT:    v_mov_b32_e32 v5, v0
8234; GFX940-NEXT:    v_max_f32_e32 v0, v5, v5
8235; GFX940-NEXT:    v_min_f32_e32 v4, v0, v2
8236; GFX940-NEXT:    v_mov_b64_e32 v[0:1], v[4:5]
8237; GFX940-NEXT:    buffer_wbl2 sc0 sc1
8238; GFX940-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
8239; GFX940-NEXT:    s_waitcnt vmcnt(0)
8240; GFX940-NEXT:    buffer_inv sc0 sc1
8241; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
8242; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8243; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8244; GFX940-NEXT:    s_cbranch_execnz .LBB22_1
8245; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
8246; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
8247; GFX940-NEXT:    s_setpc_b64 s[30:31]
8248;
8249; GFX11-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
8250; GFX11:       ; %bb.0:
8251; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8252; GFX11-NEXT:    v_mov_b32_e32 v1, s16
8253; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
8254; GFX11-NEXT:    buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen offset:1024 glc
8255; GFX11-NEXT:    s_waitcnt vmcnt(0)
8256; GFX11-NEXT:    buffer_gl1_inv
8257; GFX11-NEXT:    buffer_gl0_inv
8258; GFX11-NEXT:    s_setpc_b64 s[30:31]
8259;
8260; GFX10-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
8261; GFX10:       ; %bb.0:
8262; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8263; GFX10-NEXT:    v_mov_b32_e32 v1, s20
8264; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
8265; GFX10-NEXT:    buffer_atomic_fmin v0, v1, s[16:19], 0 offen offset:1024 glc
8266; GFX10-NEXT:    s_waitcnt vmcnt(0)
8267; GFX10-NEXT:    buffer_gl1_inv
8268; GFX10-NEXT:    buffer_gl0_inv
8269; GFX10-NEXT:    s_setpc_b64 s[30:31]
8270;
8271; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
8272; GFX90A:       ; %bb.0:
8273; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8274; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
8275; GFX90A-NEXT:    v_mov_b32_e32 v0, s20
8276; GFX90A-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
8277; GFX90A-NEXT:    s_add_i32 s6, s20, 0x400
8278; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
8279; GFX90A-NEXT:    v_max_f32_e32 v2, v1, v1
8280; GFX90A-NEXT:    v_mov_b32_e32 v3, s6
8281; GFX90A-NEXT:  .LBB22_1: ; %atomicrmw.start
8282; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
8283; GFX90A-NEXT:    s_waitcnt vmcnt(0)
8284; GFX90A-NEXT:    v_mov_b32_e32 v5, v0
8285; GFX90A-NEXT:    v_max_f32_e32 v0, v5, v5
8286; GFX90A-NEXT:    v_min_f32_e32 v4, v0, v2
8287; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
8288; GFX90A-NEXT:    buffer_wbl2
8289; GFX90A-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
8290; GFX90A-NEXT:    s_waitcnt vmcnt(0)
8291; GFX90A-NEXT:    buffer_invl2
8292; GFX90A-NEXT:    buffer_wbinvl1
8293; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
8294; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8295; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8296; GFX90A-NEXT:    s_cbranch_execnz .LBB22_1
8297; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
8298; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
8299; GFX90A-NEXT:    s_setpc_b64 s[30:31]
8300;
8301; GFX908-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
8302; GFX908:       ; %bb.0:
8303; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8304; GFX908-NEXT:    v_mov_b32_e32 v1, v0
8305; GFX908-NEXT:    v_mov_b32_e32 v0, s20
8306; GFX908-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
8307; GFX908-NEXT:    s_add_i32 s6, s20, 0x400
8308; GFX908-NEXT:    s_mov_b64 s[4:5], 0
8309; GFX908-NEXT:    v_max_f32_e32 v2, v1, v1
8310; GFX908-NEXT:    v_mov_b32_e32 v3, s6
8311; GFX908-NEXT:  .LBB22_1: ; %atomicrmw.start
8312; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
8313; GFX908-NEXT:    s_waitcnt vmcnt(0)
8314; GFX908-NEXT:    v_mov_b32_e32 v5, v0
8315; GFX908-NEXT:    v_max_f32_e32 v0, v5, v5
8316; GFX908-NEXT:    v_min_f32_e32 v4, v0, v2
8317; GFX908-NEXT:    v_mov_b32_e32 v0, v4
8318; GFX908-NEXT:    v_mov_b32_e32 v1, v5
8319; GFX908-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
8320; GFX908-NEXT:    s_waitcnt vmcnt(0)
8321; GFX908-NEXT:    buffer_wbinvl1
8322; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
8323; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8324; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8325; GFX908-NEXT:    s_cbranch_execnz .LBB22_1
8326; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
8327; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
8328; GFX908-NEXT:    s_setpc_b64 s[30:31]
8329;
8330; GFX8-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
8331; GFX8:       ; %bb.0:
8332; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8333; GFX8-NEXT:    v_mov_b32_e32 v1, v0
8334; GFX8-NEXT:    v_mov_b32_e32 v0, s20
8335; GFX8-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
8336; GFX8-NEXT:    s_add_i32 s6, s20, 0x400
8337; GFX8-NEXT:    s_mov_b64 s[4:5], 0
8338; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v1
8339; GFX8-NEXT:    v_mov_b32_e32 v3, s6
8340; GFX8-NEXT:  .LBB22_1: ; %atomicrmw.start
8341; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
8342; GFX8-NEXT:    s_waitcnt vmcnt(0)
8343; GFX8-NEXT:    v_mov_b32_e32 v5, v0
8344; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v5
8345; GFX8-NEXT:    v_min_f32_e32 v4, v0, v2
8346; GFX8-NEXT:    v_mov_b32_e32 v0, v4
8347; GFX8-NEXT:    v_mov_b32_e32 v1, v5
8348; GFX8-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
8349; GFX8-NEXT:    s_waitcnt vmcnt(0)
8350; GFX8-NEXT:    buffer_wbinvl1
8351; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
8352; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8353; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8354; GFX8-NEXT:    s_cbranch_execnz .LBB22_1
8355; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
8356; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
8357; GFX8-NEXT:    s_setpc_b64 s[30:31]
8358;
8359; GFX7-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
8360; GFX7:       ; %bb.0:
8361; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8362; GFX7-NEXT:    v_mov_b32_e32 v1, s20
8363; GFX7-NEXT:    buffer_atomic_fmin v0, v1, s[16:19], 0 offen offset:1024 glc
8364; GFX7-NEXT:    s_waitcnt vmcnt(0)
8365; GFX7-NEXT:    buffer_wbinvl1
8366; GFX7-NEXT:    s_setpc_b64 s[30:31]
8367;
8368; GFX6-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
8369; GFX6:       ; %bb.0:
8370; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8371; GFX6-NEXT:    v_mov_b32_e32 v1, s20
8372; GFX6-NEXT:    buffer_atomic_fmin v0, v1, s[16:19], 0 offen offset:1024 glc
8373; GFX6-NEXT:    s_waitcnt vmcnt(0)
8374; GFX6-NEXT:    buffer_wbinvl1
8375; GFX6-NEXT:    s_waitcnt expcnt(0)
8376; GFX6-NEXT:    s_setpc_b64 s[30:31]
8377  %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
8378  %result = atomicrmw fmin ptr addrspace(7) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
8379  ret float %result
8380}
8381
; Attribute group #0: referenced by the trailing `#0` on the test functions'
; `define` lines; it carries only the `nounwind` function attribute.
8382attributes #0 = { nounwind }
8383
; Empty metadata node. It is the operand of the
; !amdgpu.no.fine.grained.memory annotation attached to the atomicrmw
; instruction above; the node itself holds no data.
8384!0 = !{}
8385